Python Snippet
1. Setup environment
1.1. Using conda
1.1.1. Create conda environment
- Create env by common command line
conda create -n python38 python=3.8.5 pip=20.2.4 ipykernel notebook
conda activate python38
- Create env using environment.yaml file
name: env_ascore
channels:
- conda-forge
- defaults
dependencies:
- python=3.8
- pandas==1.4.4
- joblib==1.1.0
- statsmodels==0.13.2
- ipykernel
- zipp
- pip
- pip:
- optbinning==0.17.3
- ortools==9.4.1874
# conda env create -f environment.yaml
# conda env remove -n env_ascore
# set https_proxy=10.1.33.23:8080
# set http_proxy=10.1.33.23:8080
# conda install -n env_ascore ipykernel --update-deps --force-reinstall
- Create env using requirements.txt file
conda list --export > requirements.txt
conda install --file requirements.txt
1.1.2. Conda common commands
- Conda environment list
conda info --env
- Remove conda environment
conda deactivate
conda env remove -n python38
- Activate conda environment
conda activate python38
- Clean unused library
conda clean --all
pip cache remove *
1.2. Create environment for jupyter notebook
1.2.1. create
conda activate python38
ipython kernel install --user --name=python38
1.2.2. Remove jupyter notebook environment (require run as administrator)
jupyter kernelspec list
jupyter kernelspec uninstall python38
2. Install packages
2.1. Install OFFLINE packages using requirements.txt
Step 1: Input to requirements.txt file in the current directory. Eg. content: "jupyter-contrib-nbextensions==0.5.1"
Step 2: Create folder wheel (Eg. D:\wheel)
Step 3: Run the following command to download dependency packages to the folder wheel
pip download -r requirements.txt -d wheel
- Step 4: Run following command to install
pip install -r requirements.txt --find-links=D:\wheel --no-index
2.2. Install OFFLINE Linux package
Activate the same Python version (i.e. 3.7.0) and type the following command
pip download --platform manylinux1_x86_64 --only-binary=:all: --no-binary=:none: pandas
2.3. Export requirements
pip list --format=freeze > requirements.txt
3. Install Extension for jupyter notebook
- nbextension
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter contrib nbextension install --user
jupyter nbextensions_configurator enable --user
4. Other utility commands
check dependencies
python -m pip check
pip freeze > requirements.txt
install pycaret
pip install pycaret --use-feature=2020-resolver
Themes
- jupyter notebook
jt -t onedork -fs 13 -altp -tfs 14 -nfs 14 -cellw 88% -T
5. Pandas
- case when
df['new_column'] = np.where(df['col2'] < 9, 'value1',
                   np.where(df['col2'] < 12, 'value2',
                   np.where(df['col2'] < 15, 'value3', 'value4')))
- clean names
X.columns = X.columns.str.translate("".maketrans({"[": "{", "]": "}", "<": "^"}))
- rename
df.columns = df.columns.str.lower()
- binning
df['Cat Age'] = pd.cut(x=df['Age'], bins=[0, 25, 30, 35, 40, 45, 50, 75])
- groupby stratify
df.groupby('target', group_keys=False).apply(lambda x: x.sample(frac=0.8))
- groupby partition (transform)
df['new'] = df.groupby('group_var')['value_var'].transform('mean')
df['new'] = df.groupby('group_var')['value_var'].transform(lambda x: some_function(x))
- read data
appended_data = []
for infile in glob.glob("*.xlsx"):
    data = pandas.read_excel(infile)
    # store DataFrame in list
    appended_data.append(data)
# Use pd.concat to merge a list of DataFrame into a single big DataFrame.
appended_data = pd.concat(appended_data)
# alternatively, set an index on each DataFrame first:
appended_data = [df.set_index('ID') for df in appended_data]
appended_data = pd.concat(appended_data)
- filter
df[~df['name'].isin(list1)]
df[df['name'].isna()]
df[df['name'].notna()]
# filter for rows where team name is not in one of several columns
df[~df[['star_team', 'backup_team']].isin(values_list).any(axis=1)]
- convert
# pd.to_numeric
# pd.factorize
# df['STRING'].astype(str)
# pd.to_datetime(df['DATE'], format='%d%m%Y')
- Excel date
import functools as ft
fn_convert_date = ft.partial(xlrd.xldate_as_datetime, datemode=0)
df['DATE'].apply(fn_convert_date).dt.year
- aggregate
df = df.groupby('ID').agg('first')

def fn_describe(df):
    tbl = df.describe(include='all')
    tbl.loc['dtype'] = df.dtypes
    tbl.loc['total'] = len(df)
    tbl.loc['%null'] = df.isnull().mean() * 100
    tbl.loc['nbr.zero'] = (df == 0).sum(axis=0)
    tbl.loc['%zero'] = (df == 0).sum(axis=0) / len(df)
    tbl = tbl.T
    tbl['nbr.null'] = tbl['total'] - tbl['count']
    tbl['nbr.top'] = tbl['freq']
    tbl['%top'] = tbl['freq'] / tbl['total'] * 100
    tbl = tbl[['dtype', 'total', 'nbr.null', '%null', 'nbr.zero', '%zero',
               'mean', 'std', 'min', 'max', 'unique', 'top', 'nbr.top', '%top']]
    return tbl
- date
# difference of month
df['nb_months'] = ((df.dates1 - df.dates2) / np.timedelta64(1, 'M'))
df['nb_months'] = df['nb_months'].astype(int)
# difference of day
df['Number_of_days'] = ((df.dates1 - df.dates2) / np.timedelta64(1, 'D'))
df['Number_of_days'] = df['Number_of_days'].astype(int)
- mapping
mapping = dict(zip(df[col], df['temp2']))
temp_df[col] = temp_df[col].map(mapping)
- Apply to all values except missing
s.apply(lambda a: a+2 if pd.notnull(a) else a)
- fillna
dat['MOBILE'] = dat['MOBILE'].fillna(dat['MOBILE2'])
- to_excel
pred_train.reset_index(inplace=True, drop=True)
pred_test.reset_index(inplace=True, drop=True)
with pd.ExcelWriter(product['t_pred']) as writer:
    pred_train.to_excel(writer, sheet_name='pred_train')
    pred_test.to_excel(writer, sheet_name='pred_test')
6. Join
reduce merge
pandas.concat
dfs = [df0, df1, df2, ..., dfN]
# requires set_index, not using a key
dfs = [df.set_index('name') for df in dfs]
# cannot run if the index is not unique
dfs = pd.concat(dfs, join='outer', axis=1)
functools.reduce
# still runs even if the index is not unique
import functools as ft
dfs = ft.reduce(lambda left, right: pd.merge(left, right, on='name', how='outer'), dfs)
join
# cannot run if the index is not unique
dfs = [df.set_index('name') for df in dfs]
dfs[0].join(dfs[1:], how='outer')
7. Files and folder
list_files = glob.glob(".data/*.xlsx")
8. List
Delete multiple elements from list python
unwanted = [1, 3]
for ele in sorted(unwanted, reverse=True):
    del list1[ele]
9. Regex
import re
# Validate number
number_pattern = "^\\d+$"
re.match(number_pattern, '42')          # Returns Match object
re.match(number_pattern, 'notanumber')  # Returns None

# Extract number from a string
number_extract_pattern = "\\d+"
re.findall(number_extract_pattern, 'Your message was viewed 203 times.')  # returns ['203']
10. Database
Connect to database
import pandas as pd
import pyodbc
import sqlalchemy as sa
import urllib
from sqlalchemy import create_engine, event
from sqlalchemy.engine.url import URL
server = 'VM-DC-JUMPSRV77\\IFRS9'
database = 'DATA'
username = 'username'
password = '@@@@@@'

params = urllib.parse.quote_plus("DRIVER={SQL Server};"
                                 "SERVER="+server+";"
                                 "DATABASE="+database+";"
                                 "UID="+username+";"
                                 "PWD="+password+";")
engine = sa.create_engine("mssql+pyodbc:///?odbc_connect={}".format(params))
Read/push table to database
# read data
df_test = pd.read_sql('SELECT top 5 * FROM b_tmp_cl020', engine)
# push data
df_test.to_sql('binh_test', engine, if_exists='append')
sqlite3
con = sqlite3.connect("data/mydata.sqlite")

cur = con.cursor()
# The result of a "cursor.execute" can be iterated over by row
for row in cur.execute('SELECT * FROM tbl;'):
print(row)
# Be sure to close the connection
con.close()
11. Writing shorthand statements
- lambda
# lambda statement
foo = lambda a: a+3
foo(10)

# Self called Lambda
(lambda a: a+3)(8)
- List Comprehension
[i for i in range(10) if i % 2 == 0]
- Dict Comprehension
mylist = ['MANGO', 'APPLE', 'ORANGE']
d = {i.lower(): 2 for i in mylist}
- String Concatenation with delimiter
"_".join(["how", "are", "you"])
12. API
JSON file
# Open JSON file
try:
    with open('data/sample.json', "r") as f:
        data = json.load(f)
except IOError:
    print('cannot open file')
# save
with open('data/sample.json', 'w') as f:
json.dump(data, f)
99. Equivalent R
functools
~purrr