Python cheatsheets for machine learning

A list of my Python cheatsheets for machine learning and more

Native Python

from datetime import datetime
from math import log, sqrt, sin, pi, e, factorial

# useful commands
"""
python --version
python -m venv ./venv
source ./venv/bin/activate
pip install -r requirements.txt
"""

# Containers, types
li: list[int] = [1, 2]
tu: tuple[int, int] = (1, 2)
se: set[int] = {1, 2}
di: dict[str, int] = {"a": 1, "b": 2}

# multiple assignment, unpacking and unpacking operator *
start, stop, step = 1, 6, 2
zero, one, two = range(3)  # unpacking works with any iterable, not just tuples
*six_to_eight, nine = [6, 7, 8, 9]  # six_to_eight == [6, 7, 8]

# list comprehension (replaces the map/filter pattern), range
list_comp = [n for n in range(start, stop, step) if n != 3]  # [1, 5]
dict_comp = {str(i): i for i in range(stop)}  # {"0": 0, "1": 1, ..., "5": 5}

# list indexing and operations
l = [0, 1, 2, 3, 4, 5, 6]
last, before_last = l[-1], l[-2]  # 6, 5
slice_step = l[start:stop:step]  # [1, 3, 5]
slice_defaults_ok = l[::] == l and l[:] == l and l[0:len(l):1] == l
l_min, l_len, l_sorted, l_all, l_any = min(l), len(l), sorted(l), all(l), any(l)  # NB: all(l) is False, l contains 0
zipped_l_and_rev: list[tuple[int, int]] = list(zip(l, reversed(l)))  # [(0,6), (1,5)...(6,0)]
l.sort(key=lambda x: abs(x))  # in place; the key is useful to sort on a specific field
doesnt_contain_three = 3 not in l  # the "in" operator calls __contains__() internally
l.append(7)

# dict operations
keys, values, pairs = di.keys(), di.values(), di.items()
keys_arr, pairs_arr = [k for k in di], [(k, v) for k, v in di.items()]
none_val = di.get("absent")  # returns None if absent; di["absent"] would raise KeyError
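# dict merge operators (a small sketch, Python 3.9+): "|" returns a new dict, "|=" updates in place
merged_di = di | {"c": 3}  # {"a": 1, "b": 2, "c": 3}
default_val = di.setdefault("d", 4)  # inserts "d": 4 if the key is missing and returns the value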

# maths
log_exp, sqrt_sqr, sin_pi = log(e**2), sqrt(7**2), sin(pi)

# files, "with" statement (try..finally pattern)
with open("file.txt", "r") as f:  # r: read, w: write, a: append
    # calling any one of these 3 moves the file cursor to the end, so only the first call returns content here
    all_file: str = f.read()
    all_lines: list[str] = f.readlines()
    line_by_line: list[str] = [line for line in f]
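
# the "with" statement is roughly equivalent to this try/finally sketch
f = open("file.txt", "r")
try:
    content_again: str = f.read()
finally:
    f.close()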

# string
f_string_ten_is_ten = f"ten is {2*5}"  # string interpolation
multi_line_str_zen_python = """
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
"""

# ternary operator
yes = "yes" if "ten" in f_string_ten_is_ten else "no"

# exceptions, input, output, cast, isinstance
try:
    input_str: str = input("enter a number")
    input_int: int = int(input_str)
    is_int = isinstance(input_int, int)
except ValueError:
    print("error")

# enumerate: iterate with an index (lazy iterator)
for idx, elt in enumerate(l):
    print(f'elt at {idx} is {elt}')

# Assignment expression (walrus operator) :=
# useful to both check and use a variable inside a scope
positive_logs = [log_x for x in l if x > 0 and (log_x := log(x)) > 0]  # skip 0 to avoid a math domain error
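# the walrus also avoids computing a value twice in a condition (a small sketch)
if (nb_elements := len(l)) > 5:
    print(f"l is long: {nb_elements} elements")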

# Generators and generator expressions
gen_expr = (factorial(i) for i in range(1, 10))
def gen_factorials(n: int):
    curr = 1
    for i in range(1,n):
        curr *= i
        yield i, curr
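
# generators are lazy: values are produced one by one when consumed (a small sketch)
first_factorial = next(gen_expr)  # 1! == 1
first_five_factorials = list(gen_factorials(6))  # [(1, 1), (2, 2), (3, 6), (4, 24), (5, 120)]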


# Class, Interface and inheritance
class MyInterface:
    """
        class docstring
    """

    def do(self, i: int):
        """
        function docstring
        :param i:
        :return:
        """
        raise NotImplementedError


class MyImpl(MyInterface):
    def __init__(self):
        super().__init__()

    def do(self, i: int):
        pass


# functions
# static function, private function
class MyClass:
    @staticmethod
    def my_static_method():
        pass

    def __private_func(self):
        pass
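
# name mangling sketch: "__private_func" is only renamed, not truly hidden
MyClass.my_static_method()  # static methods can be called on the class itself
MyClass()._MyClass__private_func()  # the mangled name is still reachable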

# multiple return values (tuple), closure (inner function capturing the enclosing scope)
def fct_return_tuple(start_value: int = 0) -> tuple[int, int]:
    def helper():
        return start_value, start_value + 1  # helper closes over start_value
    return helper()

a, b = fct_return_tuple()  # 0, 1

# variable positional arguments (*varargs)
# keyword arguments (**kwargs)
def my_fct_all_args(known_positional_arg: str, *varargs, known_named_arg: str, **kwargs):
    print(known_positional_arg)
    print(varargs)
    print(known_named_arg)
    print(kwargs)


# Both calls are equivalent and print:
#   known_positional_arg
#   ('vararg1', 'vararg2')
#   known_named_arg
#   {'kwargs1': 'kwargs1', 'kwargs2': 'kwargs2'}
my_fct_all_args(
    "known_positional_arg",
    "vararg1", "vararg2",
    known_named_arg="known_named_arg",
    kwargs1="kwargs1", kwargs2="kwargs2"
)
my_fct_all_args(
    "known_positional_arg",
    *["vararg1", "vararg2"],
    **{"kwargs1": "kwargs1", "kwargs2": "kwargs2"}
)

# tests, mocks
from unittest import TestCase, main
from unittest.mock import Mock
class MyTestedClassTestCase(TestCase):

    def setUp(self):
        # prepare tests
        pass

    def tearDown(self):
        # clean
        pass

    def test_my_tested_func(self):
        now_func = Mock(spec=datetime.utcnow)
        now_func.return_value = datetime(2019, 6, 5, 15, 45)
        self.assertNotEqual(now_func(), datetime(2019,1,1))
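
    def test_expected_exception(self):
        # assertRaises as a context manager (an illustrative sketch)
        with self.assertRaises(ValueError):
            int("not a number")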

# to execute current test file
if __name__ == '__main__':
    main()

NumPy

import numpy as np

# --- Initialization
from_python_array = np.array([[1, 2, 3], [4, 5, 6]])  # [[1 2 3], [4 5 6]]
zeros = np.zeros((2, 3), dtype=float)  # [[0. 0. 0.], [0. 0. 0.]], "float" is the default
ones = np.ones((2, 3))  # [[1. 1. 1.], [1. 1. 1.]]
memory_not_initialized = np.empty((2, 3))
# the '_like' variants (zeros_like, ones_like, empty_like...) copy the shape of a model array
same_shape_as_model = np.zeros_like(from_python_array)  # [[0 0 0], [0 0 0]] (int dtype inherited from the model)
step_by_step = np.arange(2, 9, 2)  # [2, 4, 6, 8], (start, stop, step) like python "range"
linear_space = np.linspace(0, 10, 5)  # (start, stop, nb_points) [ 0., 2.5, 5., 7.5, 10.]
identity = np.eye(3)  # identity matrix of shape (3, 3)
lower_triangular = np.tri(3)  # triangular matrix of shape (3, 3)
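# random initialization (a sketch using the modern Generator API)
rng = np.random.default_rng(42)  # seeded for reproducibility
uniform = rng.random((2, 3))  # floats in [0, 1)
integers = rng.integers(0, 10, size=(2, 3))  # ints in [0, 10)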

# --- sort
arr = np.array([[1.0, 3.0], [20.0, 10.0]])
arr.sort()  # inplace
sorted_copy = np.sort(arr, axis=-1)  # sort along specified axis, here last (-1) axis

# --- see array characteristics
nb_dimensions = arr.ndim
total_nb_elts = arr.size
tuple_of_size_by_dim = arr.shape

# --- reshape
# -1 lets numpy infer the size on this dim (only one -1 allowed); here it gives a (4, 1, 1) tensor
reshaped = np.reshape(arr, (-1, 1, 1))

# --- squeeze and expand (wrap, unwrap)
# add dim on specified index: [[1. 3.], [10. 20.]] -> [ [[1. 3.]], [[10. 20.]] ]
expanded = np.expand_dims(arr, axis=1)
# remove specified dims (all by default) where size is 1
# here we remove dim one added by expand_dims -> [[1. 3.], [10. 20.]]
squeezed = np.squeeze(expanded, axis=1)
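# np.newaxis indexing is a common shorthand for expand_dims (a small sketch)
expanded_with_newaxis = arr[:, np.newaxis, :]  # same shape as np.expand_dims(arr, axis=1)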

# --- indexing and slicing
# NB: basic slicing returns a view (no copy in memory); fancy and boolean indexing return new arrays
arr = np.array([[11., 12., 13.], [21., 22., 23.], [31., 32., 33.]])

# basic slicing, like python
by_index = arr[1, 2]  # 23
by_slice = arr[0, 1:3:1]  # [12, 13]

# fancy indexing, specific to numpy
by_list_of_indexes = arr[:, [0, 2]]  # [[11. 13.], [21. 23.], [31. 33.]]

# boolean indexing, specific to numpy (use broadcasting)
bool_arr = arr > 21  # [[False False False], [ False  True  True], [ True  True  True]]
flat_when_filtered_on_condition_of_same_shape = arr[arr > 21]  # [22. 23. 31. 32. 33.]
filtered_by_dim = arr[arr.sum(-1) > 40, :]  # [[21. 22. 23.], [31. 32. 33.]]
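# np.where as a vectorized ternary (a small sketch): keep values > 21, replace the rest by 0
thresholded = np.where(arr > 21, arr, 0.0)  # [[0. 0. 0.], [0. 22. 23.], [31. 32. 33.]]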

# assigning with indexing, specific to numpy
modified = np.copy(arr)
modified[:, :] = 1.0  # [[1. 1. 1.], [1. 1. 1.], [1. 1. 1.]]

# --- concat
# concatenate array args must have the same number of dims
row_to_concat = np.expand_dims([41, 42, 43], 0)  # [[41 42 43]]
col_to_concat = np.expand_dims([14, 24, 34], -1)  # [[14] [24] [34]]
with_fourth_row = np.concatenate((arr, row_to_concat), axis=0)
with_fourth_col = np.concatenate((arr, col_to_concat), axis=-1)

# --- operations
# term by term
term_by_term = (arr + arr - arr) * arr / arr
# usual aggregations
arr_min, row_sums = arr.min(), arr.sum(axis=-1)  # 11.0, [36. 66. 96.] (avoid shadowing built-in min/sum)
# matrix operations
# more complex linear algebra operations are better handled with SciPy
transposed = arr.T
dot_product = arr @ arr
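# a couple of np.linalg basics (a sketch); arr is singular here so its determinant is ~0
determinant = np.linalg.det(arr)
column_norms = np.linalg.norm(arr, axis=0)  # Euclidean norm of each column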

# --- broadcasting
# dimensions are compared from right (innermost) to left (outermost)
# two dims are compatible if they are equal or if one of them is 1
broadcast_single_number = arr * 2
broadcast_vec = arr * [1, 10, 100]  # [[11. 120. 1300.], [21. 220. 2300.], [31. 320. 3300.]]
# broadcast (2, 3, 3) with (3, 3), term by term op applied to all (3, 3) inner matrices
wrapped_arr = np.expand_dims(arr, axis=0)
concat_wrapped_arr = np.concatenate((wrapped_arr, wrapped_arr), axis=0)
broadcast_matrix = concat_wrapped_arr * [[1, 10, 100], [2, 20, 200], [3, 30, 300]]
# [[[  11.  120. 1300.]
#   [  42.  440. 4600.]
#   [  93.  960. 9900.]]
#
#  [[  11.  120. 1300.]
#   [  42.  440. 4600.]
#   [  93.  960. 9900.]]]

PyTorch

This is a minimal working example of a PyTorch project

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch import nn
from torch.optim import Optimizer
from torch.optim.lr_scheduler import StepLR

# init seed for replicable results
torch.manual_seed(42)

# ------------------- 1. build data ----------------------
# dummy 'Xor' dataset: the label is 1 when exactly one of (x_1 < 0.5) and (x_2 < 0.5) holds,
# i.e. when x_1 and x_2 fall on different sides of 0.5
dataset_size: int = 100000
features_tensor: torch.Tensor = torch.rand((dataset_size, 2))  # 2 features
# build boolean tensor of labels then convert it to numerical values
boolean_labels_tensor: torch.Tensor = torch.logical_xor(features_tensor[:, 0] < 0.5, features_tensor[:, 1] < 0.5)
labels_tensor: torch.Tensor = torch.where(boolean_labels_tensor, 1., 0.)

# Dataset abstracts access to the data samples and is often subclassed for custom data
dataset: TensorDataset = TensorDataset(features_tensor, labels_tensor)
# split the dataset into training, validation and test sets
train_set, val_set, test_set = torch.utils.data.random_split(
    dataset, [int(dataset_size * 0.8), int(dataset_size * 0.15), int(dataset_size * 0.05)])

# A DataLoader wraps a dataset (to manage shuffling, batching, etc. generically)
train_dataloader = DataLoader(train_set, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_set, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=64, shuffle=True)
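# sanity check (a small sketch): peek at one batch to confirm tensor shapes
sample_features, sample_labels = next(iter(train_dataloader))
# sample_features: (64, 2), sample_labels: (64,)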


# ------------------- 2. build model -------------------
class DenseModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Sequential(
            nn.Linear(2, 8), nn.ReLU(),
            nn.Linear(8, 8), nn.ReLU(),
            nn.Linear(8, 8), nn.ReLU(),
            nn.Linear(8, 1)
        )
        # No sigmoid at the end, it's managed by the loss function

    def forward(self, x):
        output = self.dense(x)  # (batch_size, 1) tensor
        one_d_output = output.squeeze()  # remove useless (size=1) dimension
        return one_d_output


dense_model = DenseModel()
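# quick sanity checks on the untrained model (a small sketch)
nb_parameters = sum(p.numel() for p in dense_model.parameters())
dummy_logits = dense_model(torch.rand((4, 2)))  # 1D tensor of 4 raw logits (no sigmoid applied)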

# ------------------- 3. loss function -------------------
# Combine Sigmoid layer and logit loss (for numerical stability)
logit_loss_fct = nn.BCEWithLogitsLoss()


# ------------------- 4. evaluation function -------------------
@torch.no_grad()  # no gradients are needed during evaluation
def evaluate(model, dataloader, loss_fct, epoch):
    total_loss = 0
    nb_items = 0
    nb_corrects = 0
    for batch_idx, batch in enumerate(dataloader):
        batch_input_tensor = batch[0]
        batch_label_tensor = batch[1]
        predictions = model(batch_input_tensor)  # 1D tensor, one logit per sample in the batch
        loss_result = loss_fct(predictions, batch_label_tensor)
        total_loss += loss_result.item()  # .item() extracts a python float from the 0-dim loss tensor

        # use broadcasting to compute the proportion of correct predictions
        is_result_positive = predictions > 0.0  # if y > 0 then sigmoid(y) > 0.5, so the model predicts the "1.0" class
        is_expected_positive = batch_label_tensor == 1.0
        is_output_correct = is_result_positive == is_expected_positive
        nb_corrects += torch.sum(is_output_correct).item()
        nb_items += batch_input_tensor.size(dim=0)

    # total_loss sums per-batch mean losses, so divide by the number of batches
    print(f'epoch {epoch} mean loss: {total_loss / len(dataloader)}, accuracy: {100 * nb_corrects / nb_items}%')


# ------------------- 5. optimization loop -------------------
learning_rate = 0.1
optimizer: Optimizer = torch.optim.SGD(dense_model.parameters(), learning_rate)  # optimization algorithm (adam, SGD..)
scheduler: StepLR = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)  # reduce learning rate on optimizer

nb_epochs = 60
for epoch_index in range(nb_epochs):

    # Turn on train mode (changes the behavior of layers like Dropout and BatchNorm)
    dense_model.train()
    for batch_idx, batch in enumerate(train_dataloader):

        # 1. reset gradients accumulated on the model parameters
        optimizer.zero_grad()
        # 2. forward pass
        output = dense_model(batch[0])
        # 3. compute loss
        loss_result = logit_loss_fct(output, batch[1])
        # 4. backward pass: compute gradients
        loss_result.backward()
        # 5. gradient clipping improves training stability, cf. https://arxiv.org/pdf/1905.11881.pdf
        torch.nn.utils.clip_grad_norm_(dense_model.parameters(), 0.5)
        # 6. update model parameters using the gradients computed during the backward pass
        optimizer.step()

    # compute loss and accuracy on validation set
    dense_model.eval()  # Turn model back to eval mode
    evaluate(dense_model, val_dataloader, logit_loss_fct, epoch_index)

    scheduler.step()  # decrease learning rate after each epoch
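
# persist and reload the trained weights (a sketch; the file name is illustrative)
torch.save(dense_model.state_dict(), "dense_model.pt")
reloaded_model = DenseModel()
reloaded_model.load_state_dict(torch.load("dense_model.pt"))
reloaded_model.eval()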

Pandas

import pandas as pd
from datetime import datetime

# ------ build dataframe ------
df_by_cols = pd.DataFrame(
    {"c1": [4, 5, 6], "c2": [7, 8, 9]},
    index=["r1", "r2", "r3"])  # indexes are just row labels

df_by_rows = pd.DataFrame(
    [[4, 7, 10], [5, 8, 11]],
    columns=['c1', 'c2', 'c3'])

# df_from_csv = pd.read_csv('file.csv')

# ------ melt (unpivot) / pivot ------
df_funnel = pd.DataFrame(
    [["u1", datetime(2222, 1, 1, 3,17,20), datetime(2222, 1, 1, 3,18,5), datetime(2222, 1, 1, 3, 21, 2)],
    ["u2", datetime(2222, 1, 1, 4,18,20), datetime(2222, 1, 1, 4,18,40), datetime(2222, 1, 1, 4, 20, 2)]],
    columns=['name', 'landing', 'signup', 'payment']
)

# Returns:
#   name event_type          event_date
# 0   u1    landing 2222-01-01 03:17:20
# 1   u2    landing 2222-01-01 04:18:20
# 2   u1     signup 2222-01-01 03:18:05
# ...
df_funnel_by_event = pd.melt(
    df_funnel,
    id_vars='name',  # id columns
    var_name='event_type',
    value_name='event_date'
)

df_funnel_repivoted = pd.pivot(
    df_funnel_by_event,
    index='name',
    columns=['event_type'],  # becomes column labels
    values=['event_date']  # becomes new columns content
)

# ------ basic info ------
df_head = df_funnel.head()  # get the first rows
df_sample = df_funnel.sample()  # select a few rows randomly
df_stats_by_col = df_funnel.describe()  # std, percentiles... by column
df_funnel.info()  # prints directly: count, Dtype...

# ------ types ------
df_types = df_funnel.dtypes  # type of each column
with_landing_col_as_number = df_funnel.landing.astype('int64')  # cast column (datetime -> nanoseconds since epoch)

# ------ filter ------
filter_on_labels = df_funnel.loc[[0, 1], ['name', 'landing']]  # label-based selection (.loc)
filter_on_positions = df_funnel.iloc[0:2, 0:2]  # position-based selection (.iloc): first 2 rows and cols

# ------ query ------
# query is the preferred way (more readable and faster on big datasets)
with_query = df_funnel.query('name == "u2" and landing > "2020-01-01"')
# old way: build a 1D boolean array (one value per row) using broadcasting,
# then index with this boolean array
with_operator = df_funnel[df_funnel.name == 'u2']

# ------ join (merge) ------
left = pd.DataFrame([['k1', 'vl1'], ['k2', 'vl2']], columns=['k', 'vl'])
right = pd.DataFrame([['k1', 'vr1'], ['k3', 'vr3']], columns=['k', 'vr'])
# keys kept: 'left': k1, k2. 'right': k1, k3. 'inner': k1. 'outer': k1, k2, k3
pd.merge(left, right, how='left', on='k')  # rows: ['k1', 'vl1', 'vr1'] and ['k2', 'vl2', NaN]

# ------ group-by ------
events_by_user = df_funnel_by_event.groupby('name')
first_event_by_user = events_by_user.agg({'event_date': 'min', 'event_type': 'first'})
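# a few more sketches on the same data
nb_events_by_user = events_by_user.size()  # number of events per user
# derived column on the original wide dataframe: time from landing to payment (a Timedelta column)
df_funnel['time_to_payment'] = df_funnel['payment'] - df_funnel['landing']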