Quick references for Machine Learning
Supervised Learning: learn from labeled data to predict outcomes.
Use when: you have input-output pairs.
Examples: spam detection, house price prediction, churn prediction.
Algorithms: Linear/Logistic Regression, Decision Trees, Random Forest, SVM, Neural Networks.
Unsupervised Learning: discover patterns in unlabeled data.
Use when: exploring data structure without labels.
Examples: customer segmentation, anomaly detection, topic modeling.
Algorithms: k-Means, Hierarchical Clustering, PCA, DBSCAN.
Reinforcement Learning: learn through trial and error with rewards.
Use when: sequential decision-making is needed.
Examples: game playing, robotics, recommendation systems.
Algorithms: Q-Learning, Deep Q-Networks (DQN), Policy Gradient methods.
Train/Validation/Test Split: prevent overfitting and get honest performance estimates.
from sklearn.model_selection import train_test_split
# Split into train and temp
X_train, X_temp, y_train, y_temp = train_test_split(
X, y, test_size=0.3, random_state=42
)
# Split temp into validation and test
X_val, X_test, y_val, y_test = train_test_split(
X_temp, y_temp, test_size=0.5, random_state=42
)
Rule: Never train on test data!
Total Error = Bias² + Variance + Irreducible Error
Balance both to minimize total error. Use cross-validation to find the sweet spot.
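One practical way to locate that sweet spot is to sweep a single complexity knob and compare training scores against cross-validated scores. A minimal sketch using scikit-learn's validation_curve, assuming a hypothetical feature matrix X and labels y:

from sklearn.model_selection import validation_curve
from sklearn.ensemble import RandomForestClassifier

# Sweep tree depth: shallow trees -> high bias, very deep trees -> high variance
param_range = [2, 4, 8, 16, 32]
train_scores, val_scores = validation_curve(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X, y,                       # hypothetical data
    param_name='max_depth',
    param_range=param_range,
    cv=5,
)
for depth, tr, va in zip(param_range, train_scores.mean(axis=1), val_scores.mean(axis=1)):
    print(f"max_depth={depth:2d}  train={tr:.3f}  val={va:.3f}")
# Pick the depth where the validation score peaks; a widening train/val gap signals variance.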
Goal: generalize well to new data.
Underfitting: model too simple; high bias; poor performance on both training and validation data.
Overfitting: model too complex; high variance; strong on training data but poor on validation data.
For Underfitting: more features, a more complex model, less regularization.
For Overfitting: more data, dropout, early stopping, regularization (L1/L2).
Monitor both train and validation metrics.
Linear Regression: predicts continuous values: y = mx + b
Logistic Regression: binary classification using the sigmoid function
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
Decision Tree: a tree structure of if-else decisions
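A minimal decision tree sketch with scikit-learn, assuming the same hypothetical X_train/y_train/X_test used elsewhere; max_depth is capped to limit overfitting:

from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=5, random_state=42)
tree.fit(X_train, y_train)
predictions = tree.predict(X_test)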
Random Forest: an ensemble of many decision trees (bagging)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=10)
rf.fit(X_train, y_train)
# Feature importance
importances = rf.feature_importances_
A great baseline model for tabular data.
SVM (Support Vector Machine): find the hyperplane that maximizes the margin between classes
Kernel trick: transform data to higher dimensions without explicitly computing the coordinates
Pros: Effective in high dimensions, memory efficient
Cons: Slow on large datasets, requires feature scaling
from sklearn.svm import SVC
svm = SVC(kernel='rbf', C=1.0, gamma='scale')
svm.fit(X_train, y_train)
Best for: text classification, image recognition.
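Since SVMs are sensitive to feature magnitude (see the cons above), it is common to combine scaling and the classifier in a single pipeline. A minimal sketch, assuming hypothetical X_train/y_train/X_test:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Scaling is fit on the training data and reused automatically at predict time
svm_pipe = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0, gamma='scale'))
svm_pipe.fit(X_train, y_train)
predictions = svm_pipe.predict(X_test)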
Neural Networks: best for complex patterns, images, text, audio, and large datasets
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
model = Sequential([
Dense(64, activation='relu', input_shape=(10,)),
Dense(32, activation='relu'),
Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X_train, y_train, epochs=50, batch_size=32)
Deep learning powerhouse
k-NN (k-Nearest Neighbors): classify based on the k closest training examples
k-Means: partition data into k clusters (unsupervised)
Naive Bayes: probabilistic classifier using Bayes' theorem
# k-NN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
# k-Means
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
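A minimal k-Means usage sketch, assuming a hypothetical unlabeled feature matrix X; n_init is set explicitly to avoid version-dependent defaults:

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = kmeans.fit_predict(X)        # Cluster assignment for each row
print(kmeans.cluster_centers_)        # One centroid per cluster
print(kmeans.inertia_)                # Within-cluster sum of squares (lower = tighter clusters)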
TensorFlow (with Keras): Google's production-ready deep learning framework
import tensorflow as tf
from tensorflow import keras
# Sequential API (simple)
model = keras.Sequential([
keras.layers.Dense(64, activation='relu', input_shape=(10,)),
keras.layers.Dropout(0.2),
keras.layers.Dense(32, activation='relu'),
keras.layers.Dense(1, activation='sigmoid')
])
model.compile(
optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy']
)
history = model.fit(
X_train, y_train,
epochs=50,
batch_size=32,
validation_split=0.2,
callbacks=[keras.callbacks.EarlyStopping(patience=5)]
)
# Functional API (complex architectures)
inputs = keras.Input(shape=(10,))
x = keras.layers.Dense(64, activation='relu')(inputs)
x = keras.layers.Dense(32, activation='relu')(x)
outputs = keras.layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs=inputs, outputs=outputs)
PyTorch: Facebook's research-focused deep learning framework
import torch
import torch.nn as nn
import torch.optim as optim
# Define model
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(10, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x
model = NeuralNet()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training loop
for epoch in range(50):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
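Note: the loop above assumes X_train and y_train are already tensors. If they start out as NumPy arrays (as in the scikit-learn examples), a minimal conversion before training could look like this:

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)  # BCELoss expects shape (N, 1)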
| Aspect | TensorFlow | PyTorch |
|---|---|---|
| Ease of Use | Keras makes it easy | More Pythonic, intuitive |
| Learning Curve | Moderate | Easier for Python devs |
| Deployment | Excellent (TF Serving, Lite) | Good (TorchServe) |
| Research | Good | Dominant in academia |
| Debugging | Harder (static graphs) | Easier (dynamic graphs) |
| Community | Large, industry-focused | Large, research-focused |
Accuracy: Correct predictions / Total predictions
Precision: True Positives / (True Positives + False Positives)
Recall: True Positives / (True Positives + False Negatives)
F1 Score: harmonic mean of precision and recall: 2 × (Precision × Recall) / (Precision + Recall)
ROC-AUC: Area Under the Receiver Operating Characteristic curve
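A quick worked example with hypothetical counts (TP=80, FP=20, FN=10):

tp, fp, fn = 80, 20, 10
precision = tp / (tp + fp)                           # 0.800
recall = tp / (tp + fn)                              # 0.889
f1 = 2 * precision * recall / (precision + recall)   # 0.842
print(precision, recall, f1)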
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_test, y_pred))
auc = roc_auc_score(y_test, y_pred_proba)
Choose the metric based on business impact.
MSE (Mean Squared Error): average of squared differences: Σ(actual - predicted)² / n
RMSE (Root Mean Squared Error): square root of MSE: √MSE
MAE (Mean Absolute Error): average of absolute differences: Σ|actual - predicted| / n
R² (Coefficient of Determination): proportion of variance explained: 1 - (SS_res / SS_tot)
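A quick worked example with a few hypothetical actual/predicted values:

import numpy as np
actual = np.array([3.0, 5.0, 7.0])
predicted = np.array([2.5, 5.0, 8.0])
mse = np.mean((actual - predicted) ** 2)        # 0.417
rmse = np.sqrt(mse)                             # 0.645
mae = np.mean(np.abs(actual - predicted))       # 0.5
r2 = 1 - np.sum((actual - predicted) ** 2) / np.sum((actual - actual.mean()) ** 2)  # 0.844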
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
RMSE for error magnitude, R² for overall model quality.
Get more reliable performance estimates using all data for both training and validation
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Simple k-fold
scores = cross_val_score(model, X, y, cv=5)
print(f"Accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")
# Stratified k-fold
skf = StratifiedKFold(n_splits=5, shuffle=True)
scores = cross_val_score(model, X, y, cv=skf)
Always use CV for model selection
| | Predicted Pos | Predicted Neg |
|---|---|---|
| Actual Pos | TP | FN |
| Actual Neg | FP | TN |
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
Always visualize your confusion matrix.
Algorithms that use distances (k-NN, SVM, neural networks) are sensitive to feature magnitude.
Min-Max Normalization: scale to [0, 1]: (x - min) / (max - min)
Standardization (Z-score): scale to mean = 0, std = 1: (x - mean) / std
Robust Scaling: use median and IQR: (x - median) / IQR
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Use the same training parameters!
# Normalization
normalizer = MinMaxScaler()
X_train_norm = normalizer.fit_transform(X_train)
⚠️ Fit only on training data!
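For features with heavy outliers, the median/IQR variant above corresponds to scikit-learn's RobustScaler; a minimal sketch with the same hypothetical X_train/X_test:

from sklearn.preprocessing import RobustScaler
robust = RobustScaler()
X_train_robust = robust.fit_transform(X_train)
X_test_robust = robust.transform(X_test)  # Reuses the medians/IQRs learned on the training data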
df.isnull().sum()                   # Count missing values per column
df.isnull().sum() / len(df) * 100   # Percentage missing per column
Mean/Median: fill numeric columns with the column mean or median
Mode: fill categorical columns with the most frequent value
Forward/Backward Fill: propagate neighboring values, useful for time series
KNN Imputation: estimate missing values from the k most similar rows
from sklearn.impute import SimpleImputer, KNNImputer
# Mean imputation
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
# KNN imputation
knn_imputer = KNNImputer(n_neighbors=5)
X_imputed = knn_imputer.fit_transform(X)
Understand WHY data is missing.
Label Encoding: convert categories to integers: Red→0, Blue→1, Green→2
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['color_encoded'] = le.fit_transform(df['color'])
One-Hot Encoding: create a binary column for each category
import pandas as pd
# Pandas
df_encoded = pd.get_dummies(df, columns=['color'], drop_first=True)
# Scikit-learn
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first', sparse_output=False)  # 'sparse' was renamed to 'sparse_output' in newer scikit-learn
encoded = ohe.fit_transform(df[['color']])
Target Encoding: replace each category with the mean of the target variable
Frequency Encoding: replace each category with its frequency/count
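A minimal pandas sketch for both, assuming a hypothetical df with a categorical 'color' column and a numeric 'target' column (compute the mappings on training data only to avoid leakage):

# Target encoding: map each category to the mean target value
target_means = df.groupby('color')['target'].mean()
df['color_target_enc'] = df['color'].map(target_means)

# Frequency encoding: map each category to its relative frequency
freqs = df['color'].value_counts(normalize=True)
df['color_freq_enc'] = df['color'].map(freqs)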
from sklearn.feature_selection import SelectKBest, f_classif
# Select top k features
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)
# From tree model
feature_imp = pd.DataFrame({
'feature': X.columns,
'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
Domain knowledge > automated methods
Grid Search: try every combination of the specified parameters
from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [10, 20, 30],
'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(
RandomForestClassifier(),
param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
Randomized Search: sample random combinations of parameters
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
'n_estimators': [100, 200, 300, 400],
'max_depth': [10, 20, 30, 40, None],
'min_samples_split': [2, 5, 10, 15]
}
random_search = RandomizedSearchCV(
RandomForestClassifier(),
param_dist,
n_iter=20, # Number of random combinations
cv=5,
n_jobs=-1
)
random_search.fit(X_train, y_train)
Random Forest: n_estimators, max_depth, min_samples_split
SVM: C (regularization), kernel, gamma
Neural Networks: learning_rate, batch_size, hidden_layers, neurons
XGBoost: learning_rate, max_depth, n_estimators, subsample
Start with defaults, then tune the most important parameters.
Binary Classification: Logistic Regression, Random Forest, XGBoost
Multi-class Classification: Random Forest, XGBoost, Neural Networks
Regression: Linear Regression, Random Forest, Gradient Boosting
Clustering: k-Means, DBSCAN, Hierarchical Clustering
Small Data (<10k samples): simpler models such as Logistic Regression, SVM, k-NN
Large Data (>100k samples): SGD-based linear models, gradient boosting, Neural Networks
High Dimensional (many features): regularized linear models, or reduce dimensions first (e.g., PCA)
Imbalanced Classes: class weights or resampling (e.g., SMOTE); evaluate with precision/recall or AUC instead of accuracy
Need interpretability? → Logistic Regression or Decision Tree
Need high accuracy? → XGBoost or Random Forest
Have images/text? → Neural Networks (CNN/RNN)
Limited time? → Start with Random Forest
Always try multiple algorithms.
Data Leakage: information from the test set leaks into training.
Class Imbalance: one class dominates the dataset (e.g., 95% vs 5%).
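Common mitigations for imbalance are class weighting, resampling, and choosing metrics other than accuracy. A minimal class-weighting sketch with scikit-learn, assuming hypothetical X_train/y_train:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Most sklearn classifiers accept class_weight='balanced' to up-weight the minority class
clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
# Evaluate with precision/recall/F1 or ROC-AUC rather than accuracy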
# Check for data leakage
from sklearn.model_selection import cross_val_score
# A training score far above the CV score suggests overfitting; a suspiciously high CV score can signal leakage
train_score = model.fit(X, y).score(X, y)
cv_scores = cross_val_score(model, X, y, cv=5)
print(f"CV: {cv_scores.mean():.3f}, Train: {train_score:.3f}")
⚠️ Always validate on unseen data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# 1. Load data
df = pd.read_csv('data.csv')
# 2. Basic exploration
print(df.info())
print(df.describe())
print(df.isnull().sum())
# 3. Prepare features and target
X = df.drop('target', axis=1)
y = df['target']
# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# 5. Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 6. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
# 7. Evaluate
y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# 8. Cross-validation
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
print(f"CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")
# Load data
df = pd.read_csv('file.csv')
# Exploration
df.head()
df.shape
df.dtypes
df.describe()
df.isnull().sum()
# Selection
df['column']
df[['col1', 'col2']]
df[df['age'] > 30]
# Missing values
df.dropna()
df.fillna(df.mean(numeric_only=True))  # Fill numeric columns with their means
# Encoding
pd.get_dummies(df, columns=['category'])
# Group by
df.groupby('category')['value'].mean()
Bookmark this for quick reference!