Exercise 1:
Write a program that loads a dataset from a CSV file, splits it into training and testing sets using train_test_split, and fits a Support Vector Machine (SVM) classifier on the training data. Finally, evaluate the model using accuracy_score on the test set.
Dataset
import pandas as pd
from sklearn.datasets import make_classification
# Generate synthetic dataset
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=3,
    n_redundant=2,
    n_classes=2,
    random_state=42
)
# Create a DataFrame from the generated data
df = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5'])
df['target_variable'] = y
# Save the dataset to a CSV file
df.to_csv('dataset.csv', index=False)
Solution
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the dataset from CSV file
df = pd.read_csv('dataset.csv')
# Define the predictor variables and target variable
X = df.drop('target_variable', axis=1)
y = df['target_variable']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and fit the Support Vector Machine classifier
svm = SVC()
svm.fit(X_train, y_train)
# Make predictions on the test set
y_pred = svm.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Exercise 2:
Create a program that reads a dataset from a CSV file, preprocesses the data by scaling the numerical features and encoding categorical variables, and then performs dimensionality reduction using Principal Component Analysis (PCA). Fit a logistic regression model on the transformed data and evaluate its performance using cross_val_score.
Dataset
import pandas as pd
from sklearn.datasets import make_classification
# Generate synthetic dataset
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=3,
    n_redundant=2,
    n_classes=2,
    random_state=42
)
# Create a DataFrame from the generated data
df = pd.DataFrame(X, columns=['numerical1', 'numerical2', 'numerical3', 'categorical1', 'categorical2'])
df['target_variable'] = y
# Convert two columns to categorical string labels by thresholding at the median
# (the raw make_classification output is continuous, so a direct 0/1 map would produce NaN)
df['categorical1'] = (df['categorical1'] > df['categorical1'].median()).map({False: 'A', True: 'B'})
df['categorical2'] = (df['categorical2'] > df['categorical2'].median()).map({False: 'X', True: 'Y'})
# Save the raw dataset to a CSV file; scaling and encoding are left to the solution
df.to_csv('dataset.csv', index=False)
Solution
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Load the dataset from CSV file
df = pd.read_csv('dataset.csv')
# Separate the predictor variables and target variable
X = df.drop('target_variable', axis=1)
y = df['target_variable']
# Preprocess the data
# Scale the numerical features
numerical_features = X.select_dtypes(include=['float64', 'int64'])
scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)
# Encode categorical variables
categorical_features = X.select_dtypes(include=['object'])
encoder = OneHotEncoder(sparse_output=False)  # sparse_output replaces the deprecated sparse argument in newer scikit-learn
encoded_categorical_features = encoder.fit_transform(categorical_features)
# Combine the scaled numerical and encoded categorical features
preprocessed_X = pd.DataFrame(
    data=scaled_numerical_features,
    columns=numerical_features.columns
).join(
    pd.DataFrame(
        data=encoded_categorical_features,
        columns=encoder.get_feature_names_out(categorical_features.columns)
    )
)
# Perform dimensionality reduction using PCA
pca = PCA(n_components=3)
transformed_X = pca.fit_transform(preprocessed_X)
# Fit a logistic regression model on the transformed data
logreg = LogisticRegression()
logreg.fit(transformed_X, y)
# Evaluate the model using cross_val_score
scores = cross_val_score(logreg, transformed_X, y, cv=5)
average_accuracy = scores.mean()
print("Average Accuracy:", average_accuracy)
Exercise 3:
Write a program that loads a dataset from a CSV file, splits it into training and testing sets, and trains a Random Forest Classifier on the training data. Use GridSearchCV to tune the hyperparameters of the Random Forest Classifier and find the best combination. Finally, evaluate the model's performance on the test set using classification_report.
Dataset
import pandas as pd
from sklearn.datasets import make_classification
# Generate synthetic dataset
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=3,
    n_redundant=2,
    n_classes=2,
    random_state=42
)
# Create a DataFrame from the generated data
df = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5'])
df['target_variable'] = y
# Save the dataset to a CSV file
df.to_csv('dataset.csv', index=False)
Solution
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
# Load the dataset from CSV file
df = pd.read_csv('dataset.csv')
# Separate the predictor variables and target variable
X = df.drop('target_variable', axis=1)
y = df['target_variable']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a Random Forest Classifier
rf = RandomForestClassifier()
# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}
# Perform GridSearchCV to find the best combination of hyperparameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Get the best model
best_model = grid_search.best_estimator_
# Make predictions on the test set
y_pred = best_model.predict(X_test)
# Evaluate the model's performance
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
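It is often useful to also report which hyperparameter combination the grid search selected. A short follow-up, reusing the grid_search object from the solution above:

# Best hyperparameter combination and its mean cross-validated accuracy
print("Best parameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)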
Exercise 4:
Create a program that reads a dataset from a CSV file, preprocesses the data by imputing missing values and scaling the features, and splits it into training and testing sets. Fit a K-Nearest Neighbors (KNN) classifier on the training data and determine the optimal value of K using cross-validation. Evaluate the model's performance on the test set using accuracy_score.
Dataset
import pandas as pd
from sklearn.datasets import make_classification
from numpy import nan
# Generate synthetic dataset with missing values
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=3,
    n_redundant=2,
    n_classes=2,
    random_state=42
)
# Introduce missing values
X[10:20, 1] = nan
X[50:55, 3] = nan
X[200:210, 2] = nan
# Create a DataFrame from the generated data
df = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5'])
df['target_variable'] = y
# Save the dataset to a CSV file
df.to_csv('dataset.csv', index=False)
Solution
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
# Load the dataset from CSV file
df = pd.read_csv('dataset.csv')
# Separate the predictor variables and target variable
X = df.drop('target_variable', axis=1)
y = df['target_variable']
# Preprocess the data
# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Fit a K-Nearest Neighbors (KNN) classifier
k_values = [3, 5, 7, 9, 11] # Values of K to evaluate
best_accuracy = 0
best_k = 0
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    average_accuracy = scores.mean()
    if average_accuracy > best_accuracy:
        best_accuracy = average_accuracy
        best_k = k
# Fit the best KNN model on the training data
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
# Make predictions on the test set
y_pred = knn.predict(X_test)
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Exercise 5:
Write a program that loads a dataset from a CSV file, preprocesses the data by applying feature selection techniques such as SelectKBest or Recursive Feature Elimination (RFE). Split the data into training and testing sets and train a Decision Tree Classifier on the selected features. Evaluate the model's performance using a confusion matrix and plot the decision tree using graphviz.
Dataset
import pandas as pd
from sklearn.datasets import make_classification
# Generate synthetic dataset
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_classes=2,
    random_state=42
)
# Create a DataFrame from the generated data
df = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5',
'feature6', 'feature7', 'feature8', 'feature9', 'feature10'])
df['target_variable'] = y
# Save the dataset to a CSV file
df.to_csv('dataset.csv', index=False)
Solution
import pandas as pd
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
import graphviz
# Load the dataset from CSV file
df = pd.read_csv('dataset.csv')
# Separate the predictor variables and target variable
X = df.drop('target_variable', axis=1)
y = df['target_variable']
# Preprocess the data - Apply feature selection
# SelectKBest
kbest = SelectKBest(k=3) # Select top 3 features
X_selected = kbest.fit_transform(X, y)
# Recursive Feature Elimination (RFE)
# estimator = DecisionTreeClassifier() # or any other classifier
# rfe = RFE(estimator, n_features_to_select=3) # Select top 3 features
# X_selected = rfe.fit_transform(X, y)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
# Train a Decision Tree Classifier on the selected features
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
# Make predictions on the test set
y_pred = dt.predict(X_test)
# Evaluate the model's performance using a confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
# Plot the decision tree using graphviz
dot_data = export_graphviz(dt, out_file=None, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("decision_tree") # Save the decision tree to a file
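If the graphviz system binary is not available, the tree can also be drawn with matplotlib via sklearn.tree.plot_tree. A small sketch reusing the fitted dt, the kbest selector, and X from the solution above:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
# Names of the columns kept by SelectKBest, for readable node labels
selected_columns = list(X.columns[kbest.get_support()])
plt.figure(figsize=(12, 8))
plot_tree(dt, feature_names=selected_columns, filled=True, rounded=True, max_depth=3)
plt.savefig("decision_tree.png")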