
Chapter 6: Predictive Analytics for Fashion Forecasting: Exercises and Solutions


Exercise 1:

Write a program that loads a dataset from a CSV file, splits it into training and testing sets using train_test_split, and fits a Support Vector Machine (SVM) classifier on the training data. Finally, evaluate the model using accuracy_score on the test set.

Dataset 

import pandas as pd

from sklearn.datasets import make_classification


# Generate synthetic dataset

X, y = make_classification(

    n_samples=1000,

    n_features=5,

    n_informative=3,

    n_redundant=2,

    n_classes=2,

    random_state=42

)


# Create a DataFrame from the generated data

df = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5'])

df['target_variable'] = y


# Save the dataset to a CSV file

df.to_csv('dataset.csv', index=False)


Solution

import pandas as pd

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score


# Load the dataset from CSV file

df = pd.read_csv('dataset.csv')


# Define the predictor variables and target variable

X = df.drop('target_variable', axis=1)

y = df['target_variable']


# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create and fit the Support Vector Machine classifier

svm = SVC()

svm.fit(X_train, y_train)


# Make predictions on the test set

y_pred = svm.predict(X_test)


# Evaluate the model

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)


Exercise 2:

Create a program that reads a dataset from a CSV file, preprocesses the data by scaling the numerical features and encoding categorical variables, and then performs dimensionality reduction using Principal Component Analysis (PCA). Fit a logistic regression model on the transformed data and evaluate its performance using cross_val_score.

Dataset

import pandas as pd

from sklearn.datasets import make_classification



# Generate synthetic dataset

X, y = make_classification(

    n_samples=1000,

    n_features=5,

    n_informative=3,

    n_redundant=2,

    n_classes=2,

    random_state=42

)


# Create a DataFrame from the generated data

df = pd.DataFrame(X, columns=['numerical1', 'numerical2', 'numerical3', 'categorical1', 'categorical2'])

df['target_variable'] = y


# Discretize two of the generated features into categorical string labels
# (the raw values are continuous, so they are split at the median)

df['categorical1'] = (df['categorical1'] > df['categorical1'].median()).map({True: 'A', False: 'B'})

df['categorical2'] = (df['categorical2'] > df['categorical2'].median()).map({True: 'X', False: 'Y'})


# Save the dataset to a CSV file

df.to_csv('dataset.csv', index=False)


Solution

import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score


# Load the dataset from CSV file

df = pd.read_csv('dataset.csv')


# Separate the predictor variables and target variable

X = df.drop('target_variable', axis=1)

y = df['target_variable']


# Preprocess the data

# Scale the numerical features

numerical_features = X.select_dtypes(include=['float64', 'int64'])

scaler = StandardScaler()

scaled_numerical_features = scaler.fit_transform(numerical_features)


# Encode categorical variables

categorical_features = X.select_dtypes(include=['object'])

encoder = OneHotEncoder(sparse_output=False)

encoded_categorical_features = encoder.fit_transform(categorical_features)


# Combine the scaled numerical and encoded categorical features

preprocessed_X = pd.DataFrame(

    data=scaled_numerical_features,

    columns=numerical_features.columns

).join(

    pd.DataFrame(

        data=encoded_categorical_features,

        columns=encoder.get_feature_names_out(categorical_features.columns)

    )

)


# Perform dimensionality reduction using PCA

pca = PCA(n_components=3)

transformed_X = pca.fit_transform(preprocessed_X)


# Fit a logistic regression model on the transformed data

logreg = LogisticRegression()

logreg.fit(transformed_X, y)


# Evaluate the model using cross_val_score

scores = cross_val_score(logreg, transformed_X, y, cv=5)

average_accuracy = scores.mean()

print("Average Accuracy:", average_accuracy)


Exercise 3:

Write a program that loads a dataset from a CSV file, splits it into training and testing sets, and trains a Random Forest Classifier on the training data. Use GridSearchCV to tune the hyperparameters of the Random Forest Classifier and find the best combination. Finally, evaluate the model's performance on the test set using classification_report.

Dataset 

import pandas as pd

from sklearn.datasets import make_classification


# Generate synthetic dataset

X, y = make_classification(

    n_samples=1000,

    n_features=5,

    n_informative=3,

    n_redundant=2,

    n_classes=2,

    random_state=42

)


# Create a DataFrame from the generated data

df = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5'])

df['target_variable'] = y


# Save the dataset to a CSV file

df.to_csv('dataset.csv', index=False)


Solution

import pandas as pd

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report


# Load the dataset from CSV file

df = pd.read_csv('dataset.csv')


# Separate the predictor variables and target variable

X = df.drop('target_variable', axis=1)

y = df['target_variable']


# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create a Random Forest Classifier

rf = RandomForestClassifier()


# Define the hyperparameters to tune

param_grid = {

    'n_estimators': [100, 200, 300],

    'max_depth': [None, 5, 10],

    'min_samples_split': [2, 5, 10]

}


# Perform GridSearchCV to find the best combination of hyperparameters

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)


# Get the best model

best_model = grid_search.best_estimator_


# Make predictions on the test set

y_pred = best_model.predict(X_test)


# Evaluate the model's performance

report = classification_report(y_test, y_pred)

print("Classification Report:")

print(report)
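
It can also be useful to inspect which hyperparameter combination the grid search selected and its cross-validated score (a small optional addition):

# Inspect the winning hyperparameters and their mean cross-validation accuracy
print("Best parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)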


Exercise 4:

Create a program that reads a dataset from a CSV file, preprocesses the data by imputing missing values and scaling the features, and splits it into training and testing sets. Fit a K-Nearest Neighbors (KNN) classifier on the training data and determine the optimal value of K using cross-validation. Evaluate the model's performance on the test set using accuracy_score.

Dataset 

import pandas as pd

from sklearn.datasets import make_classification

from numpy import nan


# Generate synthetic dataset with missing values

X, y = make_classification(

    n_samples=1000,

    n_features=5,

    n_informative=3,

    n_redundant=2,

    n_classes=2,

    random_state=42

)


# Introduce missing values

X[10:20, 1] = nan

X[50:55, 3] = nan

X[200:210, 2] = nan


# Create a DataFrame from the generated data

df = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5'])

df['target_variable'] = y


# Save the dataset to a CSV file

df.to_csv('dataset.csv', index=False)


Solution

import pandas as pd

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import accuracy_score


# Load the dataset from CSV file

df = pd.read_csv('dataset.csv')


# Separate the predictor variables and target variable

X = df.drop('target_variable', axis=1)

y = df['target_variable']


# Preprocess the data

# Impute missing values

imputer = SimpleImputer(strategy='mean')

X_imputed = imputer.fit_transform(X)


# Scale the features

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X_imputed)


# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


# Fit a K-Nearest Neighbors (KNN) classifier

k_values = [3, 5, 7, 9, 11]  # Values of K to evaluate

best_accuracy = 0

best_k = 0


for k in k_values:

    knn = KNeighborsClassifier(n_neighbors=k)

    scores = cross_val_score(knn, X_train, y_train, cv=5)

    average_accuracy = scores.mean()


    if average_accuracy > best_accuracy:

        best_accuracy = average_accuracy

        best_k = k


# Fit the best KNN model on the training data

knn = KNeighborsClassifier(n_neighbors=best_k)

knn.fit(X_train, y_train)


# Make predictions on the test set

y_pred = knn.predict(X_test)


# Evaluate the model's performance

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)



Exercise 5:

Write a program that loads a dataset from a CSV file and preprocesses the data by applying a feature selection technique such as SelectKBest or Recursive Feature Elimination (RFE). Split the data into training and testing sets and train a Decision Tree Classifier on the selected features. Evaluate the model's performance using a confusion matrix and plot the decision tree using graphviz.

Dataset

import pandas as pd

from sklearn.datasets import make_classification


# Generate synthetic dataset

X, y = make_classification(

    n_samples=1000,

    n_features=10,

    n_informative=5,

    n_redundant=2,

    n_classes=2,

    random_state=42

)


# Create a DataFrame from the generated data

df = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5',

                              'feature6', 'feature7', 'feature8', 'feature9', 'feature10'])

df['target_variable'] = y


# Save the dataset to a CSV file

df.to_csv('dataset.csv', index=False)


Solution

import pandas as pd

from sklearn.feature_selection import SelectKBest, RFE

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix

from sklearn.tree import export_graphviz

import graphviz


# Load the dataset from CSV file

df = pd.read_csv('dataset.csv')


# Separate the predictor variables and target variable

X = df.drop('target_variable', axis=1)

y = df['target_variable']


# Preprocess the data - Apply feature selection

# SelectKBest

kbest = SelectKBest(k=3)  # Select top 3 features

X_selected = kbest.fit_transform(X, y)


# Recursive Feature Elimination (RFE)

# estimator = DecisionTreeClassifier()  # or any other classifier

# rfe = RFE(estimator, n_features_to_select=3)  # Select top 3 features

# X_selected = rfe.fit_transform(X, y)


# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


# Train a Decision Tree Classifier on the selected features

dt = DecisionTreeClassifier()

dt.fit(X_train, y_train)


# Make predictions on the test set

y_pred = dt.predict(X_test)


# Evaluate the model's performance using a confusion matrix

cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")

print(cm)


# Plot the decision tree using graphviz

dot_data = export_graphviz(dt, out_file=None, filled=True, rounded=True, special_characters=True)

graph = graphviz.Source(dot_data)

graph.render("decision_tree")  # Save the decision tree to a file

