Case Study Three

Spam Email Classifier Using Clustering and Naive Bayes

David Grijalva, Nicole Norelli, & Mingyang Nick YU

10/01/2021

Abstract

The following deliverable investigated the classification of emails as Spam or non-Spam. Feature extraction was performed via Count Vectorizer and Term Frequency-Inverse Document Frequency (TF-IDF) Vectorizer. Naive Bayes was the primary statistical method used for classification. Additionally, KMeans clustering was explored as a way to create new features and improve classification results. Ultimately, both the model using the TF-IDF Vectorizer alone and the model combining the TF-IDF Vectorizer with an additional KMeans clustering feature were recommended to IT for a final decision.

1. Introduction

This case study focused on predicting whether an email was "Spam" (junk mail) or "not Spam" (referred to as "ham" in the provided data) using Naive Bayes. The data consisted of 9,353 sample emails labeled as either Spam or not Spam. The text from each email was first cleaned and then vectorized. Both a Count Vectorizer and a TF-IDF Vectorizer were implemented to compare their results when used with Naive Bayes for classification. In addition, KMeans clustering was used to create new features and determine whether those features could improve the results of the Naive Bayes classification.

Vectorizers

After unnecessary text was removed, the text was vectorized. Two techniques were explored in this case study. The first, Count Vectorizer, tokenizes the text and then creates a matrix of token counts. This matrix indicates how many times each token appeared in each email. The second technique, TF-IDF Vectorizer, applies the same technique as Count Vectorizer, but follows with a transformation to normalize the count matrix. Instead of a raw representation of the word counts, a formula is applied to more heavily weight the words that appear less frequently within the corpus. TF-IDF for a term (t) in a document (d) is calculated by multiplying the term frequency (tf) in each document by the inverse document frequency (idf) of the term:

$tfidf(t,d) = tf(t,d) \cdot idf(t)$

The inverse document frequency is calculated by taking the log of the total number of documents divided by the document frequency of the term and then adding 1. The purpose of adding 1 is to ensure that terms that occur in all documents are not ignored.
$idf(t) = \log\frac{n}{df(t)} + 1$
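
To make the two representations concrete, the following minimal sketch (assuming scikit-learn and a hypothetical three-email corpus) contrasts the raw counts produced by CountVectorizer with the re-weighted values produced by TfidfVectorizer.

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Hypothetical toy corpus; the actual study used the cleaned email bodies.
corpus = [
    "free money win money now",
    "meeting agenda for monday",
    "win a free vacation now",
]

# Raw token counts: each row is a document, each column a vocabulary term.
count_vec = CountVectorizer()
counts = count_vec.fit_transform(corpus)
print(count_vec.get_feature_names_out())
print(counts.toarray())

# TF-IDF: counts are re-weighted so terms common across the corpus
# contribute less than terms that appear in only a few documents.
tfidf_vec = TfidfVectorizer()
tfidf = tfidf_vec.fit_transform(corpus)
print(tfidf.toarray().round(3))
```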

Naive Bayes

Once a vectorizer was applied, Naive Bayes was used to perform the classification task for this case study. It is based on Bayes Rule:

$P(A|B) = \frac{P(B|A)P(A)} {P(B)}$


This rule can be read as "the probability of A given B is the probability of B given A times the probability of A all divided by the probability of B." Bayes Rule can be extended for multiple variables:

$P(A|B,C) = P(A|\mathbf{x}) = \frac{P(\mathbf{x}|A)P(A)}{P(\mathbf{x})}, \quad \text{where } \mathbf{x} = (B, C)$


An assumption of Naive Bayes is that all "x" variables are independent. For this case study, Multinomial Naive Bayes was used to calculate the probability of each email being either Spam or not Spam. Multinomial Naive Bayes is used for multinomially distributed data, such as word vectors. For each email, the probability of each of the two outcome classes (Spam or not Spam) is calculated using each feature (token) in the email, and the class with the higher probability is chosen for the classification result. For example, if the outcome Spam is represented by "S" and not Spam by "R" for an email with three words represented by "B", "C", and "D":

$P(R|B,C,D) \propto P(R)P(B|R)P(C|R)P(D|R)$

$P(S|B,C,D) \propto P(S)P(B|S)P(C|S)P(D|S)$


The class with the higher score (argmax) is chosen for classification of the email.
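
As a minimal sketch of this scoring step (assuming scikit-learn and a hypothetical labeled toy corpus), MultinomialNB computes the per-class scores internally; predict returns the argmax class, and predict_proba exposes the normalized class probabilities.

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Hypothetical toy data; the study used the full email corpus.
texts = ["win free money now", "project meeting at noon",
         "free vacation win now", "lunch meeting tomorrow"]
labels = ["spam", "ham", "spam", "ham"]

vec = CountVectorizer()
X = vec.fit_transform(texts)

# alpha is the smoothing term that handles tokens unseen in a class.
nb = MultinomialNB(alpha=0.1)
nb.fit(X, labels)

new_email = vec.transform(["free money meeting"])
print(nb.predict_proba(new_email))  # per-class probabilities
print(nb.predict(new_email))        # argmax class
```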

Cluster Analysis

KMeans Cluster

In an attempt to improve classification results, cluster analysis was used to create new features. Each instance was assigned to a cluster, and this new feature was incorporated into the Naive Bayes classification. Cluster analysis is an unsupervised technique, meaning there are no targets for the data. It is used to find structure and relationships, and a number of different methods can be used for clustering. All methods are based on the idea of distance and finding data that are "close" to each other, and many different ways of calculating distance can be applied. For this case study, the KMeans method was explored. With the KMeans algorithm, the number of clusters is first specified, and then the data are separated into that number of groups by minimizing the within-cluster sum-of-squares (Scikit-learn documentation). Centroids for each cluster are first chosen randomly, and instances are assigned to the nearest centroid. The centroids are then updated and instances are relabeled, and the process repeats until the centroids stop moving. This results in the specified number of clusters. Each instance is labeled with its cluster, and the cluster feature can then be incorporated into the classification technique.
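
A minimal sketch of the KMeans procedure, using hypothetical two-dimensional data in place of the vectorized emails, is shown below; fit_predict returns the cluster label that becomes the new feature for each instance.

```python
import numpy as np
from sklearn.cluster import KMeans

# Hypothetical 2-D data standing in for the vectorized emails.
rng = np.random.default_rng(42)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])

# Specify the number of clusters, then minimize within-cluster sum-of-squares.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X)

print(cluster_labels[:10])   # cluster id assigned to each instance
print(kmeans.inertia_)       # within-cluster sum-of-squares
```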

Determining the number of clusters to specify for the KMeans model is not as straightforward as selecting the model with the lowest inertia (mean within-cluster sum-of-squares), because inertia decreases as the number of clusters increases. To get a basic idea of the best number of clusters, a graph of the number of clusters versus inertia can be examined; typically, an elbow in the graph indicates an ideal number of clusters. A more refined alternative is to use a silhouette score (Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow). A silhouette score is calculated using the silhouette coefficient:

$\text{Silhouette Coefficient} = \frac{b - a}{\max(a, b)}$



Where "b" is the mean nearest-cluster distance and "a" is the mean intra-cluster distance (Scikit-learn documentation). Values range from -1 to 1, with 1 being best and -1 being worst. Values around 0 suggest overlapping clusters, and negative values suggest instances assigned to wrong clusters. Silhouette coefficients can also be visualized using silhouette diagrams.


2. Methods

Initial Data Observations

The study dataset contained 9,353 observations from two classes: the Spam class represented 25% of all emails, while the Ham class represented 75%. The dataset contained a total of 22 encoding types. Of these, 14 were accepted and 8 were dropped because they contained unusual encodings, including foreign languages.

Parsing emails

The email data needed to be properly parsed in order to apply a vectorizer and fit a Naive Bayes model. Because the emails were initially provided in several folders, creating one dataframe with each email's location and its label (spam/ham) was necessary. The first step in parsing the emails was to write a nested for loop that detected each email's location within the data directory. This loop iterated through all the folders inside the data parent directory. If a subdirectory's name contained either "spam" or "ham", the absolute paths of all of that folder's contents were appended to a dataframe with two columns: column 1, called "email_location", contained the absolute path of each file, and column 2, called "label", contained the "ham" or "spam" label.
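
A minimal sketch of this first step is shown below; the data/ directory layout and the variable names are assumptions based on the description above.

```python
import os
import pandas as pd

rows = []
data_dir = "data"  # assumed parent directory containing "spam"/"ham" subfolders
for root, dirs, files in os.walk(data_dir):
    folder = os.path.basename(root).lower()
    if "spam" in folder or "ham" in folder:
        label = "spam" if "spam" in folder else "ham"
        for name in files:
            rows.append({
                "email_location": os.path.abspath(os.path.join(root, name)),
                "label": label,
            })

# One row per email: its absolute path and its spam/ham label.
emails_df = pd.DataFrame(rows, columns=["email_location", "label"])
```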

To properly read and parse the emails, a series of utility functions was created to handle encoding and email structure (particularly multipart emails), convert HTML to plain text, and remove unwanted characters. These functions were used to identify the desired text from each email file and change it into the proper format for use with a vectorizer.

load_email

The load_email function reads an email from an absolute path, decodes it using the "LATIN1" encoding, and returns an email.message.Message object from the Python Standard Library.
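
A minimal sketch of load_email, based on the description above (Latin-1 decoding and the standard library email parser):

```python
import email

def load_email(path):
    """Read a raw email file, decoding it as Latin-1, and return a Message object."""
    with open(path, "r", encoding="latin1") as f:
        return email.message_from_file(f)
```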

get_email_structure

The get_email_structure function reads an email object and uses the .get_payload() method to get the email's payload. It then checks whether the payload is a list object. If it is, the function returns a string stating "multipart" followed by the structure of every element inside the list. If the payload is not a list object, the function returns the string from the .get_content_type() method.
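
A sketch of get_email_structure consistent with this description, recursing into list payloads for multipart messages:

```python
def get_email_structure(msg):
    """Return a string describing the content structure of an email message."""
    payload = msg.get_payload()
    if isinstance(payload, list):
        # Multipart message: describe each sub-part recursively.
        parts = ", ".join(get_email_structure(part) for part in payload)
        return "multipart({})".format(parts)
    return msg.get_content_type()
```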

html_to_plain

The html_to_plain function is a very simple function that converts an email in HTML format to plain text using the BeautifulSoup package. Additionally, the function eliminates all "\n" characters by replacing them with white space.
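
A sketch of html_to_plain based on this description, assuming BeautifulSoup's built-in html.parser:

```python
from bs4 import BeautifulSoup

def html_to_plain(html):
    """Convert an HTML email body to plain text."""
    text = BeautifulSoup(html, "html.parser").get_text()
    # Replace newlines with spaces, as described above.
    return text.replace("\n", " ")
```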

email_to_plain

The email_to_plain function converts the email content to plain text. It first reads an email object and gets the email structure using the get_email_structure function. A for loop using the .walk() method iterates over all the parts of the message object, and for each part it gets the content type using the .get_content_type() method. If the part's content type is not "text/plain" or "text/html", the function returns the part's payload. If the part's content type is "text/plain", the function returns the part's payload as plain text. In every other case, the function assumes the part is in HTML format and returns the result of html_to_plain, which converts the HTML to plain text.
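
A sketch consistent with this description is shown below; as an assumption, non-text parts (such as multipart containers and attachments) are skipped rather than returned, and the html_to_plain helper sketched above is reused.

```python
def email_to_plain(msg):
    """Extract a plain-text body from an email message object."""
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type not in ("text/plain", "text/html"):
            # Skip containers and attachments and keep looking for text.
            continue
        payload = part.get_payload(decode=True)
        text = payload.decode("latin1", errors="replace") if payload else ""
        if content_type == "text/plain":
            return text
        # text/html part: convert it to plain text.
        return html_to_plain(text)
    return ""
```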

get_email

The get_email function is the main function used to parse the emails. It first checks the email structure using the get_email_structure function and, based on that structure, chooses the best course of action to parse the email. If the email structure is of type "multipart/alternative", it returns the structure as a string. If not, it returns a string with the email content obtained from the email_to_plain function.
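
A sketch of get_email based on this description, reusing the load_email, get_email_structure, and email_to_plain helpers sketched above; taking a file path as input is an assumption based on how the function is later applied to the email_location column.

```python
def get_email(path):
    """Main parsing helper: return the text content of the email at the given path."""
    msg = load_email(path)
    structure = get_email_structure(msg)
    if msg.get_content_type() == "multipart/alternative":
        # As described above, fall back to returning the structure string.
        return str(structure)
    return str(email_to_plain(msg))
```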

clean_email

The clean_email function is very simple. Its logic is to replace the selected text from a string with a blank space.
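
A sketch of clean_email consistent with how it is used later (removing "\n" and "\t" characters); the exact signature is an assumption.

```python
def clean_email(text):
    """Replace selected characters in the email text with a blank space."""
    # Newlines and tabs are the characters removed in this study.
    return text.replace("\n", " ").replace("\t", " ")
```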

get_encoding

The get_encoding function reads an email object and uses the .get_content_charset() method to get the encoding.
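
A one-line sketch of get_encoding based on this description:

```python
def get_encoding(msg):
    """Return the character set declared by an email message object."""
    return msg.get_content_charset()
```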

find_not_valid_chars

The find_not_valid_chars function flags strings with no valid characters. To do this, it first checks that the value is actually a string and not a None object. If it is a string, the function uses a regex pattern to identify whether there are no alphanumeric characters in the string. If the regex pattern matches, the function returns True; if not, it returns False.
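
A sketch of find_not_valid_chars based on this description; the exact regex pattern is an assumption.

```python
import re

def find_not_valid_chars(text):
    """Return True when a string contains no alphanumeric characters at all."""
    if not isinstance(text, str):
        return False
    # Matches only if the entire string consists of non-alphanumeric characters.
    return re.fullmatch(r"[^A-Za-z0-9]*", text) is not None
```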

The second step to parse the emails was to apply the get_email function to the email_location column and create a new column called "raw_email" with the parsed email raw text.

The third step was to clean and inspect the raw text. This step had three main parts: removing unwanted characters from the emails, identifying and inspecting emails that contained invalid characters, and removing emails with unusual encoding.

To remove unwanted characters from the emails, the clean_email function was used. The function helped remove all "\n" and "\t" characters.

To identify and inspect emails containing invalid characters, the find_not_valid_chars function was used. This created a new dataframe column called "not_valid" containing the boolean values returned by the function. Using dataframe filtering, every row with a "True" value in the "not_valid" column was returned and inspected.

To remove unwanted encoding, an encoding column was added to the dataframe using the get_encoding function. Then all unusual encodings were filtered out of the dataframe. To get the list of unusual encodings, a for loop was run over the email objects and each encoding was checked using the .get_content_charset() method. If the encoding was not part of the previously approved encoding list, it was added to the unusual encoding list.

Finally, one email whose body was null was deleted prior to training.

Train a supervised model

Each of the supervised models was created using a pipeline. A pipeline allowed for the streamlining of token transformation using CountVectorizer() or TfidfVectorizer() and fitting the MultinomialNB with either vectorizer while trying various parameters to tune the models. Pipelines prevent data leakage when using grid search with 10-fold cross-validation in order to narrow down the best parameters for the text transformers and the models. Due to the nature of how the email dataframe was built, the email labels were in sequential order. To address this, data were first shuffled before 10-fold cross-validation to introduce more randomness for each cross-validation split. This shuffle reduced the differences between test scores on the cross-validation splits. A random state was also set for reproducibility.

First Classification Model: MultinomialNB using TfidfVectorizer

A pipeline was created for the TfidfVectorizer and MultinomialNB objects under the variable 'pipeline' to tune the alpha, TF-IDF max features, and ngram range hyperparameters and assess predictions. The TfidfVectorizer object was also set to remove English stop words, an optional parameter offered by the Scikit-learn API. The grid had the following hyperparameter options:

The model was fitted using a GridSearchCV object containing the pipeline and the grid, with "roc_auc" as the scoring method. The best scoring combination had a TF-IDF vocabulary size of 12,000, a TF-IDF ngram range of (1,1) (meaning only unigrams were used), and an alpha value of 0.1.
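
The exact candidate values searched are not reproduced here; the sketch below shows a plausible pipeline and grid consistent with this description and the reported best parameters (max_features of 12,000, ngram_range of (1,1), alpha of 0.1). The candidate lists, the emails_df dataframe, and the binary label encoding are assumptions for illustration. The CountVectorizer pipeline described next follows the same pattern, with CountVectorizer in place of TfidfVectorizer.

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, KFold

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("nb", MultinomialNB()),
])

# Hypothetical grid; each list contains the reported best value.
grid = {
    "tfidf__max_features": [8000, 10000, 12000],
    "tfidf__ngram_range": [(1, 1), (1, 2), (2, 2)],
    "nb__alpha": [0.1, 0.5, 1.0],
}

# Shuffled 10-fold cross-validation with a fixed random state, scored by ROC AUC.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
search = GridSearchCV(pipeline, grid, scoring="roc_auc", cv=cv)

# Hypothetical fit call on the parsed email dataframe, with spam encoded as 1:
# search.fit(emails_df["raw_email"], (emails_df["label"] == "spam").astype(int))
# print(search.best_params_)
```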

Second Classification Model: MultinomialNB using CountVectorizer

A pipeline was created for the CountVectorizer and MultinomialNB objects under the variable 'cv_pipeline' to tune the alpha, Count Vectorizer (cv) max features, and ngram range hyperparameters and assess predictions. The CountVectorizer object was also set to remove English stop words, an optional parameter offered by the Scikit-learn API. The grid had the following hyperparameter options:

The model was fitted using a GridSearchCV object containing the cv_pipeline and the grid, with "roc_auc" as the scoring method. The best scoring combination had a Count Vectorizer vocabulary size of 12,000, an ngram range of (2,2) (meaning only bigrams were used), and an alpha value of 0.1.

Exploring additional labels using KMeans Clustering

For clustering, the KMeans algorithm introduced above was used. No pipelines were used for this section, and the fitting of each object was done separately.

First Clustering Model: using CountVectorizer

To fit the clustering model, the team first created a variable called countVec, which was assigned a CountVectorizer using the best hyperparameters discovered during the previous GridSearch. The parameters used were:

The email raw text was fitted and transformed using the CountVectorizer object, and the transformed data was assigned to a variable called cv_X. To select the best number for $k$, the elbow method (Fig. 1) as well as the silhouette coefficient (Fig. 2) were utilized. To determine the elbow, inertia (within-cluster sum-of-squares) was plotted against the number of clusters. The point where the average distance from the centroid suddenly falls appears to be around $k=3$. However, plotting the silhouette coefficient for numbers of clusters around $k = 3$ (specifically $k = 3, 4, 5, 6$) revealed a problem: the vast majority of the emails (all except 3) were assigned to one cluster, so this method could hardly provide additional benefit and was not pursued further.

Figure 1: Inertia vs. Number of Clusters with Elbow identified
Figure 2: Silhouette Diagram of Silhouette Coefficient for Clusters of 3, 4, 5, and 6

Second Clustering Model: using TfidfVectorizer

To fit the clustering model, the team first created a variable called tfidf_vectorizer, which was assigned a TfidfVectorizer using the best hyperparameters from the previous GridSearch results with TfidfVectorizer and Multinomial Naive Bayes. The parameters were:

The email raw text was fitted and transformed using the TF-IDF object, and the transformed data was assigned to a variable called X_cluster. To select the best number for $k$, the elbow method (Fig. 3) as well as the silhouette coefficient (Fig. 4) were utilized. Inertia (within-cluster sum-of-squares) was plotted against the number of clusters. The point where the average distance from the centroid suddenly falls appears to be around $k=10$; however, the elbow is not very distinct in the Inertia vs. Number of Clusters plot.

Plotting the silhouette coefficient for numbers of clusters around $k = 10$ (specifically $k = 8, 9, 10, 11$) showed that $k = 9$ generated the highest silhouette coefficient for each group, and the size of each group was more uniform at $k = 9$. Thus, nine clusters were chosen for KMeans.

Figure 3: Inertia vs. Number of Clusters with Elbow identified
Figure 4: Silhouette Diagram of Silhouette Coefficient for Clusters of 8, 9, 10, and 11

Third Classification Model: MultinomialNB using TfidfVectorizer and Clustering as preprocessing

The last model created was a MultinomialNB classification model using both TfidfVectorizer and Clustering as preprocessing steps.

The email text data was fitted and transformed to TF-IDF format using the best parameters from the first classification model (TfidfVectorizer), and the result was assigned to the X_cluster variable. The KMeans model was then fitted on top of the TF-IDF results using nine clusters, as described above.

The first step was to fit the KMeans model on X_cluster, which had been transformed to TF-IDF form using the best variables identified through GridSearch. Once the KMeans model was fitted, the .predict method was used to get the predictions on the X_cluster dataset, and these were assigned to the kmeans_tf_label variable. Each of the cluster labels was then one-hot encoded using the get_dummies method provided by the Pandas API, and the one-hot encoded labels were assigned to the cluster_label_dummies variable. The numpy hstack method was used to merge the cluster_label_dummies and X_cluster variables. The result was assigned to the X_train variable, an inclusive dataset containing both the clustering labels (one-hot encoded) and the TF-IDF vectors.

Finally, the X_train variable was fitted to a MultinomialNB model with an alpha of 0.1, which was the best parameter found during the previous GridSearch.
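
A minimal sketch of this third model is shown below, using a hypothetical toy corpus in place of the parsed emails. As assumptions: scipy.sparse.hstack is used to combine the sparse TF-IDF matrix with the one-hot encoded cluster labels (the report describes numpy's hstack, which requires dense arrays), and two clusters are used instead of nine because of the tiny toy data.

```python
import pandas as pd
from scipy.sparse import hstack
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Hypothetical stand-in for the cleaned email text and labels.
texts = ["win free money now", "project meeting at noon",
         "free vacation win now", "lunch meeting tomorrow"]
labels = ["spam", "ham", "spam", "ham"]

# TF-IDF transform using hyperparameters from the earlier grid search.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=12000)
X_cluster = tfidf_vectorizer.fit_transform(texts)

# Fit KMeans and predict a cluster label for every email.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(X_cluster)
kmeans_tf_label = kmeans.predict(X_cluster)

# One-hot encode the cluster labels and append them to the TF-IDF matrix.
cluster_label_dummies = pd.get_dummies(kmeans_tf_label)
X_train = hstack([X_cluster, cluster_label_dummies.values.astype(float)])

# Fit the final Naive Bayes model with the best alpha from the grid search.
nb = MultinomialNB(alpha=0.1).fit(X_train, labels)
```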


3. Results

Models

Table 1 shows a comparison of the three different models attempted using Multinomial Naive Bayes: TF-IDF alone, TF-IDF with the KMeans clustering feature, and Count Vectorizer alone. It can be observed that TF-IDF and TF-IDF with clustering generated the highest overall accuracy at 99%.

Precision for a class is the proportion of instances the model predicts to be that class that actually belong to it. Recall for a class is the proportion of instances in that class that the model correctly identifies. Using the TF-IDF with clustering model as an example, precision for ham is 0.986, which means that among all the emails predicted to be ham, 98.6% actually are ham. Recall for ham is 0.995, which means that among all the emails that actually are ham, the model identified 99.5% of them. Precision and recall for each class under each model are shown in Table 1.

| Model | Ham precision | Ham recall | Spam precision | Spam recall | Overall accuracy |
|---|---|---|---|---|---|
| TFIDF alone | 0.985 | 0.998 | 0.994 | 0.955 | 0.99 |
| TFIDF with clustering | 0.986 | 0.995 | 0.986 | 0.958 | 0.99 |
| Count Vectorizer | 0.969 | 0.997 | 0.990 | 0.907 | 0.97 |

Table 1: Overall performance for each model

The confusion matrices for TF-IDF (Fig. 5) and TF-IDF with clustering (Fig. 6) display the prediction details of each model when making predictions on the entire dataset.

Between the TF-IDF and TF-IDF with clustering models, the choice is a trade-off between whether predicting ham correctly or predicting spam correctly is more important. If the company can tolerate a few more occasional spam emails and wants as few ham emails as possible classified as spam, the TF-IDF model is preferred. Conversely, if the company prefers a stricter spam filter and can tolerate occasionally going into the spam folder to find an email that is ham, then the TF-IDF with clustering model is preferred. Both models do a superb job of predicting ham or spam correctly with simple cleaning of the email body and then applying Multinomial Naive Bayes to the TF-IDF features (or TF-IDF plus cluster labels) with the entire email body text as the corpus.

Figure 5: Confusion matrix for TFIDF
Figure 6: Confusion matrix for TFIDF with Clustering

4. Conclusion

The model that used TF-IDF alone with Naive Bayes was better at predicting ham emails correctly, with occasional additional errors on spam prediction, while the model that used TF-IDF, clustering, and Naive Bayes was stricter in identifying spam, with occasionally more ham emails identified as spam. Both can be recommended to an IT department, which can ultimately decide which model to use based on company requirements. Count Vectorizer with Naive Bayes generated a much less ideal recall score on spam prediction: among all the spam emails, the Count Vectorizer model identified them correctly only 90.7% of the time. This is around 5% lower than either the TF-IDF alone or TF-IDF with clustering models, so this method is not recommended to the IT department.

Although it was not investigated directly, one can infer that term frequency-inverse document frequency, which takes into consideration how relevant a word is to a document within a collection of documents, might work better than simply counting the words that appear in each document. Certain words might appear more often in spam than in ham emails, and TF-IDF is better at highlighting their relative importance than simple counts.

Future directions for building a more robust spam filter could include adding the subject field or the sender's email address as additional features, as well as trying various other machine learning models to keep improving precision and recall for both classes. However, the current scores for both TF-IDF and TF-IDF with clustering are already very high, and any additional improvement might be deemed trivial by an IT department. It is also worth noting that Multinomial Naive Bayes is a very fast, probability-based machine learning algorithm compared to other methods. If an IT department has many more emails to classify than the sample emails provided in this study, and the company is looking to launch the email filter soon, Multinomial Naive Bayes, which has already generated ideal trial results, can save the company time and money.

Appendix - Code

Classification - TFIDF

Classification - CountVectorizer

Clustering - CountVectorizer

Based on the above analysis of the number of clusters k, the elbow method seems more informative, and k=3 seems worth trying.

However, based on the clustering labels shown, clustering using CountVectorizer does not seem to provide added benefit.

Clustering - TFIDF

k around 10 seems to be what the elbow method with inertia identifies, but further exploration shows that k=9 generates the best silhouette scores across all clusters.