Fraud is a serious problem for industries such as telecommunications and banking, taking forms such as identity theft, telemarketing fraud, and bank fraud. Fraud costs these industries millions in revenue each year, and data mining can be used to improve fraud detection. By applying data mining techniques, we can reduce fraudulent transactions by building a profile of each customer's calling behavior or account activity. Comparing new activity against a customer's profile, we can then look for outliers in the data and flag them to prevent fraud. Our business will be successful once we are able to detect outliers in our consumers' activities and label them as fraudulent. If a consumer makes a purchase that is an outlier, our system will flag that activity. If a consumer's identity is used outside of their normal scope of activity, our system will flag it as fraudulent. Our criterion for a successful outcome will be met when we can detect outliers in consumers' purchases more than 80% of the time.
A dataset containing synthetic bank payment data was acquired from Kaggle.com in the form of a 48 MB .csv file, bs140513_032310.csv. The data were generated by BankSim, an agent-based simulator developed by Edgar Lopez-Rojas and Stefan Axelsson. BankSim was calibrated using a sample subset of real transactional data, aggregated from a larger population, provided by a bank in Spain. The dataset is readily accessible on Kaggle.com after signing up for an account.
The BankSim synthetic data is structured, tabular data composed of both categorical and quantitative elements. Nominal and ordinal data types are both present in the categorical fields. The data is also sequential: the transactions made by an account at different points in time heavily influence whether a transaction is flagged as fraudulent or not.
The generated dataset contains no personal information (PI) or disclosure of legal and private customer transactions, making it suitable for research purposes. The data covers a six-month timeframe, from November 2012 to April 2013, and is restricted to zip codes in Madrid and Barcelona. There are 15 merchant categories that differentiate the payments made, and all prices are given in euros. Zip Code One (ZC1), one of the biggest zip codes by payment volume, is the only zip code available in our dataset.
Novelty detection algorithms have previously been applied to synthetic data to demonstrate the performance of outlier detection. Fraudsters and other threat agents adapt their behavior to avoid the account controls set by financial institutions and legislation, e.g., making smaller transactions that fall just below an alert threshold. There is a lack of data available for research in fields such as money laundering, financial fraud, and illegal payments, which leads to in-house solutions that are not shared with the public. Real data also has the shortcoming of not containing enough, or sufficiently diverse, fraudulent activity to build a machine learning model from.
Fraud scenarios implemented in BankSim were based on selected cases from the Grant Thornton report, Member and Council (2009). The focus was on card-related fraud, which refers to cases where the important data on the card is compromised: account name, credit card number, expiration date, and verification code.
Theft scenarios include cases where the customer loses physical possession of their card; flagging this scenario could entail seeing a high number of unusual, high-value transactions in a short period of time.
Cloned card/skimming scenarios include cases where a clone of the card is created, without knowledge of the card owner. Flagging this behavior could entail seeing a high number of unusual transactions with high value in a short period of time. Other flags include seeing simultaneous payments in different physical locations, or seeing the card used far from previously known locations.
Internet purchase scenarios include cases where fraudsters use "carding" websites, which check the validity of a card instantly, to purchase immaterial goods on the internet. This tells the fraudster that the card is still valid before they use it in person. Flagging this behavior could entail blacklisting carding websites and cross-referencing them with current user activity to detect any unusual purchases after the carding was executed.
During a normal step of the simulation, a customer that enters the simulation can decide to purchase an item or service from one of the offered categories. Once the category has been selected, the customer senses nearby merchants that offer that category and listens to their offers. If an offer is accepted, the transaction takes place and the merchant registers the payment. Each step in the simulation represents a day of commercial activity. Currently, the dataset does not differentiate between days of the week when feeding the consumption pattern; all days of the week are treated the same.
BankSim was run for 180 steps several times, and its parameters were calibrated to obtain a distribution reliable enough for testing. Thieves were injected with the aim of stealing an average of 3 cards per step and performing 2 fraudulent transactions per day. As a result, 594,643 records were produced, of which 7,200 were fraudulent transactions. That is still a very high fraud rate (roughly 1.2% of transactions); the simulation was intentionally programmed to produce aggressive fraud behavior while honoring the distributions found in the original data.
Our project follows the CRISP-DM process model, which can be outlined as follows: business understanding, data understanding, data preparation, modeling, evaluation, and deployment.
The goal of this project is to learn about the concepts, principles, techniques, and applications of machine learning. Secondary goals include learning Python.
Fraud is a serious problem for industries such as telecommunications and banking, taking forms such as identity theft, telemarketing fraud, and bank fraud. Fraud costs these industries millions in revenue each year, and data mining can be used to improve fraud detection. By applying data mining techniques, we can reduce fraudulent transactions by building a profile of each customer's calling behavior or account activity. Comparing new activity against a customer's profile, we can then look for outliers in the data and flag them to prevent fraud.
Our business will be successful once we are able to detect outliers in our consumers' activities and label them as fraudulent. If a consumer makes a purchase that is an outlier, our system will flag that activity. If a consumer's identity is used outside of their normal scope of activity, our system will flag it as fraudulent.
There would be some cost involved for the business, but the reward outweighs the cost. If the project is successful, the business would benefit by being able to protect its customers from fraudulent activity on their accounts, and more users would want to use the business because of the added security.
Our goals are to increase the user count of partner banks by protecting their users from fraud, and to gather user information and determine whether there are any fraudulent transactions based on the location, amount, and frequency of a user's purchases. We will be using outlier detection to find the fraudulent transactions.
Our criterion for a successful outcome will be met when we can successfully detect outliers in consumers' purchases more than 80% of the time.
Phase | Time | Resources | Risks |
---|---|---|---|
Business understanding | 1 week | All analysts | Data problems, technology problems |
Data understanding | 3 weeks | All analysts | Data problems, technology problems |
Data preparation | 5 weeks | All analysts | Data problems, technology problems |
Modeling | 2 weeks | All analysts | Coding problems, technology problems |
Evaluation | 1 week | All analysts | Poor model results, inability to produce findings |
Deployment | 1 week | All analysts | Inability to implement models, no deployment launch |
We rely on Python's open-source libraries and associated packages: pandas, NumPy, scikit-learn, SciPy, Matplotlib, seaborn, missingno, and pyalluvial.
We also rely on open-source datasets available on the internet to acquire bank transaction data; in our particular case, the data is synthetic. We then prepared our data to create predictive classification models that determine whether new data is valid or represents a potentially fraudulent transaction based on its outlier profile.
The data for our Bank Fraud Checker system consists of synthetic, i.e. generated, data based on real but limited bank transaction data from an unidentified financial institution in Barcelona, Spain.
The data for our model is downloaded in the form of a csv file and imported into a pandas dataframe to facilitate data analysis.
As you can see below, the data consists of over 500,000 records with fully populated cells across 10 fields. Due to the lack of diversity in zip code data, those columns were eliminated from our analysis.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # show every expression's output in a cell
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
# load the cleaned BankSim data into a dataframe
df = pd.read_csv("data/bs_clean.csv")
df.head()
 | step | customer | age | gender | merchant | category | amount | fraud
---|---|---|---|---|---|---|---|---
0 | 0 | C1093826151 | 4 | M | M348934600 | transportation | 4.55 | 0 |
1 | 0 | C352968107 | 2 | M | M348934600 | transportation | 39.68 | 0 |
2 | 0 | C2054744914 | 4 | F | M1823072687 | transportation | 26.89 | 0 |
3 | 0 | C1760612790 | 3 | M | M348934600 | transportation | 17.25 | 0 |
4 | 0 | C757503768 | 5 | M | M348934600 | transportation | 35.72 | 0 |
import missingno as msno
# bar chart of non-null counts per column confirms the dataset has no missing values
msno.bar(df)
<AxesSubplot:>
As seen below in the histogram of our label, fraud, the stark imbalance between the number of fraudulent and valid transactions poses serious issues.
An imbalanced dataset is one in which the classes are distributed unequally, as with our fraud field.
Fraudulent transactions constitute only about 1.2% of the entire dataset, roughly a 1:82 ratio of fraudulent to valid transactions.
This imbalance can create problems for our classification task: a model trained on this data will be biased toward predicting valid transactions.
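As a quick numeric check on that imbalance (a small sketch, not part of the original notebook), the label distribution can be tabulated directly from the dataframe loaded above:
# counts and proportions of valid (0) vs. fraudulent (1) transactions
df["fraud"].value_counts()
df["fraud"].value_counts(normalize=True)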
sns.histplot(data=df, x="fraud")
plt.xticks(np.arange(0, 2, 1))
<AxesSubplot:xlabel='fraud', ylabel='Count'>
The only continuous numeric field is amount, which contains the value (in Euros) of the purchases made by customers from merchants. When plotted, the distribution of this data shows it is highly skewed to the right (positive skew).
sns.set(rc = {'figure.figsize':(15,8)})
sns.histplot(data=df, x="amount", hue="fraud", multiple="stack")
<AxesSubplot:xlabel='amount', ylabel='Count'>
Looking at the frequency of transactions by day, or “step”, broken down by “fraud”, other than a spike in valid transactions every 30 days or so, there is no telling trend as to how many fraudulent transactions occur on any given day. We are not provided with the time of day, day of the week, or month of the year to help us derive any patterns in the occurrence of fraudulent transactions.
Typically, this time/date data would be very helpful because seasonality plays a significant role in when a fraudster engages in fraudulent activity, choosing to act when both the likelihood of getting caught and the likelihood of financial transactions being closely scrutinized are low.
What we can tell from this chart is that valid and fraudulent transactions occur daily in roughly the same proportion (about 1:82), and that proportion is kept consistent throughout the data generation period (180 steps).
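As a rough check on that consistency (a sketch we did not include in the original analysis), the per-step fraud proportion can be summarized directly:
# fraction of fraudulent transactions on each day (step)
df.groupby("step")["fraud"].mean().describe()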
# frequency count of field by fraud
sns.histplot(data=df, x="step", hue="fraud", multiple="stack")
<AxesSubplot:xlabel='step', ylabel='Count'>
Taking another view of the daily transactions, we see in the graphs below the “amount” of each transaction plotted against the step. The fraudulent transactions are by and large much higher in “amount” than the valid transactions. While we do see some valid transactions reaching up to the €2,000 mark, most valid transactions remain below the €200 average line we see in the bottom graph. The average “amount” of the fraudulent transactions fluctuates wildly around the €600 mark as the steps progress.
# mean transaction amount by field by fraud
# the lines are error bars representing the uncertainty around the mean estimate
sns.lineplot(data=df, x="step", y="amount", hue="fraud")
<AxesSubplot:xlabel='step', ylabel='amount'>
From a visual inspection, then, it is clear that the majority of fraudulent transactions represent the higher-“amount” purchases. The heavy skewness observed earlier in the “amount” histogram is largely an artifact of the fraudulent transactions, which therefore represent true outliers. If a rudimentary fraud alert system were based on a simple “amount” threshold, it would work fairly well for our given dataset.
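To illustrate the point, such a rudimentary threshold rule can be scored directly against the fraud label; the €500 cutoff below is purely illustrative and not part of the original notebook:
# hypothetical single-threshold alert: flag any transaction above 500 euro
flagged = (df["amount"] > 500).astype(int)
# how often the flag agrees with the actual fraud label
pd.crosstab(df["fraud"], flagged, rownames=["fraud"], colnames=["flagged"])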
Apart from the “amount” field, our dataset primarily contained categorical data. A look at “age” showed the age group where the most fraudulent transactions occurred was group 2, or people aged between 26 and 35 years old, followed by group 3 (36-45 years) then 4 (46-55 years). All age groups contained fraudulent transactions.
sns.stripplot(data=df, x="step", y="amount", hue="fraud")
plt.xticks(np.arange(0, 180, 18))
<AxesSubplot:xlabel='step', ylabel='amount'>
The average fraudulent “amount” across all age groups floats around the €500 mark. However, given the very low counts of both valid and fraudulent transactions for age groups 0 (18 years and under) and U (unknown), the average fraud “amount” for those groups should immediately raise questions. Notwithstanding the stereotype of youth as impetuous high spenders, their lack of access to large amounts of money makes the high average “amount” in their category stand out, especially given the small number of transactions within it.
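The counts behind this observation can be tabulated directly (a sketch; this aggregation is not shown elsewhere in the notebook):
# number of fraudulent transactions and their average amount per age group
df[df["fraud"] == 1].groupby("age")["amount"].agg(["count", "mean"])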
sns.histplot(data=df, x="age", hue="fraud", multiple="stack")
<AxesSubplot:xlabel='age', ylabel='Count'>
sns.barplot(data=df, x="age", y="amount", hue="fraud")
<AxesSubplot:xlabel='age', ylabel='amount'>
The same anomalous behavior can also be seen with the “Enterprise” category in the “gender” field. There are only 7 counts of fraud in that category, but the average fraud “amount” is greater than for the “male” and “female” categories, which have thousands of records. While it may be true that company expenditures are naturally much larger than those of individuals, the types of purchases we examine below do not account for it. As with some purchase types, there are no fraudulent transactions for the “Unknown” gender type.
sns.histplot(data=df, x="gender", hue="fraud", multiple="stack")
<AxesSubplot:xlabel='gender', ylabel='Count'>
sns.barplot(data=df, x="gender", y="amount", hue="fraud")
<AxesSubplot:xlabel='gender', ylabel='amount'>
There are 15 purchase types identified in the “category” field. Of those, “transportation”, “content”, and “food” do not contain any fraudulent transactions. The fact that the “transportation” category alone has over 500,000 records is testament to the class imbalance problem mentioned earlier in this paper. Of the remaining categories, “travel” accounts for the largest fraudulent purchase amounts; the average “amount” for this purchase category is over €2,500.
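These claims can be checked with a per-category aggregation (again a sketch, not part of the original notebook):
# per-category totals: number of frauds (sum), number of transactions (count), and fraud rate (mean)
df.groupby("category")["fraud"].agg(["sum", "count", "mean"])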
sns.histplot(data=df, y="category", hue="fraud", multiple="stack")
<AxesSubplot:xlabel='Count', ylabel='category'>
sns.barplot(data=df, y="category", x="amount", hue="fraud")
<AxesSubplot:xlabel='amount', ylabel='category'>
Even though the number of fraudulent transactions in the travel category was small, it had the fraudulent transactions with the highest expenditures across age and gender.
# bubble plot: purchase category vs. age, sized by amount and colored by gender
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(data=df, x='age', y='category', size='amount', hue='gender', alpha=0.5, sizes=(10, 1000), ax=ax)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=9)
<AxesSubplot:xlabel='age', ylabel='category'>
We created an alluvial chart that shows the dataset color-coded by fraud (orange for fraud and blue for valid). Each transaction is represented by a stream flowing from the gender type to the associated age of the customer, and from there to the type of purchase that was made. Because the data streams are very fine and intersect multiple times, we do not see any clear trend this way. This speaks to the high entropy in our dataset and suggests why a decision tree would be better suited to deriving insights and patterns from the bank transaction data than traditional data mining.
import pyalluvial.alluvial as alluvial
# pre-aggregated counts of transactions by gender, age, and category, split by fraud
freq_df = pd.read_csv("alluvial.csv")
fig = alluvial.plot(df=freq_df, xaxis_names=['gender', 'age', 'category'], y_name='count', alluvium='fraud', ignore_continuity=False, figsize=(20, 100))
plt.ylabel('Transactions')
The average number of transactions per customer over the 180-day period was 143, compared to 11,749 transactions per merchant. The average occurrence of fraud per customer over the 180-day period was about 2, while the average occurrence of fraud per merchant was 144. The average customer purchase amount during this period was about €35, while the average merchant sale amount was about €130. The average amount stolen per customer during this period was €198, while the average amount defrauded from merchants was €357.
# avg customer purchases per day
(df.groupby(["step"])["customer"].count()/df.groupby(["step"])["customer"].nunique()).mean()
# avg merchant sales per day
(df.groupby(["step"])["merchant"].count()/df.groupby(["step"])["merchant"].nunique()).mean()
1.0384574855236957
85.38324425832103
A hash table was created to store how many fraudulent and valid transactions occurred per customer, along with the purchase and stolen amounts; the same was done for merchants. From these tables, we could derive the average purchase and the average theft per customer and per merchant. Of the 4,112 individual customers, 1,483 (36%) experienced theft by a fraudster. Of the 50 individual merchants, 30 (60%) experienced fraud.
customer = {}
merchant = {}
for index, row in df.iterrows():
    # populating the customer table
    if row.customer not in customer:
        customer[row.customer] = {}
        customer[row.customer]["purchases"] = []
        customer[row.customer]["stolen"] = []
        if row.fraud == 0:
            customer[row.customer]["transactions"] = 1
            customer[row.customer]["frauds"] = 0
            customer[row.customer]["purchases"].append(row.amount)
        elif row.fraud == 1:
            customer[row.customer]["transactions"] = 0
            customer[row.customer]["frauds"] = 1
            customer[row.customer]["stolen"].append(row.amount)
    else:
        if row.fraud == 0:
            customer[row.customer]["transactions"] += 1
            customer[row.customer]["purchases"].append(row.amount)
        elif row.fraud == 1:
            customer[row.customer]["frauds"] += 1
            customer[row.customer]["stolen"].append(row.amount)
    # populating the merchant table
    if row.merchant not in merchant:
        merchant[row.merchant] = {}
        merchant[row.merchant]["sales"] = []
        merchant[row.merchant]["stolen"] = []
        if row.fraud == 0:
            merchant[row.merchant]["transactions"] = 1
            merchant[row.merchant]["frauds"] = 0
            merchant[row.merchant]["sales"].append(row.amount)
        elif row.fraud == 1:
            merchant[row.merchant]["transactions"] = 0
            merchant[row.merchant]["frauds"] = 1
            merchant[row.merchant]["stolen"].append(row.amount)
    else:
        if row.fraud == 0:
            merchant[row.merchant]["transactions"] += 1
            merchant[row.merchant]["sales"].append(row.amount)
        elif row.fraud == 1:
            merchant[row.merchant]["frauds"] += 1
            merchant[row.merchant]["stolen"].append(row.amount)
In our entire set of half a million data points, the data is made up of transactions between 4112 customers and 50 merchants.
# how many individual customers and merchants
len(customer)
len(merchant)
4112
50
Looking through the merchant hash table, there were merchants that experienced more fraudulent transactions than valid transactions. Cross-referencing the merchant IDs with the purchase categories, we determined that fraud occurs most prevalently against merchants associated with leisure, travel, hotel services, sports and toys, and home purchases.
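The customer and merchant hash tables built above are converted into dataframes (the c_df and m_df used below) before the per-customer and per-merchant averages are computed. That conversion cell is not reproduced in this report, but a minimal version would look like:
# one row per customer / merchant ID, with the lists and counters as columns
c_df = pd.DataFrame.from_dict(customer, orient="index")
m_df = pd.DataFrame.from_dict(merchant, orient="index")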
# average purchase amount per customer
Pavg = []
for index, row in c_df.iterrows():
    Pavg.append(sum(row["purchases"]) / row["transactions"])
c_df["avg_purchase"] = Pavg
# average theft amount per customer (0 if the customer was never defrauded)
Tavg = []
for index, row in c_df.iterrows():
    if row.frauds > 0:
        Tavg.append(sum(row["stolen"]) / row["frauds"])
    else:
        Tavg.append(0)
c_df["avg_theft"] = Tavg
c_df[c_df.avg_theft > 0].head()
 | purchases | stolen | transactions | frauds | avg_purchase | avg_theft
---|---|---|---|---|---|---
C765155274 | [9.1, 14.39, 18.96, 36.39, 23.22, 8.41, 10.7, ... | [752.23] | 175 | 1 | 39.201143 | 752.230 |
C623601481 | [68.79, 58.38, 78.92, 2.78, 14.92, 31.77, 45.1... | [431.88, 2372.22, 521.63, 1888.43, 541.61] | 89 | 5 | 28.985955 | 1151.154 |
C194016923 | [30.19, 31.45, 8.54, 5.15, 24.48, 32.2, 10.8, ... | [164.04, 1142.23] | 158 | 2 | 29.775127 | 653.135 |
C834963773 | [40.69, 4.93, 20.86, 37.22, 20.19, 11.84, 51.6... | [747.24, 667.76, 437.47, 96.59, 244.63] | 178 | 5 | 33.194944 | 438.738 |
C124539163 | [10.09, 21.87, 20.25, 14.75, 29.59, 17.67, 55.... | [4574.72, 85.87] | 75 | 2 | 32.948533 | 2330.295 |
The average number of valid transactions per customer was 143, while the average number of frauds within the same time period was roughly 2 fraudulent transactions. The average valid purchase amount was only about €35, while the average fraudulent purchase amount was nearly €200.
# avg customer transactions over 180 days
c_df["transactions"].mean()
# avg customer fraud over 180 days
c_df["frauds"].mean()
# avg customer expenditure during this period
c_df["avg_purchase"].mean()
# avg customer fraud during this period
c_df["avg_theft"].mean()
142.86065175097275
1.7509727626459144
34.34324428801419
197.37380636624428
We created a table containing each merchant ID with features including the list of valid transactions, the list of fraudulent transactions, the number of each kind of transaction, and their average amounts. We notice a pattern wherein valid transactions are much higher in volume but represent much lower purchase amounts on average, whereas the opposite holds for the fraudulent transactions per merchant.
# average sale amount per merchant
Savg = []
for index, row in m_df.iterrows():
    Savg.append(sum(row["sales"]) / row["transactions"])
m_df["avg_sale"] = Savg
# average fraudulent amount per merchant (0 if the merchant saw no fraud)
Favg = []
for index, row in m_df.iterrows():
    if row.frauds > 0:
        Favg.append(sum(row["stolen"]) / row["frauds"])
    else:
        Favg.append(0)
m_df["avg_theft"] = Favg
m_df[m_df.avg_theft > 0].head()
 | sales | stolen | transactions | frauds | avg_sale | avg_theft
---|---|---|---|---|---|---
M50039827 | [68.79, 59.51, 98.24, 163.03, 115.87, 20.7, 10... | [1025.56, 295.57, 493.79, 520.11, 130.56, 590.... | 870 | 46 | 105.229092 | 409.394130 |
M1888755466 | [87.67, 25.0, 84.39, 24.29, 19.25, 116.01, 96.... | [66.6, 189.22, 41.48, 572.01, 386.21, 226.78, ... | 684 | 228 | 75.685497 | 316.469605 |
M480139044 | [266.59, 44.14, 248.42, 55.82, 50.88, 83.93, 2... | [44.26, 324.5, 667.09, 520.5, 289.21, 560.9, 9... | 1874 | 1634 | 103.299803 | 406.857032 |
M692898500 | [171.07, 109.26, 187.62, 237.48, 195.44, 27.84... | [112.55, 830.57, 143.09, 607.85, 904.51, 411.0... | 884 | 16 | 105.148835 | 418.039375 |
M348875670 | [114.54, 127.84, 199.95, 35.57, 134.89, 154.92... | [112.44, 321.46, 145.84, 0.8, 141.22, 420.81, ... | 97 | 10 | 111.385361 | 211.485000 |
The average number of valid transactions per merchant was 11,749, while the average number of frauds within the same time period was 144 fraudulent transactions. The average valid sale amount was only about €130, while the average fraudulent purchase amount was nearly €360.
# avg merchant transactions over 180 days
m_df["transactions"].mean()
# avg merchant fraud over 180 days
m_df["frauds"].mean()
# avg merchant revenue during this period
m_df["avg_sale"].mean()
# avg merchant fraud during this period
m_df["avg_theft"].mean()
11748.86
144.0
129.176631312623
356.63137368984593
The most exploited merchants, defined as those having more fraudulent transactions than valid ones in our dataset, are associated with the following categories:
exploited = df[["merchant", "category"]][df.merchant.isin(m_df[m_df.frauds > m_df.transactions].index)]
exploited.groupby(["merchant"])["category"].unique()
merchant
M1294758098    [leisure]
M1353266412    [hotelservices]
M17379832      [sportsandtoy]
M1873032707    [hotelservices]
M2011752106    [hotelservices]
M2080407379    [travel]
M2122776122    [home]
M3697346       [leisure]
M732195782     [travel]
M857378720     [hotelservices]
M980657600     [sportsandtoy]
Name: category, dtype: object
When analyzing the skewness and kurtosis of the full “amount” data, we find we have an excess kurtosis of 1425.31 and a skewness of 32.37 (for perspective, both values are 0 for a normal distribution).
from scipy.stats import kurtosis, skew
print('excess kurtosis of amount (0 for a normal distribution): {}'.format(kurtosis(df.amount)))
print('skewness of amount (0 for a normal distribution): {}'.format(skew(df.amount)))
excess kurtosis of amount (0 for a normal distribution): 1425.3116885527731
skewness of amount (0 for a normal distribution): 32.36575650728976
Looking at the statistical descriptors of the amount field below, we see that across the 594,643 records the mean transaction “amount” is roughly €38. The minimum purchase is €0 and the maximum is €8,330. The skewness is apparent when noting that the 50th percentile of all those records is only €27; given the maximum value and the number of transactions, the positive skew is obvious. The kurtosis is likewise apparent when noting that the 75th percentile is only €43: again, considering the maximum value and the number of transactions, an extreme kurtosis is evident.
df.amount.describe()
count    594643.000000
mean         37.890135
std         111.402831
min           0.000000
25%          13.740000
50%          26.900000
75%          42.540000
max        8329.960000
Name: amount, dtype: float64
A density plot visualizes the distribution of data over a continuous interval or time period. It is similar to a histogram, but uses kernel smoothing to plot values, allowing for smoother distributions by smoothing out the noise. Density plots have a few advantages over histograms, such as being better at conveying the shape of the distribution because they are not affected by the number of bins used. The vast majority of the transactions fall within the €10 to €100 range, consistent with the extreme skewness and kurtosis we measured.
df['amount'].plot.density(logx=True)
<AxesSubplot:ylabel='Density'>
We wanted to see if we could accurately identify fraudulent transactions by looking at outliers. We first tried calculating outliers using the IQR method. The interquartile range for amount is about €29, so under this method any transaction above €85.74 is considered an outlier, which gives us far too many false positives.
# IQR
Q1 = df.amount.quantile(0.25)
Q3 = df.amount.quantile(0.75)
IQR = Q3 - Q1
print("Interquartile Range for amount: " )
IQR
# Outliers
Maximum = Q3 + (1.5 * IQR)
print("Maximum outliers for amount: ")
Maximum
Interquartile Range for amount:
28.799999999999997
Maximum outliers for amount:
85.74
Q1 = df.amount.quantile(0.25)
Q3 = df.amount.quantile(0.75)
IQR = Q3 - Q1
# flag any amount above the upper IQR fence as an outlier
conditions = [(df.amount > (Q3 + 1.5 * IQR)), (df.amount < (Q3 + 1.5 * IQR))]
values = [1, 0]
df["IQR_outlier"] = np.select(conditions, values)
df.head()
 | step | customer | age | gender | merchant | category | amount | fraud | age_code | gender_code | category_code | IQR_outlier
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | C1093826151 | 4 | M | M348934600 | transportation | 4.55 | 0 | 4 | 2 | 12 | 0 |
1 | 0 | C352968107 | 2 | M | M348934600 | transportation | 39.68 | 0 | 2 | 2 | 12 | 0 |
2 | 0 | C2054744914 | 4 | F | M1823072687 | transportation | 26.89 | 0 | 4 | 1 | 12 | 0 |
3 | 0 | C1760612790 | 3 | M | M348934600 | transportation | 17.25 | 0 | 3 | 2 | 12 | 0 |
4 | 0 | C757503768 | 5 | M | M348934600 | transportation | 35.72 | 0 | 5 | 2 | 12 | 0 |
lowerBound = Q1 - 1.5*IQR
upperBound = Q3 + 1.5*IQR
print("The lower outlier bound for amount is: ", lowerBound)  # non-existent, since amounts are never negative
print("The upper outlier bound for amount is: ", upperBound)
The lower outlier bound for amount is:  -29.459999999999994
The upper outlier bound for amount is:  85.74
Since the IQR method flagged far too many transactions, we tried the Z-score method with a 1.5 standard deviation threshold. This gave us much better results: 7,341 flagged transactions, only 141 more than the 7,200 actual frauds.
mean = df["amount"].mean()
std = np.std(df["amount"])
print('mean of the dataset is', mean)
print('std. deviation is', std)
threshold = 1.5
outlier = []
for i in df["amount"]:
z = (i-mean)/std
if abs(z) > threshold:
outlier.append(i)
print('The number of outliers in the dataset is', len(outlier))
mean of the dataset is 37.89013530807561
std. deviation is 111.40273725877348
The number of outliers in the dataset is 7341
df["z_score"] = (df.amount - mean) / std
# flag any transaction more than 1.5 standard deviations from the mean
conditions = [(df.z_score > 1.5) | (df.z_score < -1.5)]
values = [1]
df["Z_outlier"] = np.select(conditions, values)
df.head()
 | step | customer | age | gender | merchant | category | amount | fraud | age_code | gender_code | category_code | IQR_outlier | z_score | Z_outlier
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | C1093826151 | 4 | M | M348934600 | transportation | 4.55 | 0 | 4 | 2 | 12 | 0 | -0.299276 | 0 |
1 | 0 | C352968107 | 2 | M | M348934600 | transportation | 39.68 | 0 | 2 | 2 | 12 | 0 | 0.016067 | 0 |
2 | 0 | C2054744914 | 4 | F | M1823072687 | transportation | 26.89 | 0 | 4 | 1 | 12 | 0 | -0.098742 | 0 |
3 | 0 | C1760612790 | 3 | M | M348934600 | transportation | 17.25 | 0 | 3 | 2 | 12 | 0 | -0.185275 | 0 |
4 | 0 | C757503768 | 5 | M | M348934600 | transportation | 35.72 | 0 | 5 | 2 | 12 | 0 | -0.019480 | 0 |
For identifying outliers that correspond to fraud, the Z-score method clearly outperformed the IQR method: it flagged 7,341 transactions versus 25,798 for the IQR method, against 7,200 actual frauds.
df.fraud.value_counts()
df.IQR_outlier.value_counts()
df.Z_outlier.value_counts()
0    587443
1      7200
Name: fraud, dtype: int64
0    568845
1     25798
Name: IQR_outlier, dtype: int64
0    587302
1      7341
Name: Z_outlier, dtype: int64
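Cross-tabulating each flag against the actual label shows the false positives and false negatives behind those counts (a sketch, not part of the original notebook):
# confusion-style comparison of each outlier flag against the fraud label
pd.crosstab(df["fraud"], df["IQR_outlier"])
pd.crosstab(df["fraud"], df["Z_outlier"])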
We had to encode our categorical features for correlation analysis and data preparation for our modeling. Below are the code descriptions for our categorical feature values.
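The encoding step itself is not reproduced here, since the age_code, gender_code, and category_code columns were already present in the cleaned file we loaded; one way to generate integer codes matching the mappings shown below would be pandas categorical codes (a sketch, assuming the raw columns hold the string values):
# map each categorical value to an integer code (codes follow sorted order)
for col in ["age", "gender", "category"]:
    df[col + "_code"] = df[col].astype("category").cat.codes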
df["category_code"].groupby(df["category"]).unique()
category
barsandrestaurants    [0]
content               [1]
fashion               [2]
food                  [3]
health                [4]
home                  [5]
hotelservices         [6]
hyper                 [7]
leisure               [8]
otherservices         [9]
sportsandtoy          [10]
tech                  [11]
transportation        [12]
travel                [13]
wellnessandbeauty     [14]
Name: category_code, dtype: object
df["gender_code"].groupby(df["gender"]).unique()
gender
E    [0]
F    [1]
M    [2]
U    [3]
Name: gender_code, dtype: object
df["age_code"].groupby(df["age"]).unique()
age
0    [0]
1    [1]
2    [2]
3    [3]
4    [4]
5    [5]
6    [6]
U    [7]
Name: age_code, dtype: object
# pairwise relationships on a 10,000-row sample, colored by fraud
sns.pairplot(df.sample(10000), hue="fraud", diag_kind='kde')
A correlation matrix represents the correlations between pairs of variables in the given data; the correlation coefficient denotes the strength of the relationship between two variables. The plot below is a coolwarm heat map that shows the strength of each relationship with color, blue for weak correlation and red for strong correlation. When the matrix compares a variable with itself, the correlation coefficient is 1. Apart from the derived outlier columns, the only variable pair with a significant correlation is fraud and amount, with a correlation coefficient of 0.49. This tells us that whether a transaction is fraudulent is related to the amount of the transaction.
# correlation heatmap
df.corr().style.background_gradient(cmap='coolwarm')
 | step | amount | fraud | age_code | gender_code | category_code | IQR_outlier | z_score | Z_outlier
---|---|---|---|---|---|---|---|---|---
step | 1.000000 | -0.007961 | -0.011898 | 0.001169 | -0.001107 | -0.017269 | -0.004574 | -0.007961 | -0.007755 |
amount | -0.007961 | 1.000000 | 0.489967 | -0.003930 | -0.012888 | -0.098738 | 0.416670 | 1.000000 | 0.546959 |
fraud | -0.011898 | 0.489967 | 1.000000 | -0.004315 | -0.025047 | -0.114272 | 0.444686 | 0.489967 | 0.669257 |
age_code | 0.001169 | -0.003930 | -0.004315 | 1.000000 | 0.005020 | 0.004816 | -0.002577 | -0.003930 | -0.002632 |
gender_code | -0.001107 | -0.012888 | -0.025047 | 0.005020 | 1.000000 | 0.007700 | -0.016979 | -0.012888 | -0.018109 |
category_code | -0.017269 | -0.098738 | -0.114272 | 0.004816 | 0.007700 | 1.000000 | -0.304427 | -0.098738 | -0.147449 |
IQR_outlier | -0.004574 | 0.416670 | 0.444686 | -0.002577 | -0.016979 | -0.304427 | 1.000000 | 0.416670 | 0.524990 |
z_score | -0.007961 | 1.000000 | 0.489967 | -0.003930 | -0.012888 | -0.098738 | 0.416670 | 1.000000 | 0.546959 |
Z_outlier | -0.007755 | 0.546959 | 0.669257 | -0.002632 | -0.018109 | -0.147449 | 0.524990 | 0.546959 | 1.000000 |
A chi-square test is used in statistics to test the independence of two categorical variables; it measures how far the observed counts deviate from the expected counts. We use this test to determine the relationship between each independent categorical feature (our feature set) and the dependent categorical feature (our label). In feature selection, we aim to select the features that are highly dependent on the label. We determined that each feature is dependent on fraud.
# Pearson chi-square test of independence between each categorical feature and fraud
from scipy.stats import chi2_contingency
from scipy.stats import chi2
columns = ['age', 'gender', 'category']
stats = []
for col in columns:
    result = {}
    myCrosstable = pd.crosstab(df[col], df['fraud'])
    chiVal, pVal, dof, exp = chi2_contingency(myCrosstable)
    # interpret the test statistic
    # Test Statistic >= Critical Value: reject null hypothesis, dependent (Ha)
    # Test Statistic < Critical Value: fail to reject null hypothesis, independent (H0)
    # chi2.ppf(q, df, loc=0, scale=1) is the inverse CDF
    prob = 0.95  # significance level = 1 - 0.95 = 0.05
    critical = chi2.ppf(prob, dof)
    result['column'] = col
    result['critical'] = round(critical, 2)
    result['chiVal'] = round(chiVal, 2)
    if chiVal >= critical:
        result['H0'] = 'reject/dependent'
    else:
        result['H0'] = 'fail to reject/independent'
    # interpret the p-value
    # p-value <= alpha: reject null hypothesis, dependent (Ha)
    # p-value > alpha: fail to reject null hypothesis, independent (H0)
    alpha = 0.05
    result['significance'] = round(alpha, 2)
    result['p'] = round(pVal, 2)
    if pVal <= alpha:
        result['dependent'] = 'Dependent (reject H0)'
    else:
        result['independent'] = 'Independent (fail to reject H0)'
    stats.append(result)
stats
[{'column': 'age', 'critical': 14.07, 'chiVal': 44.15, 'H0': 'reject/dependent', 'significance': 0.05, 'p': 0.0, 'dependent': 'Dependent (reject H0)'},
 {'column': 'gender', 'critical': 7.81, 'chiVal': 393.43, 'H0': 'reject/dependent', 'significance': 0.05, 'p': 0.0, 'dependent': 'Dependent (reject H0)'},
 {'column': 'category', 'critical': 23.68, 'chiVal': 193862.64, 'H0': 'reject/dependent', 'significance': 0.05, 'p': 0.0, 'dependent': 'Dependent (reject H0)'}]
Since calculating outliers was not the best approach to determining whether a data point was fraudulent, we decided to use machine learning algorithms for our binary classification problem. This made it easy to choose, implement, and test any classification algorithm we wanted. We chose logistic regression, linear support vector machines, k-nearest neighbors, decision tree, random forest, and multi-layer perceptron classifiers. These classifiers are known to work well on binary classification problems such as ours, and given that we have only 4-6 features (depending on how many we include in the feature set) and about half a million rows, they have no problem handling our data; the longest any of them took to fit on our training data was about 2 minutes. Apart from the parameters we had to set explicitly, such as k, the number of neighbors in our KNN, or which solver to use for logistic regression, we kept the default parameter values provided by the library. This decision was made after reading the documentation for each classifier to understand which parameters were relevant and how their values should be chosen, and we found that the defaults worked well for our simple dataset. For the decision tree, random forest, and multi-layer perceptron classifiers, we set the random_state parameter; providing an integer seed ensures the same results across different calls by controlling the random number generator.
Next we prepare our feature set and label. We determined that the only features relevant to our class prediction are a customer's age, gender, purchase category, and purchase amount.
# creating feature set
X = df[["age_code", "gender_code", "category_code", "amount"]]
X.head()
 | age_code | gender_code | category_code | amount
---|---|---|---|---
0 | 4 | 2 | 12 | 4.55 |
1 | 2 | 2 | 12 | 39.68 |
2 | 4 | 1 | 12 | 26.89 |
3 | 3 | 2 | 12 | 17.25 |
4 | 5 | 2 | 12 | 35.72 |
y = df["fraud"]
y.head()
0 0 1 0 2 0 3 0 4 0 Name: fraud, dtype: int64
We are splitting our data with a traditional 70/30 training:testing split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(416250, 4) (178393, 4) (416250,) (178393,)
Logistic Regression is a type of Generalized Linear Model (GLM) that uses a logistic function to model a binary variable based on any kind of independent variables. For sklearn's LogisticRegression, the 'liblinear' solver is a good choice for small datasets. For the multi_class parameter, if the option chosen is 'ovr', then a binary problem is fit for each label ('multinomial' is unavailable when solver='liblinear').
Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and outliers detection. LinearSVC is a class of SVMs capable of performing binary and multi-class classification on a dataset. LinearSVC is another (faster) implementation of Support Vector Classification for the case of a linear kernel.
Neighbors-based classification is a type of instance-based learning or non-generalizing learning: it does not attempt to construct a general internal model, but simply stores instances of the training data. Classification is computed from a simple majority vote of the nearest neighbors of each point: a query point is assigned the data class which has the most representatives within the nearest neighbors of the point.
Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. Some advantages of decision trees are they are simple to understand and to interpret, they can be visualised, and they require little data preparation.
A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. A diverse set of classifiers is created by introducing randomness in the classifier construction. The prediction of the ensemble is given as the averaged prediction of the individual classifiers.
Neural networks are a machine learning algorithm that involves fitting many hidden layers of neurons connected by activation functions. The multi-layer perceptron classifier optimizes the log-loss function using stochastic gradient descent. The default activation function for the hidden layer is 'relu', the rectified linear unit function, which returns f(x) = max(0, x).
For our LogisticRegression model, we set solver='liblinear', since the liblinear algorithm is a good choice for smaller datasets, and multi_class='ovr' to fit a binary problem for each label. For LinearSVC, we set dual=False because our number of samples is much higher than our number of features, and raised max_iter to 2000 so the solver has enough iterations to converge. For our KNeighborsClassifier model, we set the number of neighbors to 3 and otherwise kept the defaults. In our DecisionTreeClassifier, RandomForestClassifier, and MLPClassifier we set random_state=1 to get reproducible results across multiple function calls.
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
classifiers = [
LogisticRegression(solver='liblinear', multi_class='ovr'),
LinearSVC(dual=False, max_iter=2000),
KNeighborsClassifier(3),
DecisionTreeClassifier(random_state=1),
RandomForestClassifier(random_state=1),
MLPClassifier(random_state=1)
]
from sklearn import model_selection
results = []
names = []
for classifier in classifiers:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(classifier, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(classifier)
    msg = "%s: %f (%f)" % (classifier, cv_results.mean(), cv_results.std())
    print(msg)
LogisticRegression(multi_class='ovr', solver='liblinear'): 0.993425 (0.000609)
LinearSVC(dual=False, max_iter=2000): 0.993105 (0.000792)
KNeighborsClassifier(n_neighbors=3): 0.993682 (0.000804)
DecisionTreeClassifier(random_state=1): 0.991607 (0.000997)
RandomForestClassifier(random_state=1): 0.992483 (0.000882)
MLPClassifier(random_state=1): 0.994198 (0.000795)
To compare accuracy between models, we used 10-fold cross-validation to produce 10 accuracy scores per model. These scores were then compared to each other, and it was determined that the MLP classifier outperformed the other classifiers, as seen in the boxplot below.
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names, rotation = 90)
plt.show()
For our project, we are using six different models: logistic regression, linear SVC, k-nearest neighbors, decision tree, random forest, and a multi-layer perceptron.
Using these models, we achieved cross-validation accuracy scores between roughly 99.2% and 99.4%.
We were aiming for scores above 80%, which we exceeded comfortably. One reason our models score so highly is that we limited the data we fed them: when we cleaned the data, we kept only the features age, gender, category, and amount, removing fields that added noise or risked overfitting the model.
We hypothesized that the MLP classifier, a neural network classifier, would perform the best, given that neural networks tend to perform well on a wide range of datasets and data science problems. A neural network consists of connected neurons, or nodes, arranged in layers: an input layer, hidden layers, and an output layer. With our data, the nodes of the input layer correspond to our features: age, gender, category, and amount. Because we used the MLP classifier's default parameter settings, we have one hidden layer with 100 nodes. The MLP classifier optimizes the log-loss function using stochastic gradient descent, and the neurons are connected through activation functions, in this case the rectified linear unit (relu). Having fit six machine learning classifiers to our training data and tested them against the 30% of the data reserved for validation, we needed a way to compare the accuracy of the classifiers' predictions, which we did by implementing ten-fold cross-validation.
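Although f1_score was imported above, the per-model F1 computation is not shown in this report; below is a sketch of how F1 could be computed on the held-out 30% split (note that reporting the binary F1 for the fraud class versus a weighted average will change the numbers considerably):
# fit each classifier on the training split and report F1 on the held-out test split
for clf in classifiers:
    fitted = clf.fit(X_train, y_train)
    y_pred = fitted.predict(X_test)
    print(type(clf).__name__, "F1:", round(f1_score(y_test, y_pred), 4))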
Using our MLP classifier, in the first block of code below we look at a female between 19 and 25 years old purchasing something in the sports and toys category. She is spending €5,000, which our machine learning algorithm flags as fraud. In the second block of code we look at a female in the same age range buying something in the sports and toys category for only €50, which our algorithm does not flag as fraud.
classifier = MLPClassifier(random_state=1)
cls = classifier.fit(X_train, y_train)
# female aged 19-25 spending 5000 euro on sports and toys
y_output = cls.predict([[1, 1, 10, 5000]])
print(y_output)
[1]
# female aged 19-25 spending 50 euro on sports and toys
y_output = cls.predict([[1, 1, 10, 50]])
print(y_output)
[0]
Our project's ML success criteria are met when we can successfully detect outliers in consumers' purchases more than 80% of the time, and all of our models exceeded that threshold. Our project's business success criteria include being able to detect and flag fraudulent transactions; all the models presented meet the business objectives and none were deficient. The MLP classifier produced the best score, with the other models relatively close behind. We believe the MLP classifier worked best with our data because it handles many different data types and non-uniformly distributed data well.
Most, if not all, banks will never have a uniformly distributed dataset. If we wanted to deploy this project, we would show banks how we were able to accurately detect outliers with our limited dataset; banks would only be able to provide a limited dataset so as not to violate client privacy. We are confident that we could connect our machine learning pipeline to a bank's data and detect fraud for that bank, and that if the bank supplied more data variables, the MLP, being a neural network, would still return accurate results and be more than capable of handling the additional data types.
Future work entails performing statistical feature selection, statistical tests of significance on model performance metrics, ANOVA analysis between model results, plotting ROC curves, and applying new datasets to our fitted model to further evaluate its predictions. The more data we feed into our classifier, the more we are able to saturate its learning and improve its predictive power.
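As one concrete example of that future evaluation work, the ROC curve and AUC for the fitted MLP classifier (cls, from the prediction example above) could be produced along these lines (a sketch using the held-out test split):
# ROC curve and AUC for the fitted MLP on the held-out test data
from sklearn.metrics import roc_curve, roc_auc_score
y_score = cls.predict_proba(X_test)[:, 1]  # predicted probability of the fraud class
fpr, tpr, thresholds = roc_curve(y_test, y_score)
print("AUC:", roc_auc_score(y_test, y_score))
plt.plot(fpr, tpr)
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()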