1. Exploratory Data Analysis

In this coursework we are going to be working with the Wine dataset. This is a 178 sample dataset that categorises 3 different types of Italian wine using 13 different features. The code below loads the Wine dataset and selects a subset of features for you to work with.

1.1. Visualising the data

The first part of tackling any ML problem is visualising the data in order to understand some of the properties of the problem at hand. When there are only a small number of classes and features, it is possible to use scatter plots to visualise interactions between different pairings of features.

The following image shows what such a visualisation might look like on the Iris dataset that you worked on during the Topic exercises.

[image: example pairwise feature grid on the Iris dataset]

Your first task is to recreate a similar grid for the Wine dataset, with each off-diagonal subplot showing the interaction between two features, and each of the classes represented as a different colour. The on-diagonal subplots (representing a single feature) should show a distribution (or histogram) for that feature.

You should create a function that, given data X and labels y, plots this grid. The function should be invoked something like this:

myplotGrid(X,y,...)

where X is your training data and y are the labels (you may also supply additional optional arguments). You can use an appropriate library to help you create the visualisation, or you might want to code it yourself using the matplotlib functions scatter and hist; however, this is not strictly necessary here, so try not to spend too much time on it.

Manual Grid plot implementation

I started off coding this grid plot from scratch, which took me a while.
I used the Iris dataset, since the example given was based on Iris, which made it easier to use as a reference.
After struggling for hours, I decided to look for resources; I found the Seaborn library and decided to use it, since its implementation is quite easy and straightforward.

I left the original code here for reference purposes, but as you'll see, I was unable to implement the diagonal charts correctly.
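
For reference, below is a minimal working sketch of the manual approach, using only matplotlib's scatter and hist (a reconstruction under those assumptions, not my original attempt; the optional feature_names argument is my own):

```python
import numpy as np
import matplotlib.pyplot as plt

def myplotGrid(X, y, feature_names=None):
    """Pairwise feature grid: scatter off-diagonal, per-class histogram on-diagonal."""
    X, y = np.asarray(X), np.asarray(y)
    n_features = X.shape[1]
    if feature_names is None:
        feature_names = [f"feature {i}" for i in range(n_features)]
    fig, axes = plt.subplots(n_features, n_features,
                             figsize=(3 * n_features, 3 * n_features))
    for i in range(n_features):
        for j in range(n_features):
            ax = axes[i, j]
            for label in np.unique(y):
                mask = (y == label)
                if i == j:
                    # on-diagonal: distribution of the single feature, one colour per class
                    ax.hist(X[mask, i], alpha=0.5, label=str(label))
                else:
                    # off-diagonal: feature j against feature i, coloured by class
                    ax.scatter(X[mask, j], X[mask, i], s=10, label=str(label))
            if i == n_features - 1:
                ax.set_xlabel(feature_names[j])
            if j == 0:
                ax.set_ylabel(feature_names[i])
    axes[0, 0].legend(title="class")
    plt.tight_layout()
    plt.show()
```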

Seaborn implementation

Below is the working implementation on the Wine data, using the Seaborn library.
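
A minimal sketch of this implementation, assuming the features and labels are wrapped in a pandas DataFrame so seaborn can colour by class (the feature_names argument is my own):

```python
import pandas as pd
import seaborn as sns

def myplotGrid(X, y, feature_names=None, diag_kind="hist"):
    """Pairwise grid via seaborn: scatter off-diagonal, distribution on-diagonal."""
    df = pd.DataFrame(X, columns=feature_names)
    df["class"] = y
    sns.pairplot(df, hue="class", diag_kind=diag_kind)
```

Invoked as myplotGrid(X, y), pairplot draws the scatter grid with per-class histograms (or KDEs, via diag_kind) on the diagonal.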


Let's just test the docstring to see that it works as expected.

Next, let's call the myplotGrid function with the wine data as given.
As we can see below, implementing the distribution chart on the diagonal, which is what I struggled with in my manual implementation, is quite easy and straightforward.

1.2. Exploratory Data Analysis under noise

When data are collected under real-world settings they usually contain some amount of noise that makes classification more challenging. In the cell below, invoke your exploratory data analysis function above on a noisy version of your data X.

Try to perturb your data with some Gaussian noise,

import numpy as np

# initialize random seed to replicate results over different runs
mySeed = 12345
np.random.seed(mySeed)
XN = X + np.random.normal(0, 0.6, X.shape)

and then invoke

myplotGrid(XN,y)

Data with added noise

Below is the grid plot of the wine data with Gaussian noise added using the code above.

Q1. Exploratory data analysis

Based on your exploratory analysis, if you were to build a classifier using only two of the available features, which ones would you choose and why? Answer as fully as you can.

Answer:

I would choose flavanoids and alcohol as the two features to use for classification.

If we look at the scatter plot at row 0, col 1 (or row 1, col 0, which is the same plot mirrored), we can see a clear distinction between the classes along these two features.
These two features would therefore work well when we need to classify the wines into one of the three classes.

Also, when looking at the distribution charts of these two features, we can see that the three classes peak in distinctly different places.
The ash distribution chart is an example of a poor feature for classification, since the three class distributions look very similar.

Q2. Data with noise

What do you observe by plotting the data without noise compared to plotting with added Gaussian noise?

Answer:

Looking at the scatter plots, the data plotted without noise are more tightly clustered by class.
With the noise added, there is a lot more overlap between the different classes, which would make classification more difficult.

Looking at the distribution plots, we see a big difference in the flavanoids.
Without the noise, class 2 had a peak much higher than classes 0 and 1, whereas with the noise, classes 0 and 1 have higher peaks than class 2.

2. Implementing kNN

In the cell below, develop your own code for performing k-Nearest Neighbour classification. You may use the scikit-learn k-NN implementation from the labs as a guide - and as a way of verifying your results - but it is important that your implementation does not use any libraries other than the basic numpy and matplotlib functions.

Define a function that performs k-NN given a set of data. Your function should be invoked similarly to:

    y_ = mykNN(X,y,X_,options)

where X is your training data, y is your training outputs, X_ are your testing data and y_ are your predicted outputs for X_. The options argument (can be a list or a set of separate arguments depending on how you choose to implement the function) should at least contain the number of neighbours to consider as well as the distance function employed.

Hint: it helps to break the problem into various sub-problems, implemented as helper functions. For example, you might want to implement a separate function for calculating the distance between two vectors, and another function that finds the nearest neighbour(s) to a given vector.

Helper functions

The k-Nearest Neighbor algorithm is a supervised machine learning algorithm that uses labelled training data to assign the appropriate label to new, unlabelled data.

There are a few basic steps that need to be taken for the algorithm to work, and these are:

Step 1

Calculate the Euclidean distance between the test point and each point in the training set.

Step 2

Find the nearest neighbors for a given point.
The number of neighbors is given by the parameter k.

Step 3

Classify the point based on the majority class among its neighbors.
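
As a sketch of these three steps (assuming numpy only; the helper names euclidean_distance, nearest_neighbours and majority_vote are my own, not necessarily those used in the notebook):

```python
import numpy as np

def euclidean_distance(a, b):
    """Step 1: Euclidean distance between two feature vectors."""
    return np.sqrt(np.sum((np.asarray(a) - np.asarray(b)) ** 2))

def nearest_neighbours(X, y, x_, k):
    """Step 2: labels of the k training points closest to the test point x_."""
    distances = np.array([euclidean_distance(x, x_) for x in X])
    nearest = np.argsort(distances)[:k]   # indices of the k smallest distances
    return np.asarray(y)[nearest]

def majority_vote(labels):
    """Step 3: the most common label among the neighbours."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]
```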

KNN function

Call the functions in sequence on the data given: run Steps 1, 2 and 3.
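
A minimal sketch of how the three steps might be composed into a single classifier (the mykNN name and signature come from the brief; the default Euclidean distance is my assumption):

```python
import numpy as np

def mykNN(X, y, X_, k=3, distance=None):
    """Predict a label for every test point by running steps 1-3 in sequence."""
    if distance is None:
        distance = lambda a, b: np.sqrt(np.sum((a - b) ** 2))  # default: Euclidean
    X, y, X_ = np.asarray(X), np.asarray(y), np.asarray(X_)
    y_ = []
    for x_ in X_:
        dists = np.array([distance(x, x_) for x in X])     # step 1: distances
        neighbour_labels = y[np.argsort(dists)[:k]]        # step 2: k nearest
        values, counts = np.unique(neighbour_labels, return_counts=True)
        y_.append(values[np.argmax(counts)])               # step 3: majority vote
    return np.array(y_)
```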

Normalizer function

Normalizing the data puts all features on a comparable scale, so that features with large ranges (or outlying values) don't dominate the distance calculation.
We'll see further down how normalizing the data improves the accuracy of the classifier.
There are several ways to normalize data; I've implemented a min-max normalizer function.
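
A minimal sketch of such a function (the name min_max_normalize matches the one mentioned in section 6; it assumes no feature column is constant):

```python
import numpy as np

def min_max_normalize(X):
    """Rescale each feature (column) linearly to the [0, 1] range."""
    X = np.asarray(X, dtype=float)
    col_min = X.min(axis=0)
    col_max = X.max(axis=0)
    # assumes no constant columns, otherwise this divides by zero
    return (X - col_min) / (col_max - col_min)
```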

Run the function

Let's run the function on the wine data and see what predictions we get on the test data.

Sklearn

We have no idea whether the classified output from the implemented function above is correct.
We need to compare the classification output with that of the sklearn KNN classifier.

Let's create a classifier with sklearn and run it on the same data.
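
A minimal sketch of this comparison, assuming the full 13-feature Wine data, a default train/test split and k = 3 (the notebook's actual split and feature subset may differ; mykNN is the scratch function from above):

```python
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12345)

# sklearn's k-NN with the same k and metric as our own function
clf = KNeighborsClassifier(n_neighbors=3, metric="euclidean")
clf.fit(X_train, y_train)
y_sklearn = clf.predict(X_test)

# predictions from the scratch implementation
y_ours = mykNN(X_train, y_train, X_test, k=3)
print(np.array_equal(y_sklearn, y_ours))   # should match, barring tie-break differences
```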

Compare the data

Let's compare the output from the implemented function to that of the sklearn function.

Output vs sklearn

As we can see above, the outputs from the two functions are the same, so we can assume that the implementation is correct.

It should be noted that we ran the functions on the raw data; in other words, no preprocessing was done on the data beforehand.
Let's normalize the data and see if that makes any improvement.

It should also be noted that there are several ways to normalize data, so we'll use the sklearn library to make comparisons with the implemented function.

Normalized data

As we can see above, the results from the sklearn preprocessing library differ from those of the implemented normalize function.

I looked into the way normalization is done in that library and found the solution in the reference given above. The normalize function takes one of three options for its norm parameter: "l1", "l2" and "max".

I tested all three options on the data to see which one gives the best results, and it turns out that "l2" is the option to use.
Based on the above reference, "l2" divides each row by its Euclidean norm, computed as np.sqrt((X * X).sum(axis=1)).
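
As a quick sanity check of that formula (a minimal sketch; the toy matrix is my own):

```python
import numpy as np
from sklearn.preprocessing import normalize

X_demo = np.array([[3.0, 4.0],
                   [1.0, 2.0]])

# manual l2 row normalisation: divide each row by its Euclidean norm
norms = np.sqrt((X_demo * X_demo).sum(axis=1))
X_manual = X_demo / norms[:, np.newaxis]

# sklearn's normalize defaults to norm="l2", so the two should match
print(np.allclose(X_manual, normalize(X_demo, norm="l2")))  # True
```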

Let's continue and see if and how normalizing the data improves the classifier's accuracy.

Normalizing performance

Above we can clearly see that normalizing the data before running it through the KNN classifier has a benefit.
The accuracy has increased from 0.86 to 0.94.

Let's again make a comparison with the sklearn classifier and see if the results from the implemented function are the same.

Sklearn validation

Above we can see that the results correspond with the output from the sklearn classifier.

3. Classifier evaluation

In the cell below, implement your own classifier evaluation code. This should include some way of calculating confusion matrices, as well as common metrics like accuracy.

Write some additional code that lets you display the output of your confusion matrices in a useful and easy-to-read manner.

You might want to test your functions on some test data, and compare the results to the sklearn library versions.

Accuracy score

The accuracy score is based on the number of correctly predicted results in comparison to the actual target values.
We simply count the number of correct predictions and divide it by the total number of predictions.
This gives us a value between 0 and 1.
We could, if we wanted, multiply that number by 100 to get a percentage.
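
A minimal sketch of such a function, assuming numpy only (the name accuracy_score_scratch is my own, chosen to mirror the *_scratch naming used later):

```python
import numpy as np

def accuracy_score_scratch(y_true, y_pred):
    """Fraction of predictions that match the true labels, between 0 and 1."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.sum(y_true == y_pred) / len(y_true)
```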

Run the accuracy function

Let's run the accuracy function above on the data and see what we get.
We'll also run the sklearn accuracy_score function on the same data so that we can validate the results.

Accuracy conclusion

We can see from the results above that they are the same, thus we can safely assume that the implemented function is correct.

Confusion Matrix

Let's start by first using the confusion matrix from the sklearn library.
This way we can see what the results should be, and we can work backwards from there to implement a function with the same output.

Confusion matrix from scratch

In order to duplicate the confusion matrix above, we're going to create a 3 x 3 matrix and use the label values as the bin positions.
We'll count the label pairs and add them to the correct bin.
The true labels correspond to the rows of the matrix and the predicted labels correspond to the columns.
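
A minimal sketch of this construction (assuming the labels are the integers 0-2, as in the Wine dataset; the actual notebook implementation may differ):

```python
import numpy as np

def confusion_matrix_scratch(y_true, y_pred, n_labels=3):
    """3 x 3 matrix: rows are true labels, columns are predicted labels."""
    matrix = np.zeros((n_labels, n_labels), dtype=int)
    for t, p in zip(y_true, y_pred):
        matrix[t, p] += 1   # the label values themselves are the bin positions
    return matrix
```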

We can see from the output above that the matrix returned from the function is correct.
Let's plot the matrix.

Precision score

Precision for Multi-Class Classification

In an imbalanced classification problem with more than two classes, precision is calculated as the sum of true positives across all classes divided by the sum of true positives and false positives across all classes.

$$\text{Precision} = \frac{\sum_{c \in C} \text{TruePositives}_c}{\sum_{c \in C} \left( \text{TruePositives}_c + \text{FalsePositives}_c \right)}$$

Example:

A model makes predictions and predicts 70 examples for the first minority class, where 50 are correct and 20 are incorrect.
It predicts 150 for the second class, with 99 correct and 51 incorrect. Precision can be calculated for this model as follows:

$$\text{Precision} = \frac{50 + 99}{(50 + 20) + (99 + 51)} = \frac{149}{220} \approx 0.677$$

This example is similar to the sklearn micro method.

Reference:

Machinelearningmastery.com

sklearn library

The sklearn library can be used with five different options for the average parameter.

If None, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data:

- 'binary': only report results for the class specified by pos_label (applicable only if the targets are binary).
- 'micro': calculate metrics globally by counting the total true positives, false negatives and false positives.
- 'macro': calculate metrics for each label and find their unweighted mean; this does not take label imbalance into account.
- 'weighted': calculate metrics for each label and find their average weighted by support (the number of true instances per label).
- 'samples': calculate metrics for each instance and find their average (only meaningful for multilabel classification).

Reference:

scikit-learn.org

Let's run the precision score on the data and see what the results are.

Above we can see that the micro option returns the same result as the accuracy score.
With the 'micro' option, the metric is calculated globally by counting the total true positives, false negatives and false positives.

Let's look at the macro option.

From the score above, it's difficult to see how that method arrives at its answer.

Let's use the None option and see if we can get more information by looking at the score per label.

Above we can see that the score is calculated per column, which corresponds to the predicted labels.
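
A minimal sketch of what a column-based precision_score_scratch might look like, assuming it operates on the confusion matrix from above (the notebook's actual implementation may differ):

```python
import numpy as np

def precision_score_scratch(matrix):
    """Per-class precision: each diagonal entry divided by its column total,
    i.e. true positives over everything predicted as that class."""
    matrix = np.asarray(matrix)
    return np.diag(matrix) / matrix.sum(axis=0)
```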

Let's compare the precision scores between the implemented function and the sklearn function to ensure the result is correct.

Recall score

sklearn library

This parameter is required for multiclass/multilabel targets. If None, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data; the options are the same as for the precision score above.

Reference:

scikit-learn.org

Let's print out the recall score on the same data and see what we get.

The recall score seems to be very similar to the precision score, but the calculation is done over the true labels, as opposed to the predicted labels.
Let's print out the score again with the None option for the average parameter.

Above we can see that the calculation is based on the true labels, which correspond to the matrix rows.
Let's implement the function.

I'm going to use the precision_score_scratch function as a baseline and just make the required modifications: basically, change the calculation from column-based to row-based.
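
Under the same assumptions as the precision sketch above, the row-based change might look like this:

```python
import numpy as np

def recall_score_scratch(matrix):
    """Per-class recall: each diagonal entry divided by its row total,
    i.e. true positives over everything that truly belongs to that class."""
    matrix = np.asarray(matrix)
    return np.diag(matrix) / matrix.sum(axis=1)
```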

Let's compare the recall scores between the implemented function and the sklearn function to ensure the result is correct.

4. Nested Cross-validation using your implementation of KNN

In the cell below, develop your own code for performing 5-fold nested cross-validation along with your implementation of k-NN above. You must write your own code -- the scikit-learn module may only be used for verification purposes.

Your code for nested cross-validation should invoke your kNN function (see above). Your cross-validation function should be invoked similarly to:

accuracies_fold = myNestedCrossVal(X,y,5,list(range(1,11)),['euclidean','manhattan'],mySeed)

where X is your data matrix (containing all samples and features for each sample), y are your known output labels, 5 is the number of folds, list(range(1,11)) evaluates the neighbour parameter from 1 to 10, and ['euclidean','manhattan',...] evaluates the distances on the validation sets. mySeed is simply a random seed to enable us to replicate your results.

Notes:

Nested cross validation implementation

The data will be separated according to the number of splits/folds chosen: for instance, if 4 is chosen, the data will be separated into 4 folds with a 75% / 25% split, where 75% is used as the training set and 25% as the testing set.

We'll add the accuracy scores of the folds together and then divide that cumulative sum by the number of folds to give us the average accuracy score.
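
A minimal sketch of the overall structure, assuming the mykNN function from section 2 accepts a distance callable (the 80/20 inner validation split and the distance lookup table are my own assumptions; the real implementation may differ):

```python
import numpy as np

# distance functions looked up by the names passed in
DISTANCES = {
    "euclidean": lambda a, b: np.sqrt(np.sum((a - b) ** 2)),
    "manhattan": lambda a, b: np.sum(np.abs(a - b)),
}

def myNestedCrossVal(X, y, n_folds, k_values, distance_names, seed):
    """Outer loop estimates test accuracy; the inner loop picks k and the
    distance metric on a validation split carved out of the training folds."""
    X, y = np.asarray(X), np.asarray(y)
    rng = np.random.RandomState(seed)
    folds = np.array_split(rng.permutation(len(X)), n_folds)

    accuracies_fold = []
    for i in range(n_folds):
        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(n_folds) if j != i])

        # inner split: hold out 20% of the outer training data for validation
        cut = int(0.8 * len(train_idx))
        inner_train, inner_val = train_idx[:cut], train_idx[cut:]

        best_acc, best_k, best_dist = -1.0, None, None
        for name in distance_names:
            for k in k_values:
                y_val = mykNN(X[inner_train], y[inner_train], X[inner_val],
                              k=k, distance=DISTANCES[name])
                acc = np.mean(y_val == y[inner_val])
                if acc > best_acc:
                    best_acc, best_k, best_dist = acc, k, name

        # retrain on the full outer training set with the chosen parameters
        y_test = mykNN(X[train_idx], y[train_idx], X[test_idx],
                       k=best_k, distance=DISTANCES[best_dist])
        accuracies_fold.append(np.mean(y_test == y[test_idx]))
    return accuracies_fold
```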

Run the N-Fold function on the wine data

Below we'll run the function on the wine data and target values, using 5 as the number of folds/splits.
This results in 5 folds, each split into 80% training data and 20% testing data.

You'll note in the code above that quite a few lines are commented out.
I ran a number of tests to see which methods produce the best results.
I found that using the sklearn normalizing function with the "l2" option produced the best results, and that's the one I ended up using.

Now that we've called the function above and have all our dataframes, we simply need to print them out to get detailed information on each fold and k value.
The dataframes are constructed in such a way that the columns represent the folds and the rows represent the k values.
The last column of the dataframe, "avg", is the average value across the folds.

N-Fold function with noise induced

Below we're going to add noise to the wine dataset before running it through the nested kNN function.
The same code as provided earlier will be used to generate the noise.

Results

As we can see above, the accuracy deteriorated quite significantly.
With the clean wine data we found the best accuracy to be 0.848889, achieved with k = 1, whereas with the noise-induced data the best accuracy was 0.623968, achieved with k = 4.

5. Summary of results

Using your results from above, fill out the following table using the clean data:

| Fold | Accuracy | k | Distance |
|------|----------|---|----------|
| 1 | 0.833333 | 1, 2, 5, 9 & 10 | ? |
| 2 | 0.861111 | 1-4 | ? |
| 3 | 0.750000 | 1, 2 & 4 | ? |
| 4 | 0.885714 | 4 | ? |
| 5 | 0.942857 | 1 & 2 | ? |
| total | 0.854603 $\pm$ 0.06355032244 | | |

Where total is given as an average over all the folds, and $\pm$ the standard deviation.

Now fill out the following table using the noisy data:

| Fold | Accuracy | k | Distance |
|------|----------|---|----------|
| 1 | 0.555556 | 1 & 2 | ? |
| 2 | 0.722222 | 3, 4 & 6 | ? |
| 3 | 0.583333 | 1, 2, 6, 7 & 8 | ? |
| 4 | 0.742857 | 4 | ? |
| 5 | 0.600000 | 3 | ? |
| total | 0.6407936 $\pm$ 0.07652294918 | | |

I'm not exactly sure what the expectation is for the completion of the tables.
Since we have 5 fold accuracies for every k, that could mean there are several tables to be completed, not just one.

Since only one table is requested for the clean data and one for the noisy data, I'll continue on the basis that the best score for each fold should be recorded, along with the corresponding number of nearest neighbours, k. There are instances where multiple values of k resulted in the same highest score; all of them are recorded in the table under the column k.

The tables above were completed based on this assumption.

5.2. Confusion matrix summary

Summarise the overall results of your nested cross validation evaluation of your K-NN algorithm using two summary confusion matrices (one for the noisy data, one for the clean data). You might want to adapt your myNestedCrossVal code above to also return a list of confusion matrices.

Use or adapt your evaluation code above to print the two confusion matrices below. Make sure you label the matrix rows and columns. You might also want to show class-relative precision and recall.

6. More questions

Now answer the following questions as fully as you can. The answers should be based on your implementation above. Write your answers in the Markdown cells below each question.

Q3. Influence of noise

Do the best parameters change when noise is added to the data? Can you say that one parameter choice is better regardless of the data used?

Answer:

As we can see from the dataframes printed under section 4, there is a significant difference in the performance of the classifier. We could already see in section 1, from the scatter and distribution plots, how the data were influenced by the noise, with a visible increase in the overlap between the classes.

From the printed data the effect became clear: the best parameters do change when noise is added (the best accuracy on the clean data was obtained with k = 1, whereas on the noisy data it was obtained with k = 4), so no single parameter choice can be said to be better regardless of the data used.

Accuracy is the metric I prefer to use, and it is also the one that seems to have been affected the most by the noisy data.

Q4. Tie break

Assume that you have selected the number of neighbours to be an even number, e.g., 2. For one of the neighbours, the suggested class is 1, and for the other neighbour the suggested class is 2. How would you break the tie? Write example pseudocode that does this.

Answer:

There are a couple of options to choose from for resolving a tie, for example:

- choosing one of the tied classes at random;
- weighting the neighbours by their distance to the query point;
- reducing k (dropping the furthest neighbour) until the tie is broken.

Choosing a random one would be the easiest, but since that would not take the distances into consideration, I feel it's not the best solution.
If pnt1 is 1 unit away and pnt2 is 2 units away from the point to be classified, pnt3, should pnt1 not have a greater weight than pnt2, since it's closer to pnt3?

In step 2, the nearest_neighbors function, we're already sorting the dataframe by distance and then choosing the closest ones based on the number specified.

If it turns out to be a tie, simply ignore the least-weighted (furthest) point; in the case of the sorted neighbour list, you would change the voting step along the following lines:
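
(A sketch only; majority_vote_with_tiebreak is my own name, and it assumes the neighbour labels arrive sorted nearest-first, as produced by the sorted dataframe.)

```python
import numpy as np

def majority_vote_with_tiebreak(sorted_labels):
    """sorted_labels: neighbour labels ordered nearest-first.
    On a tied vote, drop the furthest neighbour and vote again."""
    labels = list(sorted_labels)
    while labels:
        values, counts = np.unique(labels, return_counts=True)
        winners = values[counts == counts.max()]
        if len(winners) == 1:
            return winners[0]   # unique majority found
        labels.pop()            # discard the least-weighted (furthest) point
    return None
```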

This method is essentially the same as changing k, but based on the weight of the points.

Q5. Beyond Wine

If you were to run your k-NN algorithm on a new dataset (e.g., the breast cancer dataset, or Iris), what would you need to take into consideration? Outline any changes that might be needed to your code.

Answer:

From an analytical point of view, you would need to take into consideration which features you want to use.
On the Iris dataset that would not be too difficult, since you only have four features, so you'd most likely use all of them; but with the breast cancer dataset you have around 30 features, with many possible combinations, all of which would give different results and accuracy.
Thus, the data would have to be analysed, and you'd need to decide which features are best suited to the desired outcome.

As far as the code is concerned, no changes are required. I have tested the same code on the Iris dataset and it worked as expected. The only concern would be the implemented min_max_normalize function: during testing I got radically different results when using this function in comparison with the sklearn normalize function.

Another place where changes might be required is the implemented confusion matrix, precision and recall functions.
These functions are currently written to produce a 3 x 3 matrix, for datasets with 3 labels.
If you use a dataset with more than 3 labels, modifications or optimisations to these functions would have to be made.
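
For example, a label-agnostic version of the confusion matrix might look like this sketch (my own generalisation, not the notebook's code; the per-class precision and recall functions above would then work unchanged on the resulting n x n matrix):

```python
import numpy as np

def confusion_matrix_scratch(y_true, y_pred, labels=None):
    """n x n confusion matrix for an arbitrary label set, not just 3 classes."""
    if labels is None:
        labels = np.unique(np.concatenate([y_true, y_pred]))
    index = {label: i for i, label in enumerate(labels)}
    matrix = np.zeros((len(labels), len(labels)), dtype=int)
    for t, p in zip(y_true, y_pred):
        matrix[index[t], index[p]] += 1   # rows: true labels, cols: predicted
    return matrix
```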