Click here to Skip to main content
15,999,481 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
See more: , +
Below is an example of classification, in which we take both X and y:
X = dataset.iloc[:,[2,3]].values
y = dataset.iloc[:,4].values 


Whereas in clustering we take only X:
X = dataset.iloc[:,[3,4]].values



Full Code for reference purpose

Python
# Decision Tree Classification

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset.
# Classification is supervised, so we need both the inputs X (columns 2 and 3)
# and the target labels y (column 4).
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into a training set (75%) and a test set (25%).
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature scaling.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the mean/std from the training data only ...
X_train = sc.fit_transform(X_train)
# ... and apply those same statistics to the test data. Calling fit_transform
# here would refit the scaler on the test set, so the test features would be
# scaled differently from the data the classifier was trained on.
X_test = sc.transform(X_test)

# Fitting the classifier to the training set.
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the test-set results.
y_pred = classifier.predict(X_test)

# Making the confusion matrix (true labels first, predictions second).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the training set and the test set.
from matplotlib.colors import ListedColormap


def _plot_decision_regions(model, X_set, y_set, title, region_colors, point_colors):
    """Plot a fitted classifier's decision regions with samples drawn on top.

    Parameters:
        model: fitted classifier; its ``predict`` is called on every grid point.
        X_set: 2-column feature array; column 0 is the x axis, column 1 the y axis.
        y_set: label array aligned with the rows of ``X_set``.
        title: title of the figure.
        region_colors: colours handed to ``ListedColormap`` for the filled regions.
        point_colors: one scatter colour per unique label, indexed in the
            order returned by ``np.unique(y_set)``.
    """
    # Build a dense grid covering the data range, padded by 1 on every side.
    x_axis = np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01)
    y_axis = np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01)
    X1, X2 = np.meshgrid(x_axis, y_axis)

    # Classify every grid point, then fold the predictions back into grid shape
    # so they can be drawn as filled contours.
    grid_points = np.array([X1.ravel(), X2.ravel()]).T
    grid_labels = model.predict(grid_points).reshape(X1.shape)
    plt.contourf(X1, X2, grid_labels, alpha=0.75, cmap=ListedColormap(region_colors))

    # Overlay the actual samples, one scatter series per class.
    for i, j in enumerate(np.unique(y_set)):
        plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                    c=point_colors[i], label=j)

    plt.legend()
    plt.title(title)
    plt.xlabel('Age')
    plt.ylabel('Estimated_salary')
    plt.show()


# Training set
_plot_decision_regions(classifier, X_train, y_train, 'decision_tree(Training set)',
                       region_colors=('red', 'green'), point_colors=['blue', 'black'])

# Test set
_plot_decision_regions(classifier, X_test, y_test, 'decision_tree(Test set)',
                       region_colors=('blue', 'black'), point_colors=['red', 'green'])



# K-Means Clustering

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset.
# Clustering is unsupervised: only the feature matrix X (columns 3 and 4) is
# extracted, and no label vector y is needed.
dataset = pd.read_csv("mall_customers.csv")
X = dataset.iloc[:, [3, 4]].values

# Using the elbow method to find the optimal number of clusters.
from sklearn.cluster import KMeans

# inertia_ is the within-cluster sum of squares (WCSS) of a fitted model;
# collect it for every candidate cluster count from 1 to 10.
wcss = [
    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
    .fit(X)
    .inertia_
    for k in range(1, 11)
]

plt.plot(range(1, 11), wcss)
plt.title("The elbow method")
plt.xlabel("Number of cluster")
plt.ylabel('Wcss')
plt.show()

# Applying k-means to the mall dataset with 5 clusters.
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
# fit_predict returns one cluster index per row of X.
y_kmeans = kmeans.fit_predict(X)

# Visualising the clusters.
# Each entry is (colour, legend label) for the cluster index at that position.
cluster_styles = (
    ('red', 'Cluster 1'),
    ('blue', 'Cluster 2'),
    ('green', 'Cluster 3'),
    ('cyan', 'Cluster 4'),
    ('magenta', 'Cluster 5'),
)
for cluster_index, (colour, legend_label) in enumerate(cluster_styles):
    # Boolean mask picks the rows assigned to this cluster; column 0 gives the
    # x coordinates and column 1 the y coordinates.
    members = X[y_kmeans == cluster_index]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=legend_label)

# Same idea for the centroids, drawn larger so they stand out.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroids')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()


What I have tried:

I have read many articles, but none of them clarified my doubts.
Posted
Updated 20-May-21 5:51am
v2

1 solution

There are many different types of algorithms in machine learning. Among others, you have supervised, unsupervised and reinforcement learning tasks.
Each of these types has different input and output requirements.
Classification and regression are SUPERVISED tasks; clustering is an UNSUPERVISED one.

Supervised tasks require examples to learn from, so each data point (X) needs to have a corresponding label (y) during training. It learns how to generate y output for each X input.

An unsupervised task, such as clustering, doesn't need labels (so y is not required), because its goal is to learn how to group similar items together on its own (without "supervision").
 
Share this answer
 

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900