-------------------------------- CUSTOMER SEGMENTATION -------------------------------------

Libraries

In [ ]:
# Progress bars, e.g.: for i in tqdm(range(10)):
from tqdm import tqdm

# Matrix math
import numpy as np

# Data visualization
import matplotlib.pyplot as plt

# Used to save smaller parts of the data in pickle files
import pickle

# Used to create sparse matrices
import scipy
from scipy.sparse import csr_matrix, vstack

import pandas as pd

# Garbage collector
#import gc

# k-means
from sklearn.cluster import KMeans

# Euclidean distance
#from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances

# Multidimensional Scaling
from sklearn.manifold import MDS

from random import randint

Auxiliary functions:

In [ ]:
# Load an object from a pickle file
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
In [ ]:
# Save an object to a pickle file
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
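For reference, a quick round trip with these helpers (the object and file name below are illustrative only):
In [ ]:
# Example round trip with the pickle helpers (made-up data)
save_obj({"a": 1, "b": 2}, "example_dict")
restored = load_obj("example_dict")
print(restored)  # {'a': 1, 'b': 2}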

Project

In [ ]:
"""    MAIN FUNCTION     """
# Import the user-packages dataset; each line contains a user id followed by the packages he has installed
# format: "id","packages"
dataset = import_file_as_array(r'/home/pedros/pCloudDrive/Projeto/K-means project/dataset.csv')

# Split each packages string into a list
# format: "id" ["package1", "package2", ..., "packagen"]
dataset = split_packages(dataset)

sparse_matrix = create_sparse_matrix(dataset)  # Create the sparse matrix
save_obj(sparse_matrix, "sparse_matrix")

# Constants
SAMPLE_SIZE = 3000
N = 3
# Create N Euclidean distance matrices from N random samples
Euclidean_matrix(sparse_matrix, SAMPLE_SIZE, N)

# Load one Euclidean matrix
matrix_name = "euclidean_matrix1"
euclidean_matrix = load_obj(matrix_name)

#elbow = get_MDS_elbow(euclidean_matrix, 0, euclidean_matrix.shape[0])
elbow = 217

mds_matrix = MultidimensionalSC(euclidean_matrix, elbow)

#n_clusters = kmeans_elbow(mds_matrix)  # To improve: high probability of error
n_clusters = 6
k_means(n_clusters, mds_matrix)

kmeans_matrix = load_obj("kmeans_matrix")

-------------------------------------A. Pre-processing----------------------------------------------

A.1. Data Gathering

Import Data
In [ ]:
def import_file_as_array(location):
    """Import a file, split it into lines and return them as a list."""
    with open(location) as f:
        lines = f.read().splitlines()
    # Drop the header line
    lines = lines[1:]
    return lines

A.2. Feature Extraction

Convert Data
In [ ]:
def split_packages(lista):
    """Input:  lista - list of lines, each in the format "id","package1,package2,...,packagen"
       Output: list in which each entry is the list of packages installed by one user"""
    # List in which each user's packages will be stored
    df = []

    # Iterate over the lines of lista
    for line in lista:
        # Split the line into: uid - id of the client; pkgs - string of all packages installed by the uid user
        uid, pkgs = line.split(',', 1)
        # Split the string into a list of packages installed by the uid user
        pkgs = pkgs.replace('"', '').split(',')
        # Store the list of packages belonging to the uid user
        df.append(pkgs)

    return df
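A quick sanity check with a made-up line in the expected CSV format:
In [ ]:
# Illustrative input (made-up data)
sample = ['42,"com.example.mail,com.example.maps"']
print(split_packages(sample))  # [['com.example.mail', 'com.example.maps']]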
User-App Matrix M:
In [ ]:
def create_sparse_matrix(info):
    """Build the user-app matrix M in CSR format: one row per user, one column per package."""
    indptr = [0]
    indices = []
    data = []
    vocabulary = {}
    for d in info:
        for term in d:
            # Assign each package a column index the first time it is seen
            index = vocabulary.setdefault(term, len(vocabulary))
            indices.append(index)
            data.append(1)
        # Close the current row
        indptr.append(len(indices))

    sparse_matrix = csr_matrix((data, indices, indptr), dtype=int)
    return sparse_matrix
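For intuition, a minimal run on made-up data (two users, three distinct packages):
In [ ]:
# Toy example (made-up data)
toy = [['app.a', 'app.b'], ['app.b', 'app.c']]
M = create_sparse_matrix(toy)
print(M.toarray())
# [[1 1 0]
#  [0 1 1]]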

A.3. Feature Transformation:

Euclidean distance
In [ ]:
def Euclidean_matrix(sparse_matrix, size, num):
    """size = size of each sample
       num  = number of samples to create"""

    total = num * size

    # Generate num*size different random row indices
    rnd = np.random.choice(sparse_matrix.shape[0], size=total, replace=False)

    # Create num different Euclidean distance matrices, each over size users
    for i in range(num):

        # Select size sparse rows (users)
        indices = rnd[i * size:(i + 1) * size]
        lines = [sparse_matrix[index] for index in indices]

        sub_set = None

        # Concatenate the sparse rows
        for line in lines:
            if sub_set is None:
                sub_set = line
            else:
                sub_set = vstack([sub_set, line])

        # Pairwise Euclidean distances (practical limit around 10k rows)
        similarities = euclidean_distances(sub_set)

        # Save the Euclidean matrix
        name = "euclidean_matrix" + str(i)
        save_obj(similarities, name)
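As a design note, csr_matrix supports fancy row indexing, so the row-by-row vstack loop above could be replaced by a single slice; a minimal sketch on a toy matrix:
In [ ]:
# Sketch: select all sampled rows at once instead of vstack-ing one by one
demo = csr_matrix(np.eye(5, dtype=int))
idx = np.array([3, 0, 4])
print(demo[idx].toarray())  # rows 3, 0 and 4, stacked in that order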

B. Analytical Processing

B.1. Multidimensional Scaling

In [ ]:
def get_MDS_elbow(euclidean_matrix, min_val, max_val):
    stress = []
    seed = np.random.RandomState(seed=3)
    # Fit an MDS embedding for each candidate dimensionality and record its stress
    for i in tqdm(range(min_val, max_val)):
        mds = MDS(n_components=i, max_iter=300, eps=1e-6, random_state=seed, dissimilarity="precomputed", n_jobs=1)
        stress.append(mds.fit(euclidean_matrix).stress_)
    # Do not reuse the name of the elbow() helper for the result
    elbow_val = elbow(stress)
    save_obj(stress, "stress")
    return elbow_val

## Auxiliary functions:

# Find the elbow of a curve: for each point, compare the normalized slope of
# the segment after it with the slope of the segment before it, and pick the
# point where the change in slope is largest
def elbow(x):
    max_val = [0, 0]
    for i in range(1, len(x) - 1):
        sin_m1 = (x[i+1] - x[i]) / (((x[i+1] - x[i])**2 + 1)**(1/2))
        sin_p1 = (x[i] - x[i-1]) / (((x[i] - x[i-1])**2 + 1)**(1/2))
        dsin = sin_m1 - sin_p1
        if dsin > max_val[1]:
            max_val[1] = dsin
            max_val[0] = i
    return max_val[0]
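A small synthetic check of the detector (made-up curve; for this input the slope change is largest at index 3):
In [ ]:
# Synthetic check of the elbow detector (made-up curve)
curve = [100, 50, 30, 28, 27, 26]
print(elbow(curve))  # 3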
    
In [ ]:
# Multidimensional Scaling with the discovered elbow
def MultidimensionalSC(euclidean_matrix, elbow):
    seed = np.random.RandomState(seed=3)
    mds = MDS(n_components=elbow, max_iter=3000, eps=1e-9, random_state=seed, dissimilarity="precomputed", n_jobs=-1)
    mds_matrix = mds.fit(euclidean_matrix).embedding_
    return mds_matrix

B.2. K-means

In [ ]:
def kmeans_elbow(mds_matrix):
    inertia = []
    inertia.append(0)
    # Use the inertia at k=20 as a baseline so the curve tends to zero
    normalizer = KMeans(20, random_state=0).fit(mds_matrix).inertia_
    for i in range(1, 20):
        cluster = KMeans(i, random_state=0)
        inertia.append(cluster.fit(mds_matrix).inertia_ - normalizer)
    return elbow_kmeans_aux(inertia)

## Auxiliary functions
# Variant of elbow() that also looks two points ahead/behind and requires the
# curve to be decreasing at the candidate point
def elbow_kmeans_aux(x):
    max_val = [0, 0, 0]
    for i in range(2, len(x) - 2):
        sin_m1 = (x[i+1] - x[i]) / (((x[i+1] - x[i])**2 + 1)**(1/2))
        sin_m2 = (x[i+2] - x[i]) / (((x[i+2] - x[i])**2 + 1)**(1/2))
        sin_p1 = (x[i] - x[i-1]) / (((x[i] - x[i-1])**2 + 1)**(1/2))
        sin_p2 = (x[i] - x[i-2]) / (((x[i] - x[i-2])**2 + 1)**(1/2))
        dsin = sin_m1 - sin_p1
        dsin2 = sin_m2 - sin_p2
        if dsin > max_val[1] and x[i] < x[i-1] and dsin2 > max_val[2]:
            max_val[1] = dsin
            max_val[2] = dsin2
            max_val[0] = i
    return max_val[0]
In [ ]:
# Choose k=6
def k_means(n_clusters, mds_matrix):
    kmeans = KMeans(n_clusters, random_state=0, n_jobs=-1)
    kmeans_matrix = kmeans.fit_predict(mds_matrix)
    save_obj(kmeans_matrix, "kmeans_matrix")
    return kmeans_matrix

------------------------------------------ Visualization: -------------------------------------------------

User-Packages Matrix: Histograms

Pre-processing (only used to visualize data):
In [ ]:
# Optional: visualize the data, one package per line
# (assumes df is a DataFrame with 'id' and 'packages' columns)
df_h = df.copy()
rows = []
for i, row in df_h.iterrows():
    for a in row.packages:
        rows.append([row.id, a])

extended_df = pd.DataFrame(rows, columns=['id', 'packages'])
extended_df
In [ ]:
id_count = len(extended_df["id"].drop_duplicates())
packages_count = len(extended_df["packages"].drop_duplicates())
print("clients:", id_count)
print("packages:", packages_count)
print("resulting matrix:", id_count * packages_count)
Number of packages per client
In [ ]:
# Get the number of packages per client
df1 = extended_df.copy()
df_id = df1.groupby(['id']).size().reset_index().rename(columns={0: 'count'})
apps = df_id['count'].copy()

count_max = apps.max()

# Definition of the range of the histogram
bins = [x for x in range(0, 600, 10)]
# Creation of the histogram
plt.hist(apps, bins, histtype="bar", rwidth=0.9)

plt.xlabel('number of apps')
plt.ylabel('clients')
plt.title('Distribution of clients (300 x 6000)')
plt.axis([0, 300, 0, 6000])
plt.grid(True)

plt.show()

# Same histogram with a smaller y-range, in order to see more detail
bins2 = [x for x in range(0, 600, 10)]

plt.hist(apps, bins2, histtype="bar", rwidth=0.9)

plt.xlabel('number of apps')
plt.ylabel('clients')
plt.title('Distribution of clients (300 x 100)')
plt.axis([0, 300, 0, 100])
plt.grid(True)

plt.show()
"On average, 1,500 different clients installed 10 or fewer different apps"
In [ ]:
# Get the number of clients per package
df_id = df1.groupby(['packages']).size().reset_index().rename(columns={0: 'count'})
clients = df_id['count'].copy()

count_max = clients.max()

# Definition of the range of the histogram
bins = [x for x in range(0, 1000, 10)]
# Creation of the histogram
plt.hist(clients, bins, histtype="bar", rwidth=0.9, color='r')

plt.xlabel('clients')
plt.ylabel('apps')
plt.title('Distribution of packages (600 x 80 000)')
plt.axis([0, 600, 0, 80000])
plt.grid(True)

plt.show()

# Same histogram with a smaller y-range, in order to see more detail
bins2 = [x for x in range(0, 1000, 10)]

plt.hist(clients, bins2, histtype="bar", rwidth=0.9, color='r')

plt.xlabel('clients')
plt.ylabel('apps')
plt.title('Distribution of packages (600 x 1 000)')
plt.axis([0, 600, 0, 1000])
plt.grid(True)

plt.show()
"On average, 65,000 apps were installed by only 10 or fewer different clients"

MDS, elbow values

In [ ]:
# Plot the MDS stress per number of components
x_vals = np.arange(len(stress))
plt.plot(x_vals, stress, 'ro')
plt.ylabel("stress")
plt.axis([0, 700, 6000, 100000])
plt.show()

K-means, elbow values

In [ ]:
# Plot the k-means inertia per number of clusters
x_vals = np.arange(len(inertia))
plt.plot(x_vals, inertia, 'ro')
plt.ylabel("inertia")
plt.axis([0, 20, 0, 7000])
plt.show()

--------------------------------------------- Extra: ------------------------------------------------------

Extra: Optimize User-Matrix:

E.1) Auxiliary functions:

In [ ]:
# Remove row i from a LIL sparse matrix in place
def delete_row_lil(mat, i):
    if not isinstance(mat, scipy.sparse.lil_matrix):
        raise ValueError("works only for LIL format -- use .tolil() first")
    mat.rows = np.delete(mat.rows, i)
    mat.data = np.delete(mat.data, i)
    mat._shape = (mat._shape[0] - 1, mat._shape[1])
In [ ]:
# Remove row i from a CSR sparse matrix in place by shifting the
# data/indices arrays and fixing up indptr
def delete_row_csr(mat, i):
    if not isinstance(mat, scipy.sparse.csr_matrix):
        raise ValueError("works only for CSR format -- use .tocsr() first")
    n = mat.indptr[i+1] - mat.indptr[i]  # number of nonzeros in row i
    if n > 0:
        mat.data[mat.indptr[i]:-n] = mat.data[mat.indptr[i+1]:]
        mat.data = mat.data[:-n]
        mat.indices[mat.indptr[i]:-n] = mat.indices[mat.indptr[i+1]:]
        mat.indices = mat.indices[:-n]
    mat.indptr[i:-1] = mat.indptr[i+1:]
    mat.indptr[i:] -= n
    mat.indptr = mat.indptr[:-1]
    mat._shape = (mat._shape[0]-1, mat._shape[1])
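A quick check of delete_row_csr on a toy matrix (made-up data):
In [ ]:
# Toy check: delete the middle row of a 3x2 matrix
m = csr_matrix(np.array([[1, 0], [0, 2], [3, 0]]))
delete_row_csr(m, 1)
print(m.toarray())
# [[1 0]
#  [3 0]]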

E.2) Mean values and Standard Deviation:

In [ ]:
# Column sums = number of users per package; row sums = number of packages per user
column_sum = sparse_matrix.sum(axis=0)
row_sum = sparse_matrix.sum(axis=1)

# Mean values
mean_num_pack = np.mean(column_sum)
mean_num_users = np.mean(row_sum)

# Standard deviation
std_pack = np.std(column_sum)
std_users = np.std(row_sum)
In [ ]:
print("Average number of users per package:", mean_num_pack)
print("Average number of packages per user:", mean_num_users)
print("")
print("Standard deviation of users per package:", std_pack)
print("Standard deviation of packages per user:", std_users)
In [ ]:
# Select users to remove: keep only users with more than 1 package
# and at most mean + 2*std packages
max_limit = mean_num_users + 2 * std_users
users_to_remove = []
for i in range(0, len(row_sum)):
    a = row_sum[i]
    if not 1 < a <= max_limit:
        users_to_remove.append(i)
In [ ]:
# Select packages to remove: keep only packages with more than 1 user
# and at most mean + 3*std users
max_limit = mean_num_pack + 3 * std_pack
packs_to_remove = []
for i in range(0, column_sum.shape[1]):
    a = column_sum[0, i]
    if not 1 < a <= max_limit:
        packs_to_remove.append(i)
In [ ]:
print("num users:", len(row_sum))
print("num packages:", column_sum.shape[1])
print("num users removed:", len(users_to_remove))
print("num packages removed:", len(packs_to_remove))
In [ ]:
# Sanity check: the bitwise AND of a row with itself returns the row,
# so the sum gives that user's package count
aa = np.bitwise_and(aux_matrix[0].toarray(), aux_matrix[0].toarray())
print(aa)
np.sum(aa)
In [ ]:
%%timeit
print("before:", aux_matrix.shape)
# Remove users; iterate in reverse order so earlier deletions
# do not shift the indices that are still to be removed
for index in tqdm(sorted(users_to_remove, reverse=True)):
    delete_row_csr(aux_matrix, index)
print("after:", aux_matrix.shape)
In [ ]:
print("before:", aux_matrix.shape)
# Remove users via the LIL format; note that tolil() returns a new
# matrix, so the result must be rebound
aux_matrix = aux_matrix.tolil()
for index in tqdm(sorted(users_to_remove, reverse=True)):
    delete_row_lil(aux_matrix, index)

aux_matrix = aux_matrix.tocsr()  # Convert back to CSR
print("after:", aux_matrix.shape)