Project

1. Libraries

In [ ]:
#matrix math
import numpy as np

#Allows to vizualize data
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt2

#import and convert dataset
import pandas as pd

#Used to save smaller parts of data in pickle files
import pickle

#Used to create sparse matrix
from scipy.sparse import csr_matrix

#Garbage Colector
import gc
In [ ]:
##GLOBAL VARS:
# Tracks the index up to which the dataset has already been read
glb_index=0

# Stores the names of the pickle files in which the user::packages arrays are saved
glb_pickles_names_list=[]

# Counter of pickle files created so far
glb_pickles_count=0

# Total number of users in the dataset
glb_num_users=0

#Multi threading
import multiprocessing

2. Import

In [ ]:
#load dataset (2D data points)
# for each user, what packages has he installed?
# Each CSV line has the shape |User_ID|Packages|
#df=pd.read_csv("dataset1.csv", sep=',')
Location = r'/home/pedros/pCloudDrive/Projeto/K-means project/dataset.csv'
with open(Location) as dataset_file:
    raw_lines = dataset_file.read().splitlines()
# drop the CSV header row, keep one line per user
lines = raw_lines[1:]

3. Convert Data

In [ ]:
#load pickle
def load_obj(name):
    """Load and return the object stored in the pickle file ``name + '.pkl'``.

    NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as pickle_file:
        obj = pickle.load(pickle_file)
    return obj
In [ ]:
#save object in a pickle
def save_obj(obj, name):
    """Serialize ``obj`` to the file ``name + '.pkl'`` (highest pickle protocol)."""
    pickle_path = name + '.pkl'
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file, pickle.HIGHEST_PROTOCOL)
In [ ]:
def spli_packages(lista):
    """Parse raw dataset lines into per-user package lists.

    Input:  lista - list of strings, one per user, each formatted as
            'user_id,"pkg1,pkg2,..."' (quotes around the package list
            are optional).
    Output: list of lists; element i holds the package names from the
            i-th line of lista. The user id itself is discarded — the
            position in the returned list is the user's index.
    """
    # accumulator: one list of package names per input line
    df = []
    for line in lista:
        # uid: client id (unused); pkgs: comma-separated packages, possibly quoted
        uid, pkgs = line.split(',', 1)
        # strip the CSV quoting and split into individual package names
        df.append(pkgs.replace('"', '').split(','))
    return df
In [ ]:
#Total number of users (one dataset line per user)
glb_num_users=len(lines)

# Parse every raw line into a per-user list of installed packages
dados= spli_packages(lines)
In [ ]:
# Persist the parsed user::packages lists so parsing can be skipped next time
save_obj(dados,"dataset1_pickle")
In [ ]:
# Reload the parsed data from the pickle created above
dados=load_obj("dataset1_pickle")

Optional (only used to visualize data):

In [ ]:
#Optional: visualize data as a long table, one package per row.
# Fix: the original cell iterated over a DataFrame `df` that is never
# defined in this notebook; build the (id, package) pairs from the raw
# `lines` instead, since `dados` no longer keeps the user ids.
rows = []
for line in lines:
    uid, pkgs = line.split(',', 1)
    for package in pkgs.replace('"', '').split(','):
        rows.append([uid, package])

extended_df=pd.DataFrame(rows, columns=['id','packages'])
extended_df
In [ ]:
# Count distinct clients and distinct packages to gauge matrix size.
# (Removed the stray `b=` in the original chained assignment, which
#  leaked a meaningless global name.)
id_count=len(extended_df["id"].drop_duplicates())
packages_count=len(extended_df["packages"].drop_duplicates())
print("clients:", id_count)
print("packages:", packages_count)
# number of cells a dense clients x packages matrix would need
print("resulting matrix:", id_count*packages_count)

User-App Matrix M:

In [ ]:
def create_sparse_matrix(info):
    """Build a binary user x package matrix in CSR form.

    Each element of ``info`` is one user's list of package names; row i of
    the result holds a 1 in the column assigned to every package that user
    i installed. Columns are assigned in order of first appearance
    (incremental-vocabulary construction, as in the scipy docs).
    """
    row_ptr = [0]
    col_indices = []
    values = []
    package_to_col = {}
    for user_packages in info:
        for package in user_packages:
            # a package seen for the first time gets the next free column
            column = package_to_col.setdefault(package, len(package_to_col))
            col_indices.append(column)
            values.append(1)
        # close the current row
        row_ptr.append(len(col_indices))
    return csr_matrix((values, col_indices, row_ptr), dtype=int)
In [ ]:
# Fix: the original cell had the create_sparse_matrix call commented out and
# freed `dados` first, so `sparse_matrix` was undefined when save_obj ran.
# Build the matrix BEFORE discarding the parsed data.
sparse_matrix = create_sparse_matrix(dados)

# free the parsed data; only the sparse matrix is needed from here on
dados=None
gc.collect()

#save matrix
save_obj(sparse_matrix,"sparse_matrix")
In [ ]:
# !!! DANGER!!!! RAM KILLER
#Old solution used to create the matrix
# NOTE(review): kept for reference only — `df2` is not defined anywhere in
# this notebook, so running this cell raises NameError. It also materializes
# a dense clients x packages table, which exhausts RAM on the full dataset;
# use create_sparse_matrix above instead.

# create a binary "pivot table" which will give us the information if a client has a certain app
matrix = pd.pivot_table(df2,index=['id'], columns=['packages'], values=['n'])
print(len(matrix.values[1]))

Histograms

Number of packages per client

In [ ]:
#Get number of packages per client.
# Fix: the original mixed `plt` and `plt2` (two aliases of the same
# matplotlib.pyplot module); normalized everything to `plt` — behavior
# is identical, the code is just no longer misleading.
df1=extended_df.copy()
df_id=df1.groupby(['id']).size().reset_index().rename(columns={0:'count'})
apps=df_id['count'].copy()

count_max=apps.max()

# histogram bins: 10-app-wide buckets from 0 to 600
bins=[x for x in range(0, 600, 10)]
# full-scale histogram (y axis up to 6000 clients)
plt.hist(apps, bins, histtype="bar",rwidth=0.9)

plt.xlabel('number of apps')
plt.ylabel('clients')
plt.title('Distribution of clients (300 x 6000)')
plt.axis([0, 300, 0, 6000])
plt.grid(True)

plt.show()

# same histogram with the y axis capped at 100 clients, to see the tail
bins2=[x for x in range(0, 600, 10)]

plt.hist(apps,bins2, histtype="bar",rwidth=0.9)

plt.xlabel('number of apps')
plt.ylabel('clients')
plt.title('Distribution of clients (300 x 100)')
plt.axis([0, 300, 0, 100])
plt.grid(True)

plt.show()

"Em média, 1500 clientes diferentes instalaram 10 ou menos aps diferente"

In [ ]:
#Get number of clients per package.
# Fix: same `plt`/`plt2` alias inconsistency as the previous histogram
# cell; normalized to `plt` — behavior unchanged.
df_id=df1.groupby(['packages']).size().reset_index().rename(columns={0:'count'})
clients=df_id['count'].copy()

count_max=clients.max()

# histogram bins: buckets of 10 clients from 0 to 1000
bins=[x for x in range(0, 1000, 10)]
# full-scale histogram (y axis up to 80 000 packages)
plt.hist(clients, bins, histtype="bar",rwidth=0.9, color='r')

plt.xlabel('clients')
plt.ylabel('apps')
plt.title('Distribution of packages (600 x 80 000)')
plt.axis([0, 600, 0, 80000])
plt.grid(True)

plt.show()

# same histogram with the y axis capped at 1000 packages, to see the tail
bins2=[x for x in range(0, 1000, 10)]

plt.hist(clients,bins2, histtype="bar",rwidth=0.9, color='r')

plt.xlabel('clients')
plt.ylabel('apps')
plt.title('Distribution of packages (600 x 1 000)')
plt.axis([0, 600, 0, 1000])
plt.grid(True)

plt.show()

"Em média, 65 000 aps foram apenas instaladas por 10 ou menos clientes diferente"