import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


            
              creditcard_df = pd.read_csv("Marketing_data.csv")


            
              creditcard_df


            
              creditcard_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 18 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   CUST_ID                           8950 non-null   object 
 1   BALANCE                           8950 non-null   float64
 2   BALANCE_FREQUENCY                 8950 non-null   float64
 3   PURCHASES                         8950 non-null   float64
 4   ONEOFF_PURCHASES                  8950 non-null   float64
 5   INSTALLMENTS_PURCHASES            8950 non-null   float64
 6   CASH_ADVANCE                      8950 non-null   float64
 7   PURCHASES_FREQUENCY               8950 non-null   float64
 8   ONEOFF_PURCHASES_FREQUENCY        8950 non-null   float64
 9   PURCHASES_INSTALLMENTS_FREQUENCY  8950 non-null   float64
 10  CASH_ADVANCE_FREQUENCY            8950 non-null   float64
 11  CASH_ADVANCE_TRX                  8950 non-null   int64  
 12  PURCHASES_TRX                     8950 non-null   int64  
 13  CREDIT_LIMIT                      8949 non-null   float64
 14  PAYMENTS                          8950 non-null   float64
 15  MINIMUM_PAYMENTS                  8637 non-null   float64
 16  PRC_FULL_PAYMENT                  8950 non-null   float64
 17  TENURE                            8950 non-null   int64  
dtypes: float64(14), int64(3), object(1)
memory usage: 1.2+ MB


            
              creditcard_df.describe()


            
              # Compra de $40761 en una sola compra. Valor atípico.

creditcard_df[creditcard_df["ONEOFF_PURCHASES"] == 40761.25]


            
              creditcard_df['CASH_ADVANCE'].max()

47137.21176


            
              fig, ax = plt.subplots(figsize=(6,5))
ax.boxplot(creditcard_df['ONEOFF_PURCHASES'])
ax.set_title('Boxplot compra en un pago')
ax.set_ylabel('Pago')
plt.show()


            
              fig, ax = plt.subplots(figsize=(6,5))
ax.boxplot(creditcard_df['BALANCE'])
ax.set_title('Boxplot')
ax.set_ylabel('Pago')
plt.show()

fig, ax = plt.subplots(figsize=(6,5))
ax.boxplot(creditcard_df['PURCHASES'])
ax.set_title('Boxplot')
ax.set_ylabel('Pago')
plt.show()

fig, ax = plt.subplots(figsize=(6,5))
ax.boxplot(creditcard_df['CASH_ADVANCE'])
ax.set_title('Boxplot')
ax.set_ylabel('Pago')
plt.show()

fig, ax = plt.subplots(figsize=(6,5))
ax.boxplot(creditcard_df['PAYMENTS'])
ax.set_title('Boxplot')
ax.set_ylabel('Pago')
plt.show()


            
              # Vamos a ver quien pago por anticipado $47137
# Este cliente hizo un total de 123 transacciones por adelantado
# Nunca paga sus compras completamente con la tarjeta

creditcard_df[creditcard_df['CASH_ADVANCE'] == 47137.211760000006]


            
              # Encontrar outliers en la columna 'ONEOFF_PURCHASES'

Q1 = creditcard_df['ONEOFF_PURCHASES'].quantile(0.25)
Q3 = creditcard_df['ONEOFF_PURCHASES'].quantile(0.75)
IQR = Q3 - Q1

# Definir los valores atípicos
outliers = creditcard_df[(creditcard_df['ONEOFF_PURCHASES'] < (Q1 - 1.5 * IQR)) | (creditcard_df['ONEOFF_PURCHASES'] > (Q3 + 1.5 * IQR))]

print(outliers)

     CUST_ID      BALANCE  BALANCE_FREQUENCY  PURCHASES  ONEOFF_PURCHASES  \
3     C10004  1666.670542           0.636364    1499.00           1499.00   
6     C10007   627.260806           1.000000    7091.01           6402.63   
11    C10012   630.794744           0.818182    1492.18           1492.18   
12    C10013  1516.928620           1.000000    3217.99           2500.23   
21    C10022  6369.531318           1.000000    6359.95           5910.04   
...      ...          ...                ...        ...               ...   
8748  C18987  1042.816735           0.625000    3950.00           3950.00   
8758  C18997  1812.542545           0.857143    2918.08           2918.08   
8801  C19041   275.020950           1.000000    1943.00           1943.00   
8843  C19084   751.590839           0.444444    2000.00           2000.00   
8897  C19138   328.686581           1.000000    2806.78           2588.53   

      INSTALLMENTS_PURCHASES  CASH_ADVANCE  PURCHASES_FREQUENCY  \
3                       0.00    205.788017             0.083333   
6                     688.38      0.000000             1.000000   
11                      0.00      0.000000             0.250000   
12                    717.76      0.000000             1.000000   
21                    449.91    229.028245             1.000000   
...                      ...           ...                  ...   
8748                    0.00      0.000000             0.250000   
8758                    0.00      0.000000             0.571429   
8801                    0.00      0.000000             0.600000   
8843                    0.00      0.000000             0.111111   
8897                  218.25      0.000000             0.857143   

      ONEOFF_PURCHASES_FREQUENCY  PURCHASES_INSTALLMENTS_FREQUENCY  \
3                       0.083333                          0.000000   
6                       1.000000                          1.000000   
11                      0.250000                          0.000000   
12                      0.250000                          0.916667   
21                      0.916667                          1.000000   
...                          ...                               ...   
8748                    0.250000                          0.000000   
8758                    0.571429                          0.000000   
8801                    0.600000                          0.000000   
8843                    0.111111                          0.000000   
8897                    0.714286                          0.285714   

      CASH_ADVANCE_FREQUENCY  CASH_ADVANCE_TRX  PURCHASES_TRX  CREDIT_LIMIT  \
3                   0.083333                 1              1        7500.0   
6                   0.000000                 0             64       13500.0   
11                  0.000000                 0              6        2000.0   
12                  0.000000                 0             26        3000.0   
21                  0.333333                 6             92       11250.0   
...                      ...               ...            ...           ...   
8748                0.000000                 0              3        3150.0   
8758                0.000000                 0              5        3000.0   
8801                0.000000                 0              9         500.0   
8843                0.000000                 0              1        2000.0   
8897                0.000000                 0             26        1200.0   

         PAYMENTS  MINIMUM_PAYMENTS  PRC_FULL_PAYMENT  TENURE  
3        0.000000               NaN          0.000000      12  
6     6354.314328        198.065894          1.000000      12  
11     705.618627        155.549069          0.000000      12  
12     608.263689        490.207013          0.250000      12  
21    2077.959051       1659.775075          0.000000      12  
...           ...               ...               ...     ...  
8748  6274.982741        303.462882          0.000000       8  
8758   247.249275        297.395431          0.000000       7  
8801  1438.120632        142.241273          0.555556      10  
8843   265.918137        353.102768          0.000000       9  
8897  1796.886852         93.017466          0.250000       7  

[1013 rows x 18 columns]


            
              print(outliers.shape) # Muestra la cantidad de filas y columnas de los outliers

(1013, 18)


            
              print(outliers['ONEOFF_PURCHASES'].describe()) # Muestra estadísticas básicas de los outliers

count     1013.000000
mean      3546.976594
std       3678.788638
min       1445.140000
25%       1842.640000
50%       2475.930000
75%       3721.390000
max      40761.250000
Name: ONEOFF_PURCHASES, dtype: float64


            
              creditcard_df.isnull().sum()

CUST_ID                               0
BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
TENURE                                0
dtype: int64


            
              # Vamos a rellenar los datos faltantes con el promedio del campo 'MINIMUM_PAYMENT'
creditcard_df.loc[(creditcard_df['MINIMUM_PAYMENTS'].isnull() == True), 'MINIMUM_PAYMENTS'] = creditcard_df['MINIMUM_PAYMENTS'].mean()


            
              # Vamos a rellenar los datos faltantes con el promedio del campo 'CREDIT_LIMIT'
creditcard_df.loc[(creditcard_df['CREDIT_LIMIT'].isnull() == True), 'CREDIT_LIMIT'] = creditcard_df['CREDIT_LIMIT'].mean()


            
              creditcard_df.isnull().sum()

CUST_ID                             0
BALANCE                             0
BALANCE_FREQUENCY                   0
PURCHASES                           0
ONEOFF_PURCHASES                    0
INSTALLMENTS_PURCHASES              0
CASH_ADVANCE                        0
PURCHASES_FREQUENCY                 0
ONEOFF_PURCHASES_FREQUENCY          0
PURCHASES_INSTALLMENTS_FREQUENCY    0
CASH_ADVANCE_FREQUENCY              0
CASH_ADVANCE_TRX                    0
PURCHASES_TRX                       0
CREDIT_LIMIT                        0
PAYMENTS                            0
MINIMUM_PAYMENTS                    0
PRC_FULL_PAYMENT                    0
TENURE                              0
dtype: int64


            
              # Varifiquemos si tenemos entradas duplicadas en nuestros datos
creditcard_df.duplicated().sum()

0


            
              # Podemos deshacernos del campo Customer ID ya que no nos sirve para nada
creditcard_df.drop("CUST_ID", axis = 1, inplace=True)


            
              creditcard_df.head()


            
              n = len(creditcard_df.columns)
n

17


            
              creditcard_df.columns

Index(['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES',
       'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY',
       'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
       'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
       'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
       'TENURE'],
      dtype='object')


            
              plt.figure(figsize = (10, 50))
for i in range(n):
    plt.subplot(n, 1, i+1)
    sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
    plt.title(creditcard_df.columns[i])

plt.tight_layout()

C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})
C:\Users\alain\AppData\Local\Temp\ipykernel_13364\4006367705.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(creditcard_df[creditcard_df.columns[i]], kde_kws = {"color": "r", "lw": 1, "label": "KDE"}, hist_kws={"color": "b"})


            
              # sns.pairplot(creditcard_df)
# Hay correlación entre 'PURCHASES' y ONEOFF_PURCHASES & INSTALMENT_PURCHASES
# Se ve una tendencia entre 'PURCHASES' y 'CREDIT_LIMIT' & 'PAYMENTS'


            
              correlations = creditcard_df.corr()


            
              f, ax = plt.subplots(figsize = (20,20))
sns.heatmap(correlations, annot = True)
# 'PURCHASES' tienen una alta correlación con one-off purchases, 'installment purchases, purchase transactions, credit limit y payments.
# Correlación positiva muy elevada entre 'PURCHASES_FREQUENCY' y 'PURCHASES_INSTALLMENT_FREQUENCY'

<Axes: >


            
              # Empecemos por escalar primero el dataset
scaler = StandardScaler()
creditcard_df_scaled = scaler.fit_transform(creditcard_df)


            
              creditcard_df_scaled.shape


            
              creditcard_df_scaled


            
              scores_1 = []

range_values = range(1, 20)

for i in range_values:
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(creditcard_df_scaled)
    scores_1.append(kmeans.inertia_) #WCSS


plt.plot(range_values, scores_1, 'bx-')
plt.title("Encontrar el número óptimo de Clusters")
plt.xlabel("Clusters")
plt.ylabel("WCSS(k)")
plt.show()



# Con el gráfico podemos ver que en 4 clusters es donde se forma el codo de la curva.
# Sin embargo, los valores no se reducen a una forma lineal hasta el 8º cluster.
# Elijamos pues un número de clusters igual a 8.


            
              kmeans = KMeans(8)
kmeans.fit(creditcard_df_scaled)
labels = kmeans.labels_


            
              kmeans.cluster_centers_.shape


            
              cluster_centers = pd.DataFrame(data = kmeans.cluster_centers_, columns=[creditcard_df.columns])
cluster_centers


            
              # Para entender mejor estos valores, vamos a aplicar la transformación inversa.
cluster_centers = scaler.inverse_transform(cluster_centers)
cluster_centers = pd.DataFrame(data = cluster_centers, columns=[creditcard_df.columns])
cluster_centers

# Primer Cluster de Clientes (Transactors): Esos son los clientes que pagan la menor cantidad de cargos por intereses y tienen cuidado con su dinero, Clúster con el saldo más bajo ($ 104) y anticipo en efectivo ($ 303), Porcentaje de pago completo = 23%
# Segundo Cluster de Clientes (Revolvers) que usan tarjeta de crédito como préstamo (sector más lucrativo): saldo más alto ($ 5000) y anticipo en efectivo (~ $ 5000), baja frecuencia de compra, alta frecuencia de anticipo en efectivo (0.5), transacciones de anticipo en efectivo alto (16) y bajo porcentaje de pago (3%)
# Tercer Cluster de Clientes (VIP/Prime): límite de crédito alto $ 16K y porcentaje más alto de pago completo, objetivo para aumentar el límite de crédito y aumentar los hábitos de gasto
# Cuarto Cluster de Clientes (low tenure): estos son clientes con baja antigüedad (7 años), saldo bajo


            
              labels.shape


            
              labels.min()


            
              labels.max()


            
              y_kmeans = kmeans.fit_predict(creditcard_df_scaled)
y_kmeans


            
              # Concatenamos las etiquetas de los clusters con el dataset riginal
creditcard_df_cluster = pd.concat([creditcard_df, pd.DataFrame({'cluster': labels})], axis = 1)
creditcard_df_cluster.head()


            
              # Visualizamos histogramas para cada cluster
for i in creditcard_df.columns:
    plt.figure(figsize=(35, 5))
    for j in range(8):
        plt.subplot(1, 8, j+1)
        cluster = creditcard_df_cluster[creditcard_df_cluster['cluster'] == j]
        cluster[i].hist(bins = 20)
        plt.title('{}    \nCluster {}'.format(i, j))
    plt.show()


            
              # Obtenemos las componentes principales
pca = PCA(n_components = 2)
principal_comp = pca.fit_transform(creditcard_df_scaled)
principal_comp


            
              # Creamos un dataframe con las dos componentes
pca_df = pd.DataFrame(data = principal_comp, columns=["pca1", "pca2"])
pca_df.head()


            
              # Concatenamos las etiquetas de los clusters con el dataframe de las componentes principales

pca_df = pd.concat([pca_df, pd.DataFrame({'cluster':labels})], axis = 1)

pca_df.head()


            
              plt.figure(figsize=(10,10))
ax = sns.scatterplot(x = "pca1", y = "pca2", hue = "cluster", data = pca_df,
                     palette = ["red", "green", "blue", "pink", "yellow", "gray", "purple", "black"])
plt.show()


            
              from tensorflow.keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.initializers import glorot_uniform
from keras.optimizers import SGD

encoding_dim = 7

input_df = Input(shape = (17, ))

# Glorot normal inicializador (Xavier normal initializer) tomar muestras aleatorias de una distribución normal truncada

x = Dense(encoding_dim, activation = 'relu')(input_df)
x = Dense(500, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)
x = Dense(500, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)
x = Dense(2000, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)

encoded = Dense(10, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)

x = Dense(2000, activation = 'relu', kernel_initializer = 'glorot_uniform')(encoded)
x = Dense(500, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)

decoded = Dense(17, kernel_initializer = 'glorot_uniform')(x)


autoencoder = Model(input_df, decoded)

encoder = Model(input_df, encoded)

autoencoder.compile(optimizer = 'adam', loss = 'mean_squared_error')


            
              creditcard_df_scaled.shape


            
              autoencoder.summary()


            
              autoencoder.fit(creditcard_df_scaled, creditcard_df_scaled, batch_size=128, epochs = 25, verbose = 1)


            
              autoencoder.save_weights('autoencoder.h5')


            
              pred = encoder.predict(creditcard_df_scaled)


            
              pred.shape


            
              scores_2 = []

range_values = range(1,20)

for i in range_values:
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(pred)
    scores_2.append(kmeans.inertia_)


plt.plot(range_values, scores_2, 'bx-')
plt.title("Encontrar el número óptimo de clusters")
plt.xlabel("Número de Clusters")
plt.ylabel("WCSS(k)")
plt.show()


            
              plt.plot(range_values, scores_1, 'bx-', color = "r")
plt.plot(range_values, scores_2, 'bx-', color = "g")


            
              kmeans = KMeans(4)
kmeans.fit(pred)
labels = kmeans.labels_
y_kmeans = kmeans.fit_predict(pred)


            
              df_cluster_dr = pd.concat([creditcard_df, pd.DataFrame({'cluster': labels})], axis = 1)
df_cluster_dr.head()


            
              pca = PCA(n_components=2)
princ_comp = pca.fit_transform(pred)
pca_df = pd.DataFrame(data = princ_comp, columns=["pca1", "pca2"])
pca_df.head()


            
              pca_df = pd.concat([pca_df, pd.DataFrame({"cluster":labels})], axis = 1)
pca_df


            
              plt.figure(figsize=(10,10))
ax = sns.scatterplot(x="pca1", y = "pca2", hue="cluster", data = pca_df, palette=["red", "green", "blue", "yellow"])
plt.show()

	BALANCE	BALANCE_FREQUENCY	PURCHASES	ONEOFF_PURCHASES	INSTALLMENTS_PURCHASES	CASH_ADVANCE	PURCHASES_FREQUENCY	ONEOFF_PURCHASES_FREQUENCY	PURCHASES_INSTALLMENTS_FREQUENCY	CASH_ADVANCE_FREQUENCY	CASH_ADVANCE_TRX	PURCHASES_TRX	CREDIT_LIMIT	PAYMENTS	MINIMUM_PAYMENTS	PRC_FULL_PAYMENT	TENURE
count	8950.000000	8950.000000	8950.000000	8950.000000	8950.000000	8950.000000	8950.000000	8950.000000	8950.000000	8950.000000	8950.000000	8950.000000	8949.000000	8950.000000	8637.000000	8950.000000	8950.000000
mean	1564.474828	0.877271	1003.204834	592.437371	411.067645	978.871112	0.490351	0.202458	0.364437	0.135144	3.248827	14.709832	4494.449450	1733.143852	864.206542	0.153715	11.517318
std	2081.531879	0.236904	2136.634782	1659.887917	904.338115	2097.163877	0.401371	0.298336	0.397448	0.200121	6.824647	24.857649	3638.815725	2895.063757	2372.446607	0.292499	1.338331
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	50.000000	0.000000	0.019163	0.000000	6.000000
25%	128.281915	0.888889	39.635000	0.000000	0.000000	0.000000	0.083333	0.000000	0.000000	0.000000	0.000000	1.000000	1600.000000	383.276166	169.123707	0.000000	12.000000
50%	873.385231	1.000000	361.280000	38.000000	89.000000	0.000000	0.500000	0.083333	0.166667	0.000000	0.000000	7.000000	3000.000000	856.901546	312.343947	0.000000	12.000000
75%	2054.140036	1.000000	1110.130000	577.405000	468.637500	1113.821139	0.916667	0.300000	0.750000	0.222222	4.000000	17.000000	6500.000000	1901.134317	825.485459	0.142857	12.000000
max	19043.138560	1.000000	49039.570000	40761.250000	22500.000000	47137.211760	1.000000	1.000000	1.000000	1.500000	123.000000	358.000000	30000.000000	50721.483360	76406.207520	1.000000	12.000000

DEPARTAMENTO DE MARKETING¶

IMPORTAR LAS LIBRERÍAS Y EL DATASET¶

VISUALIZACIÓN DEL DATASET¶

K-MEANS¶

ENCONTRAR EL NÚMERO ÓPTIMO DE CLISTERS UTILIZANDO EL MÉTODO DEL CODO¶

APLICAR EL MÉTODO DE K-MEANS¶

APLICAR ANÁLISIS DE LAS COMPONENTES PRINCIPALES Y VISUALIZAR LOS RESULTADOS¶

AUTOENCODERS¶

	CUST_ID	BALANCE	BALANCE_FREQUENCY	PURCHASES	ONEOFF_PURCHASES	INSTALLMENTS_PURCHASES	CASH_ADVANCE	PURCHASES_FREQUENCY	ONEOFF_PURCHASES_FREQUENCY	PURCHASES_INSTALLMENTS_FREQUENCY	CASH_ADVANCE_FREQUENCY	CASH_ADVANCE_TRX	PURCHASES_TRX	CREDIT_LIMIT	PAYMENTS	MINIMUM_PAYMENTS	PRC_FULL_PAYMENT	TENURE
0	C10001	40.900749	0.818182	95.40	0.00	95.40	0.000000	0.166667	0.000000	0.083333	0.000000	0	2	1000.0	201.802084	139.509787	0.000000	12
1	C10002	3202.467416	0.909091	0.00	0.00	0.00	6442.945483	0.000000	0.000000	0.000000	0.250000	4	0	7000.0	4103.032597	1072.340217	0.222222	12
2	C10003	2495.148862	1.000000	773.17	773.17	0.00	0.000000	1.000000	1.000000	0.000000	0.000000	0	12	7500.0	622.066742	627.284787	0.000000	12
3	C10004	1666.670542	0.636364	1499.00	1499.00	0.00	205.788017	0.083333	0.083333	0.000000	0.083333	1	1	7500.0	0.000000	NaN	0.000000	12
4	C10005	817.714335	1.000000	16.00	16.00	0.00	0.000000	0.083333	0.083333	0.000000	0.000000	0	1	1200.0	678.334763	244.791237	0.000000	12
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
8945	C19186	28.493517	1.000000	291.12	0.00	291.12	0.000000	1.000000	0.000000	0.833333	0.000000	0	6	1000.0	325.594462	48.886365	0.500000	6
8946	C19187	19.183215	1.000000	300.00	0.00	300.00	0.000000	1.000000	0.000000	0.833333	0.000000	0	6	1000.0	275.861322	NaN	0.000000	6
8947	C19188	23.398673	0.833333	144.40	0.00	144.40	0.000000	0.833333	0.000000	0.666667	0.000000	0	5	1000.0	81.270775	82.418369	0.250000	6
8948	C19189	13.457564	0.833333	0.00	0.00	0.00	36.558778	0.000000	0.000000	0.000000	0.166667	2	0	500.0	52.549959	55.755628	0.250000	6
8949	C19190	372.708075	0.666667	1093.25	1093.25	0.00	127.040008	0.666667	0.666667	0.000000	0.333333	2	23	1200.0	63.165404	88.288956	0.000000	6