import numpy as np
import pandas as pd
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os
import warnings
# Set the OpenMP thread-count environment variable for this process
# (environment values must be strings).
os.environ.update({'OMP_NUM_THREADS': '19'})
# Lift Altair's row cap so charts can be built from the whole dataset.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')


#dataset
spotify_data = pd.read_csv("C:/Users/patri/datasets/spotify songs/Spotify_Song_Attributes.csv")

#Select attributes of interest
selected_attributes = ['id', 'artistName', 'trackName', 'msPlayed', 'genre', 'danceability', 'energy', 'key', 
                   'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 
                   'liveness', 'valence', 'tempo', 'type', 'time_signature']

#Filtered dataset. The explicit copy makes spotify_f independent of
#spotify_data, so the cleaning steps below never write into a slice of the
#raw frame (this is what raised SettingWithCopyWarning before).
spotify_f = spotify_data[selected_attributes].copy()

missing_vals = spotify_f.isna().sum() #store missing values
print(missing_vals) #show missing values

#fill in missing values w/ 'Unknown' for genre.
#Assign the result back instead of using fillna(inplace=True) on a column
#selection, which may act on a temporary object and leave spotify_f unchanged.
spotify_f['genre'] = spotify_f['genre'].fillna('Unknown')

spotify_f = spotify_f.loc[spotify_f['id'].notna()] #drop observations with a missing ID

spotify_f.isna().sum()

#checking for duplicates
duplicate_rows = spotify_f[spotify_f.duplicated()]
len(duplicate_rows)

#remove exact duplicate rows and renumber the index from 0
spotify_f = spotify_f.drop_duplicates().reset_index(drop = True)
spotify_f.tail()

id                   550
artistName             0
trackName              0
msPlayed               0
genre               1500
danceability         550
energy               550
key                  550
loudness             550
mode                 550
speechiness          550
acousticness         550
instrumentalness     550
liveness             550
valence              550
tempo                550
type                 550
time_signature       550
dtype: int64

C:\Users\patri\AppData\Local\Temp\ipykernel_9212\1603919404.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_f['genre'].fillna('Unknown', inplace = True)


#Checking data types before casting
data_types = [spotify_f.dtypes]
print(data_types)

#Column groups, one per target dtype
objects = ['id']
categories = ['artistName', 'trackName', 'genre', 'key', 'mode', 'type', 'time_signature']
integers = ['msPlayed']
floats = ['danceability', 'energy', 'loudness', 'speechiness',
         'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

#Cast each group in turn: object, category, int, float64
for column_group, target_dtype in (
    (objects, 'object'),
    (categories, 'category'),
    (integers, int),
    (floats, 'float64'),
):
    spotify_f[column_group] = spotify_f[column_group].astype(target_dtype)

spotify_f.dtypes

#Reformat attributes

#new column holding play time in seconds (msPlayed is in milliseconds)
spotify_f['secondsPlayed'] = spotify_f['msPlayed'].div(1000)
#order rows alphabetically by artist; the existing index labels are kept
spotify_f.sort_values('artistName', inplace = True)

columns_normalize = ['danceability', 'energy', 'loudness', 'speechiness', 
                        'acousticness', 'liveness', 'valence', 'tempo']

#Gaussian normalization: subtract the column mean, divide by the column std
for feature_name in columns_normalize:
    raw_values = spotify_f[feature_name]
    spotify_f[feature_name] = raw_values.sub(raw_values.mean()).div(raw_values.std())

spotify_f.head()

[id                   object
artistName           object
trackName            object
msPlayed              int64
genre                object
danceability        float64
energy              float64
key                 float64
loudness            float64
mode                float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
type                 object
time_signature      float64
dtype: object]


def plot_histogram(dataframe, attribute_name, title):
    """
    Build a 250x250 Altair bar-chart histogram for one attribute.

    Parameters:
    - dataframe: pandas dataframe object holding the data to plot.
    - attribute_name: string naming the column to bin on the x-axis
      (encoded as quantitative, at most 50 bins).
    - title: string used as the x-axis title and inside the chart title.

    Returns:
    - The Altair chart; the y-axis is the record count per bin.
    """
    binned_x = alt.X(
        f'{attribute_name}:Q',
        bin = alt.Bin(maxbins = 50),
        title = title,
    )
    bars = alt.Chart(dataframe).mark_bar().encode(binned_x, y = 'count()')
    return bars.properties(title = f'Histogram of {title}', width = 250, height = 250)

#histograms - one chart per audio feature
loudness_histogram = plot_histogram(spotify_f, 'loudness', 'Loudness')
tempo_histogram = plot_histogram(spotify_f, 'tempo', 'Tempo')
danceability_histogram = plot_histogram(spotify_f, 'danceability', 'Danceability')
valence_histogram = plot_histogram(spotify_f, 'valence', 'Valence')
energy_histogram = plot_histogram(spotify_f, 'energy', 'Energy')

#plots - place the five histograms side by side in a single row
piped_plots = alt.hconcat(
    loudness_histogram,
    tempo_histogram,
    danceability_histogram,
    valence_histogram,
    energy_histogram,
)
piped_plots


#Plot 1 - scatterplot

#create dropdowns and options for x, y axes
dropdown_x = alt.binding_select(options = columns_normalize, name = "X-axis Attribute:")
dropdown_y = alt.binding_select(options = columns_normalize, name = "Y-axis Attribute:")
dropdown_genre = alt.binding_select(options = spotify_f['genre'].value_counts().index.to_list(), 
                                    name = "Highlight Genre:")

#parameters to add to the plot.
#The default must be a plain string equal to one of the dropdown options: the
#parameter is used below as a column name (datum[<param>]), and a one-element
#list only worked through implicit string coercion.
selection_x = alt.param(value = 'loudness', bind = dropdown_x)
selection_y = alt.param(value = 'energy', bind = dropdown_y)

#defines the selection for genres, defaults to all genres being equally salient
selection_genre = alt.selection_point(value = None, fields = ['genre'], bind = dropdown_genre)

#dynamic scatterplot
s_plot = alt.Chart(spotify_f).mark_point().encode(
    x=alt.X('x_attribute:Q', title = 'X-Attribute'),
    y=alt.Y('y_attribute:Q', title = 'Y-Attribute'),
    color = alt.condition(selection_genre, alt.Color('genre:N').legend(None), alt.value('lightgray')), #color is set by genre
    opacity = alt.condition(selection_genre, alt.value(1), alt.value(0.07)), #opacity is set by the dropdown selection
    tooltip = ['artistName', 'trackName'] #tooltip will show artist and track name
).properties(
    title = 'Plot of selected X and Y attributes',
    width=500,
    height=500
).transform_calculate(
    #copy the columns chosen in the dropdowns into fixed field names so the
    #x/y encodings above can stay static
    x_attribute = f'datum[{selection_x.name}]',
    y_attribute = f'datum[{selection_y.name}]'
    #(the former 'plot_filter' field was dropped: it indexed datum by the
    #selection's name and nothing in the chart read it)
).add_params(
    selection_x, selection_y,
    selection_genre
).interactive() #allows you to zoom and pan on the plot


# Plot 2 - heatmap

#correlation matrix of the normalized features, rounded to 2 decimals
corr_data = spotify_f[columns_normalize]
correlation_matrix = corr_data.corr().round(2)

#long-form dataframe: one row per (Attribute1, Attribute2) pair
correlation_long = (
    correlation_matrix
    .unstack()
    .rename_axis(['Attribute1', 'Attribute2'])
    .reset_index(name = 'Correlation')
)


#plotting correlation heatmap

#base chart holding the data and the shared x/y encodings for both layers
base = alt.Chart(correlation_long).encode(
    x = alt.X('Attribute1:O', title = None),
    y = alt.Y('Attribute2:O', title = None))

#heatmap layer: one rectangle per pair, colored on a fixed [-1, 1] scale
heat_color = alt.Color(
    'Correlation:Q',
    scale = alt.Scale(domain = [-1,1], scheme = 'redblue'),
    legend = alt.Legend(title = 'Correlation'))
corr_heatmap = base.mark_rect().encode(color = heat_color).properties(
    title = "Correlation Heatmap",
    width = 500,
    height = 500
)

#text layer: the coefficient printed in each square, white where it equals 1
text_color = alt.condition(
    alt.datum.Correlation == 1,
    alt.value('white'),
    alt.value('black'))
coeff_text = base.mark_text(baseline = 'middle', fontSize = 12).encode(
    text = "Correlation:Q",
    color = text_color
)

#overlay the two layers, stack the scatterplot underneath and export to HTML
full_heatmap = alt.layer(corr_heatmap, coeff_text)
heatmap_scatter = full_heatmap & s_plot
heatmap_scatter.save('heatmap_scatter.html')
heatmap_scatter


#silence FutureWarning/UserWarning for the rest of the session
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
#scale our data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(spotify_f[columns_normalize])

#Setting K using the elbow method
ssd = [] #sum of squared distances (inertia) for each candidate K
k_poss = range(1,20) #range of possible K's

for k in k_poss:
    #fixed random_state so the curve is reproducible between runs and uses
    #the same seed as the final models
    kmeans = KMeans(n_clusters = k, n_init = 10, random_state = 55)
    kmeans.fit(scaled_data)
    ssd.append(kmeans.inertia_)

#Elbow-curve plot
plt.figure(figsize = (6,6))
plt.plot(k_poss, ssd, marker = 'o')
plt.xlabel('# Clusters (K)')
plt.ylabel('SSD')
plt.title('Plot of Elbow-Curve')
plt.show() #call the function; a bare `plt.show` only evaluates to the function object

<function matplotlib.pyplot.show(close=None, block=None)>


from sklearn.metrics import silhouette_score
#K-means clustering with 4 clusters; labels are stored on the dataframe
k_means_model = KMeans(n_clusters = 4, n_init = 10, random_state = 55)
spotify_f['cluster'] = k_means_model.fit_predict(scaled_data)

#silhouette score of the assignment
sil_coeff = silhouette_score(scaled_data, spotify_f['cluster'])
print(f'The silhouette score is : {sil_coeff}')

#Size of each cluster as a two-column frame: Cluster, Count
k_counts = (
    spotify_f['cluster']
    .value_counts()
    .rename_axis('Cluster')
    .reset_index(name = 'Count')
)

#Bar chart of cluster counts
cluster_bars = alt.Chart(k_counts).mark_bar().encode(
    x = alt.X('Cluster:O'),
    y = alt.Y('Count:Q'),
    color = 'Cluster:N'
)
k_counts_plot = cluster_bars.properties(
    title = 'Number of Tracks in each Cluster',
    width = 300,
    height = 300
)

k_counts_plot

The silhouette score is : 0.3002957118257905


#Group the data by cluster and calculate feature means.
#The feature columns are selected BEFORE aggregating: calling .mean() on the
#whole frame also tries to average the id/category columns, which raises a
#TypeError on pandas >= 2.0 and is wasted work on older versions.
feature_means = spotify_f.groupby('cluster')[columns_normalize].mean().reset_index()

#Convert data into long format: one row per (cluster, feature) pair
feature_means_long = feature_means.melt(id_vars = 'cluster', var_name = 'feature', value_name = 'mean')

#One small bar chart per feature, bars colored by cluster
feature_chart = alt.Chart(feature_means_long).mark_bar().encode(
    x='cluster:N',
    y='mean:Q',
    color='cluster:N',
    facet='feature:N'
).properties(
    title='Feature Means by Cluster',
    width=200,
    height=200
)

feature_chart


from sklearn.preprocessing import OneHotEncoder

#Combining valence and energy into a new feature 'mood'
#NOTE(review): both inputs were z-scored earlier in this file, so two
#below-average values multiply to a positive 'mood', the same sign as two
#above-average values - confirm that this is the intended meaning.
spotify_f['mood'] = spotify_f['valence'] * spotify_f['energy']

#Combining tempo and danceability into a new feature 'rhythm'
spotify_f['rhythm'] = spotify_f['tempo'] * spotify_f['danceability']

#Use onehot-encoding to add top 10 genres as features, assign all other genres to 'Other'.
#'Unknown' (the placeholder for a missing genre) is encoded as 'Other' as well.

#get top 10 genres and replace other genres with 'Other'.
#'Unknown' is removed by label rather than by skipping position 0, so the
#result stays correct even if 'Unknown' is not the most frequent value.
genre_frequencies = spotify_f['genre'].value_counts()
top_10_genres = genre_frequencies.drop('Unknown', errors = 'ignore').index[:10].tolist()
top_genre_lookup = set(top_10_genres) #set membership test for the per-row check
spotify_f['genre'] = spotify_f['genre'].apply(lambda x: x if x in top_genre_lookup else 'Other')

#one-hot encode the genres
def onehot_encode(df, column):
    """
    Idempotently one-hot encodes a column in a dataframe.

    Parameters:
    - df: The dataframe.
    - column: The column to one-hot encode.

    Returns:
    - The dataframe with the column one-hot encoded. If `column` is absent, or
      if all of its encoded columns already exist, `df` itself is returned
      unchanged. Otherwise a new dataframe is returned with a fresh 0..n-1
      index, the `<column>_<value>` indicator columns appended and the
      original column removed.
    """
    #nothing to do when the column is absent (e.g. it was already encoded and dropped)
    if column not in df.columns:
        return df

    #`sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    #in 1.4; try the current keyword first and fall back for older releases
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)

    encoded = encoder.fit_transform(df[[column]])
    encoded_columns = encoder.get_feature_names_out([column])

    #already encoded: every expected indicator column is present
    if set(encoded_columns).issubset(df.columns):
        return df

    #perform one-hot encoding; both sides get a fresh positional index so the
    #concat aligns row-by-row regardless of df's current index labels
    encoded_df = pd.DataFrame(encoded, columns = encoded_columns).reset_index(drop = True)
    df = pd.concat([df.reset_index(drop = True), encoded_df], axis = 1)

    #drop the original column
    df = df.drop(columns = column)

    return df

#encode 'genre'; both names refer to the same encoded dataframe
spotify_new_features = spotify_f = onehot_encode(spotify_f, 'genre')

#summary statistics of the engineered features, then the full column list
for engineered_feature in ('mood', 'rhythm'):
    print(spotify_new_features[engineered_feature].describe())
print(spotify_new_features.columns)

count    4765.000000
mean        0.448453
std         0.985256
min        -2.775555
25%        -0.094407
50%         0.192997
75%         0.898463
max         3.913854
Name: mood, dtype: float64
count    4765.000000
mean        0.009499
std         1.187893
min        -8.239700
25%        -0.379486
50%        -0.000174
75%         0.324163
max        15.723557
Name: rhythm, dtype: float64
Index(['id', 'artistName', 'trackName', 'msPlayed', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'type',
       'time_signature', 'secondsPlayed', 'cluster', 'mood', 'rhythm',
       'genre_Other', 'genre_alt z', 'genre_alternative metal',
       'genre_anime lo-fi', 'genre_art pop', 'genre_brostep',
       'genre_dance pop', 'genre_drift phonk', 'genre_filmi', 'genre_pop',
       'genre_singer-songwriter pop'],
      dtype='object')


'''
drop the old features since the new features were constructed using them and they will be highly
correlated and redundant
'''
source_features = ['valence', 'energy','tempo','danceability']
spotify_new_features = spotify_f.drop(columns = source_features)

#numeric inputs plus every one-hot genre indicator column
numerical_features = ['loudness', 'speechiness','acousticness','liveness','mood', 'rhythm', 
                      'instrumentalness']
genre_indicator_columns = [name for name in spotify_f.columns if 'genre_' in name]
features_for_clustering = numerical_features + genre_indicator_columns

#re-fit the scaler on the new feature set
clustering_inputs = spotify_new_features[features_for_clustering]
scaled_data = scaler.fit_transform(clustering_inputs)
scaled_data.shape

(4765, 18)


#elbow method to determine optimal number of clusters
wcss = [] #within-cluster sum of squares (inertia) for each candidate k
k_range = range(1,20)

for i in k_range:
    #n_init is set explicitly (as in the first elbow loop) because the
    #scikit-learn default changed between releases
    kmeans = KMeans(n_clusters = i, init = 'k-means++', n_init = 10, random_state = 55)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

#elbow graph #2
plt.figure(figsize = (12,8))
#plot the list filled above; the previous code referenced an undefined name
#(`icss`) and raised NameError
plt.plot(k_range, wcss, marker = 'o', linestyle = '--')
plt.title('Elbow Method Plot (New Features)')
plt.xlabel('# of Clusters')
plt.xticks(range(0,21))
plt.ylabel('ICSS')
plt.grid(True)
plt.show() #call the function; a bare `plt.show` only evaluates to the function object

<function matplotlib.pyplot.show(close=None, block=None)>


from sklearn.decomposition import PCA

#apply k-means clustering
#n_init is set explicitly because the scikit-learn default changed between releases
kmeans = KMeans(n_clusters = 12, n_init = 10, random_state = 55)
spotify_new_features['cluster'] = kmeans.fit_predict(scaled_data)

#value_counts() orders by frequency, not by cluster label; sort by label so
#each bar is drawn at (and labelled with) the cluster it actually belongs to
cluster_counts = spotify_new_features['cluster'].value_counts().sort_index()

plt.figure(figsize = (12,8))
plot = plt.bar(cluster_counts.index, cluster_counts.values)
plt.xlabel('Cluster')
plt.xticks(cluster_counts.index)
plt.yticks(range(0,3500,250))
plt.ylabel('Number of Songs in Cluster')
plt.title('Count of Songs by Cluster')
#annotate every bar with its count, centered just above the bar
for bar in plot:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height + 0.5, str(int(height)), ha = 'center', va = 'bottom')
plt.show()


#per-cluster mean of each numerical feature; the columns are selected before
#aggregating so non-numeric columns (ids, names) are never averaged, which
#raises a TypeError on pandas >= 2.0
feature_means = spotify_new_features.groupby('cluster')[numerical_features].mean()
feature_means


#per-cluster mean of each numerical feature; the columns are selected before
#aggregating so non-numeric columns (ids, names) are never averaged, which
#raises a TypeError on pandas >= 2.0
feature_means = spotify_new_features.groupby('cluster')[numerical_features].mean()

#one stacked subplot per feature
n_features = len(numerical_features)
fig, axes = plt.subplots(nrows=n_features, figsize=(10, 5*n_features))
axes = np.atleast_1d(axes) #subplots returns a bare Axes when nrows == 1

#one color per cluster, cycling through the palette if there are more clusters than colors
cmap = plt.get_cmap('tab10')
colors = cmap(np.arange(len(feature_means.index.unique())) % cmap.N)

#tick at every cluster label from 0 to the largest one
cluster_ticks = range(int(spotify_new_features['cluster'].max()) + 1)

for ax, feature in zip(axes, feature_means.columns):
    ax.bar(feature_means.index, feature_means[feature], color=colors)
    ax.set_xlabel('Cluster')
    ax.set_ylabel(f'Mean {feature}')
    ax.set_title(f'Mean {feature} by Cluster')
    ax.set_xticks(cluster_ticks)

plt.tight_layout()
plt.show()


#one-hot genre indicator columns
genres = [col for col in spotify_new_features.columns if 'genre_' in col]
#songs per genre in each cluster; the indicator columns are selected before
#summing so the text/categorical columns are never aggregated (summing them
#is wasted work and fails for categorical columns on recent pandas)
genre_counts = spotify_new_features.groupby('cluster')[genres].sum()
print(genre_counts)

         genre_Other  genre_alt z  genre_alternative metal  genre_anime lo-fi  \
cluster                                                                         
0              587.0          1.0                      0.0                0.0   
1                0.0          0.0                      0.0                0.0   
2             2849.0          0.0                      0.0                0.0   
3                0.0        327.0                      0.0                0.0   
4                0.0          0.0                      0.0               68.0   
5                0.0          0.0                      0.0                0.0   
6                0.0          0.0                      0.0                0.0   
7                0.0          0.0                      0.0                0.0   
8                0.0          0.0                     75.0                0.0   
9                0.0          0.0                      0.0                0.0   
10               0.0          0.0                      0.0                0.0   
11               0.0          0.0                      0.0                0.0   

         genre_art pop  genre_brostep  genre_dance pop  genre_drift phonk  \
cluster                                                                     
0                  0.0            0.0              0.0                0.0   
1                  0.0            0.0              0.0                0.0   
2                  0.0            0.0              0.0                0.0   
3                  0.0            0.0              0.0                0.0   
4                  0.0            0.0              0.0                0.0   
5                  0.0            0.0             86.0                0.0   
6                  0.0           58.0              0.0                0.0   
7                  0.0            0.0              0.0               62.0   
8                  0.0            0.0              0.0                0.0   
9                  0.0            0.0              0.0                0.0   
10                63.0            0.0              0.0                0.0   
11                 0.0            0.0              0.0                0.0   

         genre_filmi  genre_pop  genre_singer-songwriter pop  
cluster                                                       
0                0.0        0.0                          0.0  
1              206.0        0.0                          0.0  
2                0.0        0.0                          0.0  
3                0.0        0.0                          0.0  
4                0.0        0.0                          0.0  
5                0.0        0.0                          0.0  
6                0.0        0.0                          0.0  
7                0.0        0.0                          0.0  
8                0.0        0.0                          0.0  
9                0.0        0.0                         82.0  
10               0.0        0.0                          0.0  
11               0.0      301.0                          0.0


#silhouette score of the 12-cluster assignment on the rescaled feature set
new_cluster_labels = spotify_new_features['cluster']
sil_coeff = silhouette_score(scaled_data, new_cluster_labels)
print(f'The silhouette score is : {sil_coeff}')

The silhouette score is : 0.45048255805644133

	id	artistName	trackName	msPlayed	genre	danceability	energy	key	loudness	mode	speechiness	acousticness	instrumentalness	liveness	valence	tempo	type	time_signature
4760	2qXicQG06oT0ijKBznpgQv	Ruel	Younger	5272303	alt z	0.745	0.477	11.0	-7.706	0.0	0.0880	0.202	0.000000	0.120	0.454	136.055	audio_features	4.0
4761	6o8pM5reLgjd5i8gDY3Irt	Ben Zaidi	Younger with Time.	668478	folk-pop	0.537	0.143	2.0	-16.992	1.0	0.0331	0.961	0.005720	0.110	0.245	131.118	audio_features	3.0
4762	1EoThnDm6kQfB2idIfR30n	just valery	Your Love Is My Drug (8 Bit Slowed)	97600	sad lo-fi	0.282	0.158	6.0	-7.783	1.0	0.0311	0.438	0.134000	0.474	0.248	65.152	audio_features	4.0
4763	042Sl6Mn83JHyLEqdK7uI0	Billie Eilish	Your Power	988224	art pop	0.632	0.284	9.0	-14.025	0.0	0.0801	0.932	0.000476	0.233	0.208	129.642	audio_features	4.0
4764	3BcN2Pcy0kTG1zm8Tz9MsB	Jaden	Your Voice / Bethel, NY	213626	pop rap	0.560	0.344	3.0	-12.283	1.0	0.0306	0.866	0.001470	0.111	0.428	115.393	audio_features	3.0

	id	artistName	trackName	msPlayed	genre	danceability	energy	key	loudness	mode	speechiness	acousticness	instrumentalness	liveness	valence	tempo	type	time_signature	secondsPlayed
1616	39OqZkxrUiMAR1bG3oquBA	!!!	Even When The Water's Cold	227066	alternative dance	0.681643	1.098189	2.0	0.339436	0.0	-0.490202	-0.765425	0.001310	-0.119223	1.783087	-0.496729	audio_features	4.0	227.066
4204	3M1RZOhzt4lG3vpSYwffhe	!!!	The Hammer	3130	metallic hardcore	-1.923694	1.402014	6.0	0.932038	1.0	0.967882	-1.084631	0.000000	1.540360	0.749205	1.517300	audio_features	4.0	3.130
1848	0j8ppsOOawdPCJnSTcXgOy	$NOT	GOSHA	360000	aesthetic rap	1.550089	-0.474313	0.0	-0.268679	1.0	4.213616	1.337132	0.000013	1.150320	0.221965	-1.358855	audio_features	4.0	360.000
1841	0jZrlNfW35trATveaKIjGA	$ober	GLOCK GLOCK	49386	Unknown	0.060419	1.303476	11.0	1.428428	0.0	-0.327915	-0.968802	0.000029	0.576731	0.951038	1.398451	audio_features	4.0	49.386
7	30QR0ndUdiiMQMA9g1PGCm	$uicideboy$	...And to Those I Love, Thanks for Sticking Ar...	120005	cloud rap	1.201443	-0.215651	2.0	0.334080	1.0	-0.468980	-0.714581	0.000090	-0.264532	-1.330918	-0.185947	audio_features	4.0	120.005

	loudness	speechiness	acousticness	liveness	mood	rhythm	instrumentalness
cluster
0	-1.775285	-0.330006	1.372767	-0.308365	1.656999	0.655163	0.696517
1	0.225289	-0.147018	0.018784	0.069363	0.583906	0.026792	0.011993
2	0.282476	0.053466	-0.250158	0.065126	0.233569	-0.099945	0.071913
3	0.074329	0.058683	0.294546	-0.056932	0.257888	-0.047858	0.005694
4	-2.541869	-0.224595	1.573742	-0.200291	0.781942	-0.175242	0.906191
5	0.750867	-0.022691	-0.782197	-0.055159	0.676380	-0.019161	0.001920
6	0.899508	0.545054	-0.928754	0.602351	-0.179880	-0.286601	0.052833
7	0.787859	0.603462	-0.759373	0.329471	0.299916	0.215809	0.522359
8	0.471183	-0.007386	-0.747758	0.645521	0.200228	0.031946	0.070660
9	0.121553	0.065440	-0.039804	-0.237765	0.349221	-0.183768	0.007308
10	0.051933	-0.389362	-0.014622	-0.325825	0.613220	0.127439	0.096133
11	0.422109	0.068872	-0.243194	-0.150455	0.301399	-0.078556	0.007022

Introduction

Libraries and Configurations

Data Cleaning

Descriptive Statistics and Visualization

Machine Learning

Insights

Feature Engineering and Model Improvement

Conclusions

Appendix - Improvements