# ---- Set-up parameters ----
# Years before this one are excluded from some analyses
threshold_year <- 2018
# Export the processed stream data frame as csv
save_csv <- TRUE
# Export ggplots as svg files
save_svg <- TRUE

# ---- Required libraries ----
library(jsonlite)  # Working with JSON data
library(dplyr)     # Data transformation
library(tidyr)     # Data cleaning
library(lubridate) # Handling dates and times
library(ggplot2)   # Plot graphs
library(knitr)     # Report formatting
# svglite is only needed when plots are exported as SVG
if (save_svg) {
  library(svglite)
}

# ---- Colour palette ----
pal <- c(
  "#3abdaa", "#7b2458", "#facd00", "#41658a",
  "#e63946", "#b2a3b5", "#264653"
)
Data preparation
Code
# Locate the JSON streaming-history files in the input folder
stream_files <- list.files("input_files/", pattern = "Streaming_History_Audio")

# Fail early with a clear message instead of an obscure JSON parsing error
if (length(stream_files) == 0) {
  stop(
    "No 'Streaming_History_Audio' files found in input_files/",
    call. = FALSE
  )
}

# Read and parse every streaming-history file
stream_parts <- lapply(
  stream_files,
  function(file_name) {
    fromJSON(paste0("input_files/", file_name), flatten = TRUE)
  }
)

# Merge all files into one data frame (full outer merge on shared columns).
# Reduce() returns the single element unchanged when there is only one file,
# unlike a `2:length(stream_files)` loop, which would iterate over c(2, 1).
stream <- Reduce(function(x, y) merge(x, y, all = TRUE), stream_parts)
# Yearly totals: hours played, plays, distinct track names and distinct
# artist names, restricted to threshold_year and later
by_year <- stream %>%
  filter(year(date) >= threshold_year) %>%
  group_by(year = as.factor(year(date))) %>%
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60, # ms -> hours
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    n_artists = n_distinct(artist_name)
  )

# Render the yearly summary as a report table
by_year %>%
  transmute(
    Year = year,
    `Hours played` = round(h_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N artists` = n_artists
  ) %>%
  kable(caption = "Spotify listened by year")
Spotify listened by year
Year
Hours played
N tracks
N songs
N artists
2018
304.21
5741
3108
1023
2019
296.46
6081
2801
856
2020
595.35
10926
5431
2037
2021
470.17
9460
3818
1528
2022
411.07
8286
3583
1695
2023
377.25
10017
4804
1670
2024
799.84
17293
5226
2035
By device
Code
# Hours listened per device category and year (threshold_year onwards).
# The result stays grouped by Device, as summarise() drops only the last group.
by_device <- stream %>%
  filter(year(date) >= threshold_year) %>%
  group_by(Device, year = as.factor(year(date))) %>%
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60, # ms -> hours
    .groups = "drop_last"
  )

# Stacked bars: one bar per year, one segment per device
plot_device <- ggplot(by_device, aes(x = year, y = h_played, fill = Device)) +
  # Reverse the stacking order of the segments
  geom_col(position = position_stack(reverse = TRUE)) +
  # Custom palette; devices without a category are drawn in grey
  scale_fill_manual(values = pal, na.value = "grey80") +
  theme_minimal() +
  ggtitle("Hours listening to Spotify") +
  ylab("")

# Display plot
plot_device
Code
# Save the device plot as SVG. The plot object is passed explicitly so the
# export does not depend on whichever plot ggplot2 last created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_device", ".svg"),
    plot = plot_device,
    path = "output_files/"
  )
}
Shuffle mode
Code
# Plays per year split by shuffle flag, with the within-year proportion.
# summarise() keeps the `year` grouping, so sum(n) below is the yearly total.
by_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  group_by(year = as.factor(year(date)), shuffle) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    `Proportion of tracks` = n / sum(n),
    Shuffle = ifelse(shuffle, "Yes", "No")
  )

# Stacked bars of the shuffle share per year
plot_shuffle <- ggplot(
  by_shuffle,
  aes(x = year, y = `Proportion of tracks`, fill = Shuffle)
) +
  geom_col() +
  scale_fill_manual(values = pal, na.value = "grey80") +
  theme_minimal() +
  # Show the y axis as percentages
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  ggtitle("Proportion of tracks listened in shuffle mode") +
  ylab("")

# Display plot
plot_shuffle
Code
# Save the shuffle plot as SVG. The plot object is passed explicitly so the
# export does not depend on whichever plot ggplot2 last created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_shuffle", ".svg"),
    plot = plot_shuffle,
    path = "output_files/"
  )
}
Track start and end
Code
# Build named palettes so that a reason shown in both the "start" and the
# "end" charts gets the same colour in each.

# Unique end and start reasons, sorted in decreasing order
names_end <- sort(unique(stream$End), decreasing = TRUE)
names_start <- sort(unique(stream$Start), decreasing = TRUE)

# Every distinct reason needs its own colour; fail clearly if `pal` is too short
if (length(union(names_end, names_start)) > length(pal)) {
  stop(
    "The palette has fewer colours than distinct start/end reasons",
    call. = FALSE
  )
}

# Palette for end reasons: the first colours of `pal`, named by reason.
# seq_along() (not 1:length()) keeps this empty when there are no reasons.
pal_end <- pal[seq_along(names_end)]
names(pal_end) <- names_end

# Colours of the reasons that appear both as start and as end
pal_start_end <- pal_end[names_end %in% names_start]

# Start-only reasons take the colours not already used by shared reasons.
# The vector is trimmed to the number of names so no colour is left unnamed.
names_start_only <- names_start[!names_start %in% names_end]
pal_start <- pal[!pal %in% pal_start_end][seq_along(names_start_only)]
names(pal_start) <- names_start_only

# Full start palette: shared colours first, then start-only colours
pal_start <- c(pal_start_end, pal_start)
Code
# Count plays by year, start reason and shuffle mode (threshold_year onwards)
by_reason_start_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")
  ) %>%
  group_by(year = as.factor(year(date)), `Start`, Shuffle) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))

# Proportional stacked bars of how tracks started, one facet per shuffle mode
plot_reason_start_shuffle <- ggplot(
  by_reason_start_shuffle,
  aes(x = year, y = n, fill = `Start`)
) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) +
  scale_fill_manual(values = pal_start) +
  theme_minimal() +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  ggtitle("How tracks started") +
  ylab("Proportion of tracks") +
  theme(legend.title = element_blank())

# Display plot
plot_reason_start_shuffle
Code
# Save the "how tracks started" plot as SVG. The plot object is passed
# explicitly so the export does not depend on whichever plot ggplot2 last
# created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_reason_start_shuffle", ".svg"),
    plot = plot_reason_start_shuffle,
    path = "output_files/"
  )
}
Code
# Count plays by year, end reason and shuffle mode (threshold_year onwards)
by_reason_end_shuffle <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")
  ) %>%
  group_by(year = as.factor(year(date)), `End`, Shuffle) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))

# Proportional stacked bars of how tracks ended, one facet per shuffle mode
plot_reason_end_shuffle <- ggplot(
  by_reason_end_shuffle,
  aes(x = year, y = n, fill = `End`)
) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) +
  scale_fill_manual(values = pal_end) +
  theme_minimal() +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  ggtitle("How tracks ended") +
  ylab("Proportion of tracks") +
  theme(legend.title = element_blank())

# Display plot
plot_reason_end_shuffle
Code
# Save the "how tracks ended" plot as SVG. The plot object is passed
# explicitly so the export does not depend on whichever plot ggplot2 last
# created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_reason_end_shuffle", ".svg"),
    plot = plot_reason_end_shuffle,
    path = "output_files/"
  )
}
Listening peaks
Hours per day calendar
Code
# Streaming records from threshold_year onwards
date_range <- stream %>%
  filter(year(date) >= threshold_year)

# One row per calendar day, from 1 January of the first year to 31 December
# of the last year, so days without listening still appear in the calendar
all_days <- data.frame(
  day_date = seq(
    as.Date(floor_date(min(date_range$date), unit = "year")),
    as.Date(ceiling_date(max(date_range$date), unit = "year") - 1),
    by = "days"
  )
) %>%
  mutate(day_n = row_number())

# Weekday offset (Monday = 0 ... Sunday = 6) of the first day of each year
first_week_day <- all_days %>%
  mutate(
    weekday = as.POSIXlt(day_date)$wday,
    weekday = ifelse(weekday == 0, 7, weekday), # Sunday: 0 -> 7
    year = year(day_date)
  ) %>%
  group_by(year) %>%
  summarize(
    first_week_day = first(weekday) - 1,
    days_year = n()
  )

# Daily metrics, including days without any play
by_day_year <- date_range %>%
  mutate(day_date = lubridate::date(date)) %>%
  right_join(all_days, by = "day_date") %>%
  group_by(day_date, day_n) %>%
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60, # ms -> hours
    # Days without plays contribute one all-NA row from the join;
    # count only real plays so those days report 0 tracks
    n_tracks = sum(!is.na(ms_played)),
    # Share of plays on a computer. The Device categories are
    # "Mobile", "PC" and "TV", so the computer label is "PC".
    computer = mean(Device == "PC", na.rm = TRUE),
    shuffle = mean(shuffle, na.rm = TRUE),
    incognito_mode = mean(incognito_mode, na.rm = TRUE),
    offline = mean(offline, na.rm = TRUE),
    .groups = "drop_last"
  ) %>%
  # Add calendar columns
  mutate(
    h_played = ifelse(is.na(h_played), 0, h_played),
    year = year(day_date),
    month = month(day_date),
    week = week(day_date),
    day_year = yday(day_date),
    weekday = as.POSIXlt(day_date)$wday,
    weekday = factor(ifelse(weekday == 0, 7, weekday), levels = c(1:7))
  ) %>%
  left_join(first_week_day, by = "year") %>%
  # Row of the calendar grid (week index within the year)
  mutate(calendar_row = ceiling((first_week_day + day_year) / 7))

# Calendar heatmap: columns are years, rows are months, tiles are days
plot_day_year <- by_day_year %>%
  ggplot(aes(x = weekday, y = -calendar_row, fill = h_played)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  ggtitle("Hours listened per day") +
  theme(legend.title = element_blank()) +
  ylab("")

# Display plot
plot_day_year
Code
# Save the calendar heatmap as SVG. The plot object is passed explicitly so
# the export does not depend on whichever plot ggplot2 last created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_day_year", ".svg"),
    plot = plot_day_year,
    width = 7,
    height = 8,
    path = "output_files/"
  )
}
Top days
Code
# Streaming records from threshold_year onwards
date_range <- stream %>%
  filter(year(date) >= threshold_year)

# Number of times each track name was played on each day
by_top_track_day <- date_range %>%
  mutate(day_date = lubridate::date(date)) %>%
  group_by(track_name, day_date) %>%
  summarise(n_tracks = n(), .groups = "drop") %>%
  arrange(desc(n_tracks))

# Top 10 days by hours played (ties may return more than 10 rows)
by_top_day <- date_range %>%
  mutate(day_date = lubridate::date(date)) %>%
  group_by(day_date) %>%
  summarise(
    n_tracks_all = n(),
    # Share of plays on a computer. The Device categories are
    # "Mobile", "PC" and "TV", so the computer label is "PC".
    computer = mean(Device == "PC"),
    shuffle = mean(shuffle),
    h_played = sum(ms_played) / 1000 / 60 / 60 # ms -> hours
  ) %>%
  slice_max(h_played, n = 10)

# For each top day, compute the share of distinct tracks played more than
# once, then re-attach the daily totals
by_top_day <- by_top_day %>%
  left_join(by_top_track_day, by = "day_date") %>%
  group_by(day_date) %>%
  summarize(mult_times = mean(n_tracks > 1)) %>%
  left_join(by_top_day, by = "day_date") %>%
  arrange(desc(h_played)) %>%
  relocate(day_date, h_played, n_tracks = n_tracks_all)

# Display table
by_top_day %>%
  transmute(
    Date = day_date,
    Hours = round(h_played, 2),
    `N tracks` = n_tracks,
    `% tracks replayed` = paste0(round(mult_times, 4) * 100, "%")
  ) %>%
  kable(caption = "Top 10 days by listening hours")
Top 10 days by listening hours
Date
Hours
N tracks
% tracks replayed
2024-01-28
16.16
272
58.82%
2024-01-29
10.34
187
37.11%
2021-11-23
10.27
205
33.85%
2024-01-13
10.11
184
21.53%
2023-12-16
9.34
243
15.12%
2024-01-12
8.64
156
6.21%
2024-01-15
8.32
172
13.48%
2024-03-25
7.95
155
16.79%
2019-04-12
7.62
127
37.65%
2020-05-07
7.56
149
19.2%
Minutes listened by hour of the day
Code
# Every (day, hour) combination. lubridate::hour() returns 0-23, so the grid
# must use 0:23; with 1:24, plays at midnight (hour 0) are dropped by the join
# and hour 24 is always empty.
all_hours <- all_days %>%
  expand(day_date, hour = 0:23)

# Streaming records from threshold_year onwards
date_range <- stream %>%
  filter(year(date) >= threshold_year)

# Minutes played in each hour of each day (0 when nothing was played),
# then averaged over the days of each year
by_hour_year <- date_range %>%
  mutate(
    day_date = lubridate::date(date),
    hour = hour(date)
  ) %>%
  right_join(all_hours, by = c("day_date", "hour")) %>%
  mutate(ms_played = ifelse(is.na(ms_played), 0, ms_played)) %>%
  group_by(day_date, hour) %>%
  summarise(min_played = sum(ms_played / 1000 / 60), .groups = "drop") %>%
  group_by(year = year(day_date), hour) %>%
  summarise(min_played = mean(min_played), .groups = "drop_last")

# Heatmap of average minutes per hour of the day (rows) and year (columns)
plot_hour_year <- by_hour_year %>%
  ggplot(aes(x = year, y = hour, fill = min_played)) +
  geom_tile() +
  theme_void() +
  facet_grid(cols = vars(year), rows = vars(hour), scales = "free") +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Minutes listened per hour of the day (on average)") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_hour_year
Code
# Save the hour-of-day heatmap as SVG. The plot object is passed explicitly so
# the export does not depend on whichever plot ggplot2 last created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_hour_year", ".svg"),
    plot = plot_hour_year,
    width = 7,
    height = 5,
    path = "output_files/"
  )
}
What have I listened to in Spotify?
Top artists
Code
# Yearly totals used as denominators for the artist shares.
# NOTE: this replaces the earlier `by_year` (hours per year) with a table
# keyed by numeric year.
by_year <- stream %>%
  filter(year(date) >= threshold_year) %>%
  group_by(year = year(date)) %>%
  summarise(
    min_played_year = sum(ms_played) / 1000 / 60, # ms -> minutes
    n_tracks_year = n(),
    n_songs_year = n_distinct(track_name)
  )

# Statistics by artist and year
by_artist_year <- stream %>%
  filter(year(date) >= threshold_year) %>%
  group_by(artist_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    .groups = "drop"
  ) %>%
  # Split comma-separated artist strings into up to five artist columns ...
  separate_wider_delim(
    artist_name,
    ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  # ... and stack them so each individual artist has its own row
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  left_join(by_year, by = "year") %>%
  filter(!is.na(artist_name)) %>%
  group_by(artist_name, year) %>%
  summarise(
    min_played = sum(min_played),
    n_tracks = sum(n_tracks),
    n_songs = sum(n_songs),
    # The yearly totals are constant within a group; first() keeps one row
    # per artist and year. Dividing by the whole column would return one
    # duplicated row per original artist combination.
    p_tracks = n_tracks / first(n_tracks_year),
    p_min_played = min_played / first(min_played_year),
    p_songs = n_songs / first(n_songs_year),
    .groups = "drop_last"
  ) %>%
  arrange(year, desc(n_tracks))

# Display the three most played artists of each year
by_artist_year %>%
  group_by(Year = year) %>%
  slice_max(n_tracks, n = 3) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `% of the annual time` = paste0(round(p_min_played, 4) * 100, "%"),
    `% of the annual tracks` = paste0(round(p_tracks, 4) * 100, "%"),
    `% of the annual songs` = paste0(round(p_songs, 4) * 100, "%")
  ) %>%
  kable(caption = "Top 3 artists by year")
Top 3 artists by year
Year
Artist
Minutes played
N tracks
N songs
% of the annual time
% of the annual tracks
% of the annual songs
2018
Ayax y Prok
1239.58
453
44
6.79%
7.89%
1.42%
2018
Arctic Monkeys
590.24
187
60
3.23%
3.26%
1.93%
2018
Bejo
378.36
169
24
2.07%
2.94%
0.77%
2019
George Ezra
871.16
326
31
4.9%
5.36%
1.11%
2019
Ayax y Prok
623.96
237
43
3.51%
3.9%
1.54%
2019
Marea
782.26
216
42
4.4%
3.55%
1.5%
2020
Mac Quayle
1211.13
421
163
3.39%
3.85%
3%
2020
Bejo
917.55
374
39
2.57%
3.42%
0.72%
2020
Mac Miller
821.91
210
60
2.3%
1.92%
1.1%
2021
Ayax y Prok
1108.59
398
41
3.93%
4.21%
1.07%
2021
C. Tangana
887.91
383
39
3.15%
4.05%
1.02%
2021
Kase.O
695.49
253
33
2.47%
2.67%
0.86%
2022
Bad Bunny
841.25
281
30
3.41%
3.39%
0.84%
2022
Estopa
794.53
270
65
3.22%
3.26%
1.81%
2022
Ayax y Prok
581.29
238
45
2.36%
2.87%
1.26%
2023
Los gandules
574.14
431
124
2.54%
4.3%
2.58%
2023
Marea
997.79
336
49
4.41%
3.35%
1.02%
2023
Extremoduro
433.73
184
78
1.92%
1.84%
1.62%
2024
La Maravillosa Orquesta del Alcohol
1073.94
443
33
2.24%
2.56%
0.63%
2024
Estopa
1058.58
404
43
2.21%
2.34%
0.82%
2024
Arde Bogotá
825.37
283
26
1.72%
1.64%
0.5%
Code
# Artists behind the 40 largest artist-year play counts (across all years,
# not 40 per year), with their totals over those top rows
top_artist_year <- by_artist_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 40) %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played),
    n_songs_all = sum(n_songs)
  ) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all)

# Plays from threshold_year onwards and their total count, used as the
# denominator of each artist's overall share
stream_from_threshold <- stream %>%
  filter(year(date) >= threshold_year)
n_tracks_from_threshold <- nrow(stream_from_threshold)

# Statistics by artist over all retained years
by_artist_all <- stream_from_threshold %>%
  # Split comma-separated artist strings into separate rows
  separate_wider_delim(
    artist_name,
    ", ",
    names = paste0("artist_name", 1:5),
    too_few = "align_start",
    too_many = "drop"
  ) %>%
  pivot_longer(
    cols = starts_with("artist_name"),
    names_to = "artist_name_n",
    values_to = "artist_name"
  ) %>%
  filter(!is.na(artist_name)) %>%
  group_by(artist_name) %>%
  summarise(
    min_played_all = sum(ms_played / 1000 / 60),
    n_tracks_all = n(),
    n_songs_all = n_distinct(track_name),
    # Share of all plays; n() / sum(n()) inside a group is always 1
    p_tracks_all = n() / n_tracks_from_threshold
  ) %>%
  arrange(desc(n_tracks_all))

# Top 40 artists overall (ties may return more than 40 rows)
top_artist_all <- by_artist_all %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all)

# Display the first ten; slice_head() does not pad with NA rows when fewer
# than ten artists are available
top_artist_all %>%
  slice_head(n = 10) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played_all, 2),
    `N tracks` = n_tracks_all,
    `N songs` = n_songs_all
  ) %>%
  kable(caption = "Top 10 artists (all years)")
Top 10 artists (all years)
Artist
Minutes played
N tracks
N songs
Ayax y Prok
4355.56
1661
132
C. Tangana
2735.39
1145
71
Bejo
2553.25
1131
68
La Maravillosa Orquesta del Alcohol
2620.12
1018
118
Arctic Monkeys
2693.23
965
109
Estopa
2510.11
929
99
Los gandules
1329.98
879
155
Marea
2812.18
847
95
Extremoduro
2772.74
835
123
Kase.O
2306.25
807
79
Code
# Names of artists that are top either overall or in the by-year selection
top_artist_global <- unique(
  c(top_artist_year$artist_name, top_artist_all$artist_name)
)

# Yearly statistics of those artists, enriched with their all-years totals
by_top_artist <- by_artist_year %>%
  filter(artist_name %in% top_artist_global) %>%
  left_join(by_artist_all, by = "artist_name") %>%
  arrange(desc(n_tracks))
Global
Code
# Heatmap of plays per top artist (rows, ordered by all-years plays) and
# year (columns)
plot_top_artist_year <- ggplot(
  by_top_artist,
  aes(
    x = year,
    y = reorder(artist_name, -n_tracks_all),
    fill = n_tracks
  )
) +
  geom_tile() +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(artist_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_year
Code
# Save the top-artist heatmap as SVG. The plot object is passed explicitly so
# the export does not depend on whichever plot ggplot2 last created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_top_artist_year", ".svg"),
    plot = plot_top_artist_year,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}
Global + Top 10 by year
Code
# Artists that are among the ten most played in at least one year
top_artist_year_partial <- by_artist_year %>%
  group_by(year) %>%
  slice_max(n_tracks, n = 10) %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  select(artist_name) %>%
  distinct()

# Union of three lists: top 40 overall, the by-year selection and the yearly
# top tens. NOTE: this overwrites the earlier `top_artist_all` with a
# one-column table of artist names.
top_artist_all <- by_artist_year %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, n_tracks_all) %>%
  bind_rows(top_artist_year) %>%
  bind_rows(top_artist_year_partial) %>%
  select(artist_name) %>%
  distinct()
Code
# Yearly share of plays for every selected top artist
by_top_artist_matrix <- by_artist_year %>%
  right_join(top_artist_all, by = "artist_name") %>%
  select(artist_name, year, p_tracks) %>%
  arrange(year, desc(p_tracks))

# Wide artist x year table, used only to derive a stable row and column
# order for the heatmap (redundant, but it fixes the artist order)
matrix <- tidyr::pivot_wider(
  data = by_top_artist_matrix,
  id_cols = artist_name,
  names_from = year,
  values_from = p_tracks
)

# Ordering for the visualization
row_order <- matrix$artist_name
col_order <- names(matrix)[-1]

# Heatmap of the yearly share of plays per artist
plot_top_artist_matrix <- by_top_artist_matrix %>%
  mutate(
    n_row = row_number(),
    year = factor(year, levels = col_order),
    artist_name = factor(artist_name, levels = row_order)
  ) %>%
  ggplot(aes(x = year, y = artist_name, fill = p_tracks)) +
  geom_tile(position = "identity") +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(artist_name),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_matrix
# Total play time and play count for each track (threshold_year onwards)
top_track <- stream %>%
  filter(year(date) >= threshold_year) %>%
  group_by(track_name, artist_name) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60), # ms -> minutes
    n_tracks = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(n_tracks))

# Display the ten most played tracks. Rows without a track name are removed
# before taking the top ten, and transmute() is its own pipeline step:
# wrapped inside na.omit() it is never evaluated, so the table would keep
# the raw column names.
top_track %>%
  filter(!is.na(track_name)) %>%
  slice_head(n = 10) %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = `n_tracks`
  ) %>%
  kable(caption = "Top 10 tracks")
Top 10 tracks
track_name
artist_name
min_played
n_tracks
Mentecato
Bejo
311.1535
150
Tú Me Dejaste De Querer
C. Tangana
343.6406
132
Crîtto de lâ Nabahâ
Califato ¾
369.8610
120
Hablar, Hablar, Hablar…
Los Zigarros
297.4445
110
Fresas con nata
Ayax y Prok
334.4053
101
Watch Out
The Big Push
298.5197
99
No Gires
Varry Brava
195.7154
96
Como Camarón
Estopa
284.7042
92
Neverita
Bad Bunny
203.5799
92
Code
# Minutes and plays per track name and year (threshold_year onwards)
by_track_year <- stream %>%
  filter(year(date) >= threshold_year) %>%
  group_by(track_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n(),
    .groups = "drop_last"
  )

# Tracks behind the 30 largest track-year play counts (across all years,
# not 30 per year), with their totals over those top rows
top_track_year <- by_track_year %>%
  ungroup() %>%
  slice_max(n_tracks, n = 30) %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  ) %>%
  select(track_name, n_tracks_all, min_played_all)

# All-years totals for every track
track_totals <- by_track_year %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  )

# Top 30 tracks overall plus the tracks from the by-year selection.
# Each track appears exactly once, with its true all-years totals. Stacking
# the two lists instead would repeat a track with two different totals and
# duplicate its tiles after the join below.
top_track_names <- union(
  slice_max(track_totals, n_tracks_all, n = 30)$track_name,
  top_track_year$track_name
)
top_track_all <- track_totals %>%
  filter(!is.na(track_name), track_name %in% top_track_names) %>%
  select(track_name, n_tracks_all, min_played_all)

# Heatmap of plays per top track (rows, ordered by all-years plays) and
# year (columns)
plot_track_year <- by_track_year %>%
  right_join(top_track_all, by = "track_name") %>%
  arrange(n_tracks) %>%
  ggplot(aes(
    x = year,
    y = reorder(track_name, -n_tracks_all),
    fill = n_tracks
  )) +
  geom_tile() +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(track_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Times listened to most played tracks") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_track_year
Code
# Save the top-track heatmap as SVG. The plot object is passed explicitly so
# the export does not depend on whichever plot ggplot2 last created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_track_year", ".svg"),
    plot = plot_track_year,
    width = 7,
    height = 9,
    path = "output_files/"
  )
}
Manually selected tracks
Code
# Tracks most often started by explicit selection (Start == "Selected")
by_track_select <- stream %>%
  filter(year(date) >= threshold_year) %>%
  # Keep only manually selected plays that have a track name
  filter(Start == "Selected", !is.na(track_name)) %>%
  group_by(track_name, artist_name) %>%
  summarise(n_selected = n(), .groups = "drop") %>%
  # Attach each track's total plays and minutes
  left_join(top_track, by = c("track_name", "artist_name")) %>%
  # Share of the track's plays that were manual selections
  mutate(p_selected = n_selected / n_tracks) %>%
  # Top 20 most selected tracks (ties may return more than 20 rows)
  slice_max(n_selected, n = 20) %>%
  arrange(desc(n_selected))

# Display table
by_track_select %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `Times Selected` = n_selected,
    `% Selected` = paste0(round(p_selected, 4) * 100, "%"),
    `N tracks` = `n_tracks`
  ) %>%
  kable(caption = "Top selected tracks")
Top selected tracks
Track
Artist
Times Selected
% Selected
N tracks
Mentecato
Bejo
24
16%
150
Nuestra fosa
Marea
16
25.81%
62
Demasiadas Mujeres
C. Tangana
15
16.48%
91
Es miércoles
Spotify
14
56%
25
Moscow Mule
Bad Bunny
14
25.93%
54
Rizando el rizo
Ayax y Prok
13
22.41%
58
Tú Me Dejaste De Querer
C. Tangana
13
9.85%
132
Cada vez cadáver
Fito y Fitipaldis
12
18.18%
66
Café solo
Ayax y Prok
12
17.91%
67
Es lunes
Spotify
12
41.38%
29
Fresas con nata
Ayax y Prok
12
11.88%
101
Think About Things
Daði Freyr
12
30%
40
When You Were Young
The Killers
12
18.46%
65
Buena muerte
Marea
11
25%
44
Dentro De La Ley
Los Zigarros
11
22%
50
Dispárame
Los Zigarros
11
28.21%
39
Es viernes
Spotify
11
42.31%
26
Mañana Es Lunes
Rubén Pozo
11
28.95%
38
Renacimiento
Kase.O
11
12.79%
86
Teddy Picker
Arctic Monkeys
11
15.94%
69
What’s the Use?
Mac Miller
11
25.58%
43
Workingman’s Blues #2
Bob Dylan
11
55%
20
In loop
Times played in one day
Code
# Display the ten track-days with the most plays. Rows without a track name
# are removed before taking the top ten, and transmute() is its own pipeline
# step: wrapped inside na.omit() it is never evaluated, so the table would
# keep the raw column names.
by_top_track_day %>%
  filter(!is.na(track_name)) %>%
  slice_head(n = 10) %>%
  transmute(
    Date = day_date,
    Track = track_name,
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top 10 songs by times played in one day")
Top 10 songs by times played in one day
track_name
day_date
n_tracks
La Danza
2024-08-09
45
Como Camarón
2024-01-28
16
Watch Out
2024-01-28
16
Crystal Fighters
2024-01-28
15
Crîtto de lâ Nabahâ
2024-01-28
15
Decidí - Versión 2004
2024-01-28
15
Héroes del Sábado
2024-01-28
15
Code
# Tracks played more than 5 times within a single day, with the number of
# days on which that happened; only named tracks with more than 2 such days
# are kept
by_top_track_loop <- by_top_track_day %>%
  filter(n_tracks > 5) %>%
  group_by(track_name) %>%
  summarise(n_days = n(), .groups = "drop") %>%
  arrange(desc(n_days)) %>%
  filter(n_days > 2, !is.na(track_name))

# Display table
by_top_track_loop %>%
  transmute(
    `Track` = track_name,
    `Days in loop (times played>5)` = n_days
  ) %>%
  kable(caption = "Tracks played more than 5 times several days")
Tracks played more than 5 times several days
Track
Days in loop (times played>5)
Como Camarón
3
Ojalá
3
Playlists and Library tracks
Set up
Code
# Locate the JSON playlist files in the input folder
playlist_files <- list.files("input_files/", pattern = "Playlist")
if (length(playlist_files) == 0) {
  stop("No 'Playlist' files found in input_files/", call. = FALSE)
}

# Read and parse playlist data. NOTE(review): only the FIRST playlist file is
# read; any further playlist files are ignored - confirm that this is intended.
playlist <- fromJSON(
  paste0("input_files/", playlist_files[1]),
  flatten = TRUE
)[[1]]

# Extract the tracks of every playlist, tagged with the playlist name.
# The pieces are built first and bound once, instead of growing a data frame
# inside a loop; seq_len() also copes with zero playlists.
playlist_tracks <- bind_rows(
  lapply(seq_len(nrow(playlist)), function(i) {
    tracks_i <- playlist$items[[i]]
    # A playlist without items has nothing to tag; skip it
    if (NROW(tracks_i) == 0) {
      return(NULL)
    }
    tracks_i$playlist_name <- playlist$name[i]
    tracks_i
  })
)

# Read library data from the JSON file
library_json <- fromJSON("input_files/YourLibrary.json", flatten = TRUE)

# The albums element is looked up with partial matching made explicit
# (equivalent to `$album`).
# NOTE(review): presumably the actual key is "albums" - confirm against the file.
library_albums_raw <- library_json[["album", exact = FALSE]]

library_tracks <- library_json$tracks %>%
  mutate(
    artist_name = artist,
    track_name = track,
    album_name = album
  )
library_albums <- library_albums_raw %>%
  mutate(
    artist_name = artist,
    album_name = album
  )

# Saved tracks and saved albums merged into one table. The object is named
# `library`, like the base function; calls such as library(pkg) still resolve
# to the function.
library <- merge(library_json$tracks, library_albums_raw, all = TRUE)
User playlists
Code
# Normalise playlist track columns to the names used for the streaming data
playlist_tracks <- playlist_tracks %>%
  mutate(
    added_date = as_datetime(addedDate, tz = "UTC"),
    artist_name = track.artistName,
    track_name = track.trackName,
    album_name = track.albumName
  )

# Number of playlist entries for each track, most frequent first
playlist_top_tracks <- playlist_tracks %>%
  group_by(track_name, artist_name) %>%
  summarise(n_playlists = n(), .groups = "drop") %>%
  arrange(desc(n_playlists))

# Display the first ten rows
playlist_top_tracks[1:10, ] %>%
  transmute(
    `Track` = track_name,
    `Artist` = artist_name,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Tracks included in more playlists")
Tracks included in more playlists
Track
Artist
N playlists
Mentecato
BEJO
15
Hablar, Hablar, Hablar…
Los Zigarros
14
Fluorescent Adolescent
Arctic Monkeys
11
Querría
El Kanka
11
Renacimiento
Kase.O
11
1932
La Maravillosa Orquesta del Alcohol
10
Brianstorm
Arctic Monkeys
10
Como Camarón
Estopa
10
Héroes del Sábado
La Maravillosa Orquesta del Alcohol
10
Madrid
Karavana
10
Code
# Per artist: playlist entries, distinct songs and distinct playlists,
# ordered by the number of playlists
playlist_top_artist <- playlist_tracks %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    n_playlists = n_distinct(playlist_name)
  ) %>%
  arrange(desc(n_playlists))

# Display the first ten rows
playlist_top_artist[1:10, ] %>%
  transmute(
    `Artist` = artist_name,
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Artists included in more playlists")
Artists included in more playlists
Artist
N tracks
N songs
N playlists
Los Zigarros
117
39
21
Arctic Monkeys
179
68
19
Kase.O
130
44
19
BEJO
182
44
18
La Maravillosa Orquesta del Alcohol
210
96
18
El Kanka
74
23
17
Fito y Fitipaldis
180
64
17
Extremoduro
237
124
16
Leiva
123
54
16
Ayax y Prok
175
72
15
User library
Code
# Distinct saved songs per artist in the library, most songs first
library_tracks_top_artist <- library_tracks %>%
  group_by(artist_name) %>%
  summarise(n_songs = n_distinct(track_name)) %>%
  arrange(desc(n_songs))

# Display the first ten rows
library_tracks_top_artist[1:10, ] %>%
  transmute(
    `Artist` = artist_name,
    `N songs` = n_songs
  ) %>%
  kable(caption = "Artists with more songs in library")
Artists with more songs in library
Artist
N songs
Amor Líquido
1
Doja Cat
1
El Niño de la Hipoteca
1
Fools Garden
1
Gabry Ponte
1
Gala
1
Hinds
1
Jain
1
Juan Antonio Canta
1
Lucio Battisti
1
Code
# Distinct saved albums per artist in the library, most albums first
library_albums_top_artist <- library_albums %>%
  group_by(artist_name) %>%
  summarise(n_albums = n_distinct(album_name)) %>%
  arrange(desc(n_albums))

# Display the first ten rows
library_albums_top_artist[1:10, ] %>%
  transmute(
    `Artist` = artist_name,
    `N albums` = n_albums
  ) %>%
  kable(caption = "Artists with more albums in library")
Artists with more albums in library
Artist
N albums
La Maravillosa Orquesta del Alcohol
8
Extremoduro
7
Arctic Monkeys
6
Marea
6
Vetusta Morla
6
Los Zigarros
5
Supersubmarina
5
Various Artists
5
Biznaga
4
Kase.O
4
Saved in playlist or library
Code
# Track names saved in any playlist or in the library
saved_tracks <- unique(
  c(playlist_top_tracks$track_name, library_tracks$track_name)
)

# Artist names present in playlists, saved songs or saved albums
saved_artist <- unique(
  c(
    playlist_top_artist$artist_name,
    library_tracks_top_artist$artist_name,
    library_albums_top_artist$artist_name
  )
)

# Yearly share of streamed plays whose track name is among the saved tracks
plot_saved_track <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    `In my playlists` = ifelse(track_name %in% saved_tracks, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  ggtitle("Were the tracks (streaming history) in the current playlists?") +
  ylab("")

# Display plot
plot_saved_track
Code
# Save the saved-tracks plot as SVG. The plot object is passed explicitly so
# the export does not depend on whichever plot ggplot2 last created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_saved_track", ".svg"),
    plot = plot_saved_track,
    path = "output_files/"
  )
}
Code
# Yearly share of streamed plays whose artist is among the saved artists
plot_saved_artist <- stream %>%
  filter(year(date) >= threshold_year) %>%
  mutate(
    `In my playlists` = ifelse(artist_name %in% saved_artist, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  theme_minimal() +
  ggtitle("Were the artists (streaming history) in the current playlists?") +
  ylab("")

# Display plot
plot_saved_artist
Code
# Save the saved-artists plot as SVG. The plot object is passed explicitly so
# the export does not depend on whichever plot ggplot2 last created or modified.
if (save_svg) {
  ggsave(
    paste0("plot_saved_artist", ".svg"),
    plot = plot_saved_artist,
    path = "output_files/"
  )
}
Source Code
---
title: "Spotify Stream History Analysis"
author: "Javier Artiga (forked from Natalia Ciria)"
date: last-modified
format:
  html:
    embed-resources: true
    code-tools: true
    code-fold: true
    code-block-border-left: true
execute:
  warning: false
  message: false
---

## Set-up

```{r}
# Set-up parameters
threshold_year <- 2018 # Exclude older years from the analysis
save_csv <- TRUE # Export stream data frame as csv
save_svg <- TRUE # Export ggplots as svg files
input_dir <- "input_files" # Folder holding the Spotify JSON export
output_dir <- "output_files" # Folder receiving csv/svg exports

# Required libraries
library(jsonlite) # Working with JSON data
library(dplyr) # Data transformation
library(tidyr) # Data cleaning
library(lubridate) # Handling dates and times
library(ggplot2) # Plot graphs
library(knitr) # Report formatting
if (save_svg) library(svglite) # Create SVG files

# Colour palette
pal <- c(
  "#3abdaa", "#7b2458", "#facd00", "#41658a",
  "#e63946", "#b2a3b5", "#264653"
)

# Make sure the export folder exists before anything is written to it
if (save_csv || save_svg) {
  dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
}

# Format a proportion (0-1) as a percentage string for tables
fmt_pct <- function(x) {
  paste0(round(100 * x, 2), "%")
}

# Label a 0-1 axis as percentages
pct_labels <- function(x) {
  paste0(x * 100, "%")
}

# Save `plot` as "<name>.svg" in `output_dir` when `save_svg` is TRUE.
# The plot is passed explicitly so the result does not depend on which
# plot happened to be displayed last. Extra arguments go to ggsave().
save_plot <- function(plot, name, ...) {
  if (save_svg) {
    ggsave(paste0(name, ".svg"), plot = plot, path = output_dir, ...)
  }
  invisible(plot)
}
```

## Data preparation

```{r}
# Find json streaming history files
stream_files <- list.files(
  input_dir,
  pattern = "Streaming_History_Audio",
  full.names = TRUE
)
if (length(stream_files) == 0) {
  stop(
    "No 'Streaming_History_Audio' files found in '", input_dir, "'.",
    call. = FALSE
  )
}

# Read every streaming history file and merge them into one data frame.
# Reduce() also works when there is a single file (no merge is performed).
stream_parts <- lapply(stream_files, fromJSON, flatten = TRUE)
stream <- Reduce(function(x, y) merge(x, y, all = TRUE), stream_parts)
```

```{r}
# Process Spotify streaming data
stream <- stream %>%
  mutate(
    # Convert timestamp and rename metadata columns
    date = as_datetime(ts, tz = "UTC"),
    artist_name = master_metadata_album_artist_name,
    track_name = master_metadata_track_name,
    album_name = master_metadata_album_album_name,
    # Categorize device types (platforms matching no pattern stay NA)
    Device = case_when(
      grepl("Android-tablet|Android OS|android|iOS|ios", platform) ~ "Mobile",
      grepl(
        "public_js|web_player|WebPlayer|chrome|Windows|windows|Linux|linux",
        platform
      ) ~ "PC",
      grepl("cast|tv", platform) ~ "TV"
    ),
    # Categorize track end reasons
    End = case_when(
      reason_end %in% c("trackdone", "endplay") ~ "Track finished",
      reason_end == "logout" ~ "Spotify closed",
      reason_end == "playbtn" ~ "Play button",
      reason_end == "fwdbtn" ~ "Forward button",
      reason_end == "backbtn" ~ "Backward button",
      .default = "Other"
    ),
    # Categorize track start reasons
    Start = case_when(
      reason_start %in% c("clickrow", "click-row") ~ "Selected",
      reason_start == "trackdone" ~ "Track finished",
      reason_start == "persisted" ~ "Persisted",
      reason_start == "playbtn" ~ "Play button",
      reason_start == "fwdbtn" ~ "Forward button",
      reason_start == "backbtn" ~ "Backward button",
      .default = "Other"
    )
  )

# Save processed data if save_csv is TRUE
if (save_csv) write.csv(stream, file.path(output_dir, "stream.csv"))

# Streams from the threshold year onwards; used by every analysis below
stream_recent <- stream %>%
  filter(year(date) >= threshold_year)
```

## Annual Spotify hours

### Total

```{r}
# Calculate total listening hours by year
by_year <- stream_recent %>%
  group_by(year = as.factor(year(date))) %>%
  summarise(
    h_played = sum(ms_played) / 1000 / 60 / 60,
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    n_artists = n_distinct(artist_name)
  )

# Display table
by_year %>%
  transmute(
    Year = year,
    `Hours played` = round(h_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N artists` = n_artists
  ) %>%
  kable(caption = "Spotify listened by year")
```

### By device

```{r}
# Calculate total listening hours by year and device
by_device <- stream_recent %>%
  group_by(Device, year = as.factor(year(date))) %>%
  summarise(h_played = sum(ms_played) / 1000 / 60 / 60, .groups = "drop")

# Create stacked bar chart of listening hours by device and year
plot_device <- by_device %>%
  ggplot(aes(x = year, y = h_played, fill = Device)) +
  geom_col(position = position_stack(reverse = TRUE)) + # Reverse stack order
  scale_fill_manual(values = pal, na.value = "grey80") + # Custom palette
  theme_minimal() +
  ggtitle("Hours listening to Spotify") +
  ylab("")

# Display plot
plot_device

# Save plot
save_plot(plot_device, "plot_device")
```

## Shuffle mode

```{r}
# Calculate tracks played in shuffle mode by year.
# Groups are kept on `year` so the proportion is computed within each year.
by_shuffle <- stream_recent %>%
  group_by(year = as.factor(year(date)), shuffle) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    `Proportion of tracks` = n / sum(n),
    Shuffle = ifelse(shuffle, "Yes", "No")
  ) %>%
  ungroup()

# Create stacked bar chart of tracks played in shuffle mode by year
plot_shuffle <- by_shuffle %>%
  ggplot(aes(x = year, y = `Proportion of tracks`, fill = Shuffle)) +
  geom_col() +
  scale_fill_manual(values = pal, na.value = "grey80") +
  theme_minimal() +
  scale_y_continuous(labels = pct_labels) + # Convert y-axis to percentages
  ggtitle("Proportion of tracks listened in shuffle mode") +
  ylab("")

# Display plot
plot_shuffle

# Save plot
save_plot(plot_shuffle, "plot_shuffle")
```

## Track start and end

```{r}
# Create consistent palettes for track start and end visualization, so a
# reason present in both plots gets the same colour in each.

# Sort unique end and start values
names_end <- sort(unique(stream$End), decreasing = TRUE)
names_start <- sort(unique(stream$Start), decreasing = TRUE)

# Create palette for end values
pal_end <- setNames(pal[seq_along(names_end)], names_end)

# Colours of the reasons shared by start and end
pal_start_end <- pal_end[names_end %in% names_start]

# Give start-only reasons the colours not already used by shared reasons
names_start_only <- setdiff(names_start, names_end)
pal_unused <- setdiff(pal, pal_start_end)
pal_start_only <- setNames(
  pal_unused[seq_along(names_start_only)],
  names_start_only
)

# Combine palettes
pal_start <- c(pal_start_end, pal_start_only)
```

```{r}
# Count how tracks started, by year and shuffle mode
by_reason_start_shuffle <- stream_recent %>%
  mutate(
    Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")
  ) %>%
  group_by(year = as.factor(year(date)), Start, Shuffle) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))

# Create a stacked bar chart showing how tracks started
# (shuffled vs non-shuffled)
plot_reason_start_shuffle <- by_reason_start_shuffle %>%
  ggplot(aes(x = year, y = n, fill = Start)) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) + # Split by shuffle mode
  scale_fill_manual(values = pal_start) +
  theme_minimal() +
  scale_y_continuous(labels = pct_labels) +
  ggtitle("How tracks started") +
  ylab("Proportion of tracks") +
  theme(legend.title = element_blank())

# Display plot
plot_reason_start_shuffle

# Save plot
save_plot(plot_reason_start_shuffle, "plot_reason_start_shuffle")
```

```{r}
# Count how tracks ended, by year and shuffle mode
by_reason_end_shuffle <- stream_recent %>%
  mutate(
    Shuffle = ifelse(shuffle, "In shuffle mode", "Not in shuffle mode")
  ) %>%
  group_by(year = as.factor(year(date)), End, Shuffle) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))

# Create a stacked bar chart showing how tracks ended
# (shuffled vs non-shuffled)
plot_reason_end_shuffle <- by_reason_end_shuffle %>%
  ggplot(aes(x = year, y = n, fill = End)) +
  geom_col(position = "fill") +
  facet_grid(cols = vars(Shuffle)) + # Split by shuffle mode
  scale_fill_manual(values = pal_end) +
  theme_minimal() +
  scale_y_continuous(labels = pct_labels) +
  ggtitle("How tracks ended") +
  ylab("Proportion of tracks") +
  theme(legend.title = element_blank())

# Display plot
plot_reason_end_shuffle

# Save plot
save_plot(plot_reason_end_shuffle, "plot_reason_end_shuffle")
```

## Listening peaks

### Hours per day calendar

```{r}
#| fig.height: 7.6
# Create a data frame with every day from 1 January of the first year to
# 31 December of the last year in the analysed range
all_days <- data.frame(
  day_date = seq(
    as.Date(floor_date(min(stream_recent$date), unit = "year")),
    as.Date(ceiling_date(max(stream_recent$date), unit = "year") - 1),
    by = "days"
  )
) %>%
  mutate(day_n = row_number())

# Calculate first weekday of each year (Monday = 1 ... Sunday = 7)
first_week_day <- all_days %>%
  mutate(
    weekday = wday(day_date, week_start = 1),
    year = year(day_date)
  ) %>%
  group_by(year) %>%
  summarize(
    first_week_day = first(weekday) - 1,
    days_year = n()
  )

# Process streaming data by day; the right join keeps days without streams
by_day_year <- stream_recent %>%
  mutate(day_date = as_date(date)) %>%
  right_join(all_days, by = "day_date") %>%
  group_by(day_date, day_n) %>%
  # Calculate daily metrics
  summarise(
    # Convert ms to hours; days without streams sum to 0
    h_played = sum(ms_played, na.rm = TRUE) / 1000 / 60 / 60,
    # Count real streams only, not the placeholder row of an empty day
    n_tracks = sum(!is.na(ms_played)),
    computer = mean(Device == "PC", na.rm = TRUE),
    shuffle = mean(shuffle, na.rm = TRUE),
    incognito_mode = mean(incognito_mode, na.rm = TRUE),
    offline = mean(offline, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Add calendar columns
  mutate(
    year = year(day_date),
    month = month(day_date),
    week = week(day_date),
    day_year = yday(day_date),
    weekday = factor(wday(day_date, week_start = 1), levels = 1:7)
  ) %>%
  left_join(first_week_day, by = "year") %>%
  # Row of the day in a Monday-first calendar grid of its year
  mutate(calendar_row = ceiling((first_week_day + day_year) / 7))

# Create calendar heatmap
plot_day_year <- by_day_year %>%
  ggplot(aes(x = weekday, y = -calendar_row, fill = h_played)) +
  geom_tile() +
  facet_grid(cols = vars(year), rows = vars(month), scales = "free") +
  theme_void() +
  scale_fill_gradientn(
    colors = c("grey95", pal[c(1, 3, 2)]),
    na.value = "grey95"
  ) +
  ggtitle("Hours listened per day") +
  theme(legend.title = element_blank()) +
  ylab("")

# Display plot
plot_day_year

# Save plot if save_svg is TRUE
save_plot(plot_day_year, "plot_day_year", width = 7, height = 8)
```

### Top days

```{r}
# Number of times each track was listened to in a day
by_top_track_day <- stream_recent %>%
  mutate(day_date = as_date(date)) %>%
  group_by(track_name, day_date) %>%
  summarise(n_tracks = n(), .groups = "drop") %>%
  arrange(desc(n_tracks))

# Get top 10 days by hours played
top_days <- stream_recent %>%
  mutate(day_date = as_date(date)) %>%
  group_by(day_date) %>%
  summarise(
    n_tracks_all = n(),
    computer = mean(Device == "PC", na.rm = TRUE),
    shuffle = mean(shuffle),
    h_played = sum(ms_played) / 1000 / 60 / 60
  ) %>%
  slice_max(h_played, n = 10)

# Add the share of that day's distinct tracks played more than once
by_top_day <- top_days %>%
  left_join(by_top_track_day, by = "day_date") %>%
  group_by(day_date) %>%
  summarize(mult_times = mean(n_tracks > 1)) %>%
  left_join(top_days, by = "day_date") %>%
  arrange(desc(h_played)) %>%
  relocate(day_date, h_played, n_tracks = n_tracks_all)

# Display table
by_top_day %>%
  transmute(
    Date = day_date,
    Hours = round(h_played, 2),
    `N tracks` = n_tracks,
    `% tracks replayed` = fmt_pct(mult_times)
  ) %>%
  kable(caption = "Top 10 days by listening hours")
```

### Minutes listened by hour of the day

```{r}
# Create a data frame with all possible hours for each day.
# lubridate::hour() returns 0-23, so the grid must use the same range.
all_hours <- all_days %>%
  expand(day_date, hour = 0:23)

# Calculate average listening time by hour and year
by_hour_year <- stream_recent %>%
  mutate(
    day_date = as_date(date),
    hour = hour(date)
  ) %>%
  right_join(all_hours, by = c("day_date", "hour")) %>%
  group_by(day_date, hour) %>%
  # Hours without any stream contribute 0 minutes to the average
  summarise(
    min_played = sum(ms_played, na.rm = TRUE) / 1000 / 60,
    .groups = "drop"
  ) %>%
  group_by(year = year(day_date), hour) %>%
  summarise(min_played = mean(min_played), .groups = "drop")

# Create visualization of average listening time by hour and year
plot_hour_year <- by_hour_year %>%
  ggplot(aes(x = year, y = hour, fill = min_played)) +
  geom_tile() +
  theme_void() +
  facet_grid(cols = vars(year), rows = vars(hour), scales = "free") +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Minutes listened per hour of the day (on average)") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_hour_year

# Save plot
save_plot(plot_hour_year, "plot_hour_year", width = 7, height = 5)
```

## What have I listened to in Spotify?

### Top artists

```{r}
# Split a comma-separated `artist_name` into one row per artist (at most
# five artists are kept). Rows padded with NA artists are left in place
# for the caller to filter.
split_artists <- function(data) {
  data %>%
    ungroup() %>%
    separate_wider_delim(
      cols = artist_name,
      delim = ", ",
      names = paste0("artist_name", 1:5),
      too_few = "align_start",
      too_many = "drop"
    ) %>%
    pivot_longer(
      cols = starts_with("artist_name"),
      names_to = "artist_name_n",
      values_to = "artist_name"
    )
}

# Calculate yearly totals (numeric year; replaces the earlier `by_year`)
by_year <- stream_recent %>%
  group_by(year = year(date)) %>%
  summarise(
    min_played_year = sum(ms_played) / 1000 / 60,
    n_tracks_year = n(),
    n_songs_year = n_distinct(track_name)
  )

# Calculate statistics by artist and year
by_artist_year <- stream_recent %>%
  group_by(artist_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    .groups = "drop"
  ) %>%
  # Split multiple artists into separate rows
  split_artists() %>%
  filter(!is.na(artist_name)) %>%
  left_join(by_year, by = "year") %>%
  group_by(artist_name, year) %>%
  summarise(
    min_played = sum(min_played),
    n_tracks = sum(n_tracks),
    n_songs = sum(n_songs),
    # Yearly totals are constant within a group: take the first value so
    # each artist-year yields exactly one row
    p_tracks = n_tracks / first(n_tracks_year),
    p_min_played = min_played / first(min_played_year),
    p_songs = n_songs / first(n_songs_year),
    .groups = "drop"
  ) %>%
  arrange(year, desc(n_tracks))

# Display table
by_artist_year %>%
  group_by(Year = year) %>%
  slice_max(n_tracks, n = 3) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `% of the annual time` = fmt_pct(p_min_played),
    `% of the annual tracks` = fmt_pct(p_tracks),
    `% of the annual songs` = fmt_pct(p_songs)
  ) %>%
  kable(caption = "Top 3 artists by year")

# Artists behind the 40 largest artist-year play counts
top_artist_year <- by_artist_year %>%
  slice_max(n_tracks, n = 40) %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played),
    n_songs_all = sum(n_songs)
  ) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all)

# Calculate statistics by artist (all analysed years together)
by_artist_all <- stream_recent %>%
  # Split multiple artists into separate rows
  split_artists() %>%
  filter(!is.na(artist_name)) %>%
  group_by(artist_name) %>%
  summarise(
    min_played_all = sum(ms_played / 1000 / 60),
    n_tracks_all = n(),
    n_songs_all = n_distinct(track_name)
  ) %>%
  # Share of all artist-credited plays
  mutate(p_tracks_all = n_tracks_all / sum(n_tracks_all)) %>%
  arrange(desc(n_tracks_all))

# Get top 40 artists overall
top_artist_all <- by_artist_all %>%
  slice_max(n_tracks_all, n = 40) %>%
  select(artist_name, min_played_all, n_tracks_all, n_songs_all)

# Display table
top_artist_all %>%
  slice_head(n = 10) %>%
  transmute(
    Artist = artist_name,
    `Minutes played` = round(min_played_all, 2),
    `N tracks` = n_tracks_all,
    `N songs` = n_songs_all
  ) %>%
  kable(caption = "Top 10 artists (all years)")

# Get top 40 artists overall or by year
top_artist_global <- unique(
  c(top_artist_year$artist_name, top_artist_all$artist_name)
)

# Join top artists data and filter top artists
by_top_artist <- by_artist_year %>%
  filter(artist_name %in% top_artist_global) %>%
  left_join(by_artist_all, by = "artist_name") %>%
  arrange(desc(n_tracks))
```

#### Global

```{r}
#| fig.height: 9
# Create a heatmap (not filtering top_artist)
plot_top_artist_year <- by_top_artist %>%
  ggplot(aes(
    x = year,
    y = reorder(artist_name, -n_tracks_all),
    fill = n_tracks
  )) +
  geom_tile() +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(artist_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_year

# Save plot
save_plot(plot_top_artist_year, "plot_top_artist_year", width = 7, height = 9)
```

#### Global + Top 10 by year

```{r}
# Get top 10 artists for each year
top_artist_year_partial <- by_artist_year %>%
  group_by(year) %>%
  slice_max(n_tracks, n = 10) %>%
  ungroup() %>%
  distinct(artist_name)

# Combine different top artist lists (names only)
top_artist_all <- by_artist_year %>%
  group_by(artist_name) %>%
  summarise(n_tracks_all = sum(n_tracks)) %>%
  slice_max(n_tracks_all, n = 40) %>%
  bind_rows(top_artist_year, top_artist_year_partial) %>%
  distinct(artist_name)
```

```{r}
#| fig.height: 15
# Prepare data for matrix visualization
by_top_artist_matrix <- by_artist_year %>%
  right_join(top_artist_all, by = "artist_name") %>%
  select(artist_name, year, p_tracks) %>%
  arrange(year, desc(p_tracks))

# Set ordering for visualization: artists and years in order of first
# appearance in the sorted data
row_order <- unique(by_top_artist_matrix$artist_name)
col_order <- unique(as.character(by_top_artist_matrix$year))

# Create heatmap visualization
plot_top_artist_matrix <- by_top_artist_matrix %>%
  mutate(
    year = factor(year, levels = col_order),
    artist_name = factor(artist_name, levels = row_order)
  ) %>%
  ggplot(aes(x = year, y = artist_name, fill = p_tracks)) +
  geom_tile(position = "identity") +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(artist_name),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Times listened to most played artists") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_top_artist_matrix

# Save plot
save_plot(
  plot_top_artist_matrix,
  "plot_top_artist_matrix",
  width = 8,
  height = 15
)
```

### Top tracks

#### Most listened

```{r}
# Calculate total play time and count for each track
top_track <- stream_recent %>%
  group_by(track_name, artist_name) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60), # Convert ms to minutes
    n_tracks = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(n_tracks))

# Display table (rows without track/artist are dropped before taking 10)
top_track %>%
  drop_na() %>%
  slice_head(n = 10) %>%
  transmute(
    Track = track_name,
    Artist = artist_name,
    `Minutes played` = round(min_played, 2),
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top 10 tracks")
```

```{r}
#| fig.height: 10
# Calculate statistics by track and year
by_track_year <- stream_recent %>%
  group_by(track_name, year = year(date)) %>%
  summarise(
    min_played = sum(ms_played / 1000 / 60),
    n_tracks = n(),
    .groups = "drop"
  )

# Tracks behind the 30 largest track-year play counts
top_track_year <- by_track_year %>%
  slice_max(n_tracks, n = 30) %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  )

# Get top 30 tracks overall, plus the yearly top tracks.
# Overall rows come first, so distinct() keeps the overall totals for
# tracks present in both lists.
top_track_all <- by_track_year %>%
  group_by(track_name) %>%
  summarise(
    n_tracks_all = sum(n_tracks),
    min_played_all = sum(min_played)
  ) %>%
  slice_max(n_tracks_all, n = 30) %>%
  bind_rows(top_track_year) %>%
  drop_na() %>%
  distinct(track_name, .keep_all = TRUE)

# Create heatmap visualization
plot_track_year <- by_track_year %>%
  right_join(top_track_all, by = "track_name") %>%
  arrange(n_tracks) %>%
  ggplot(aes(
    x = year,
    y = reorder(track_name, -n_tracks_all),
    fill = n_tracks
  )) +
  geom_tile() +
  theme_void() +
  facet_grid(
    cols = vars(year),
    rows = vars(reorder(track_name, -n_tracks_all)),
    scales = "free"
  ) +
  scale_fill_gradientn(colors = c("grey95", pal[c(1, 3, 2)])) +
  ggtitle("Times listened to most played tracks") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Display plot
plot_track_year

# Save plot
save_plot(plot_track_year, "plot_track_year", width = 7, height = 9)
```

#### Manually selected tracks

```{r}
# Filter and summarize selected tracks
by_track_select <- stream_recent %>%
  # Get only selected tracks that have a name
  filter(Start == "Selected", !is.na(track_name)) %>%
  group_by(track_name, artist_name) %>%
  summarise(n_selected = n(), .groups = "drop") %>%
  # Join with top tracks data
  left_join(top_track, by = c("track_name", "artist_name")) %>%
  # Calculate percentage of times track was selected
  mutate(p_selected = n_selected / n_tracks) %>%
  # Get top 20 most selected tracks
  slice_max(n_selected, n = 20) %>%
  arrange(desc(n_selected))

# Display table
by_track_select %>%
  transmute(
    Track = track_name,
    Artist = artist_name,
    `Times Selected` = n_selected,
    `% Selected` = fmt_pct(p_selected),
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top selected tracks")
```

#### In loop

Times played in one day

```{r}
# Display table
by_top_track_day %>%
  drop_na() %>%
  slice_head(n = 10) %>%
  transmute(
    Date = day_date,
    Track = track_name,
    `N tracks` = n_tracks
  ) %>%
  kable(caption = "Top 10 songs by times played in one day")
```

```{r}
# Find tracks that were played more than 5 times in a day
# and count how many days this happened
by_top_track_loop <- by_top_track_day %>%
  filter(n_tracks > 5, !is.na(track_name)) %>%
  count(track_name, name = "n_days") %>%
  filter(n_days > 2) %>%
  arrange(desc(n_days))

# Display table
by_top_track_loop %>%
  transmute(
    Track = track_name,
    `Days in loop (times played>5)` = n_days
  ) %>%
  kable(caption = "Tracks played more than 5 times several days")
```

## Playlists and Library tracks

### Set up

```{r}
# Find json playlist files
playlist_files <- list.files(input_dir, pattern = "Playlist", full.names = TRUE)

# Read and parse playlist data from the first JSON file
# NOTE(review): only the first playlist file is read; confirm whether the
# export can contain several.
playlist <- fromJSON(playlist_files[1], flatten = TRUE)[[1]]

# Extract the tracks of each playlist, tagged with the playlist name.
# Playlists without a table of items are skipped.
playlist_tracks <- lapply(seq_len(nrow(playlist)), function(i) {
  items <- playlist$items[[i]]
  if (!is.data.frame(items) || nrow(items) == 0) {
    return(NULL)
  }
  items$playlist_name <- playlist$name[i]
  items
}) %>%
  bind_rows()

# Read library data from JSON file
library_raw <- fromJSON(file.path(input_dir, "YourLibrary.json"), flatten = TRUE)

library_tracks <- library_raw$tracks %>%
  mutate(
    artist_name = artist,
    track_name = track,
    album_name = album
  )

# NOTE(review): `$album` may rely on partial matching of the element name
# (e.g. `albums`) - confirm against the JSON file.
library_albums <- library_raw$album %>%
  mutate(
    artist_name = artist,
    album_name = album
  )

library <- merge(library_raw$tracks, library_raw$album, all = TRUE)
```

### User playlists

```{r}
# Process playlist tracks data
playlist_tracks <- playlist_tracks %>%
  mutate(
    added_date = as_datetime(addedDate, tz = "UTC"),
    artist_name = track.artistName,
    track_name = track.trackName,
    album_name = track.albumName
  )

# Get most common tracks in playlists
playlist_top_tracks <- playlist_tracks %>%
  group_by(track_name, artist_name) %>%
  summarise(n_playlists = n(), .groups = "drop") %>%
  arrange(desc(n_playlists))

# Display table
playlist_top_tracks %>%
  slice_head(n = 10) %>%
  transmute(
    Track = track_name,
    Artist = artist_name,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Tracks included in more playlists")
```

```{r}
# Get most common artists in playlists
playlist_top_artist <- playlist_tracks %>%
  group_by(artist_name) %>%
  summarise(
    n_tracks = n(),
    n_songs = n_distinct(track_name),
    n_playlists = n_distinct(playlist_name)
  ) %>%
  arrange(desc(n_playlists))

# Display table
playlist_top_artist %>%
  slice_head(n = 10) %>%
  transmute(
    Artist = artist_name,
    `N tracks` = n_tracks,
    `N songs` = n_songs,
    `N playlists` = n_playlists
  ) %>%
  kable(caption = "Artists included in more playlists")
```

### User library

```{r}
# Get most common artists in library (saved songs)
library_tracks_top_artist <- library_tracks %>%
  group_by(artist_name) %>%
  summarise(n_songs = n_distinct(track_name)) %>%
  arrange(desc(n_songs))

# Display table
library_tracks_top_artist %>%
  slice_head(n = 10) %>%
  transmute(
    Artist = artist_name,
    `N songs` = n_songs
  ) %>%
  kable(caption = "Artists with more songs in library")

# Get most common artists in library (saved albums)
library_albums_top_artist <- library_albums %>%
  group_by(artist_name) %>%
  summarise(n_albums = n_distinct(album_name)) %>%
  arrange(desc(n_albums))

# Display table
library_albums_top_artist %>%
  slice_head(n = 10) %>%
  transmute(
    Artist = artist_name,
    `N albums` = n_albums
  ) %>%
  kable(caption = "Artists with more albums in library")
```

### Saved in playlist or library

```{r}
# Combine tracks from playlists and library
saved_tracks <- unique(
  c(playlist_top_tracks$track_name, library_tracks$track_name)
)

# Combine artists from playlists and library
saved_artist <- unique(c(
  playlist_top_artist$artist_name,
  library_tracks_top_artist$artist_name,
  library_albums_top_artist$artist_name
))

# Plot proportion of streamed tracks found in playlists/library, by year
plot_saved_track <- stream_recent %>%
  mutate(
    `In my playlists` = ifelse(track_name %in% saved_tracks, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n(), .groups = "drop") %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = pct_labels) +
  theme_minimal() +
  ggtitle("Were the tracks (streaming history) in the current playlists?") +
  ylab("")

# Display plot
plot_saved_track

# Save plot
save_plot(plot_saved_track, "plot_saved_track")
```

```{r}
# Plot proportion of streamed artists found in playlists/library, by year
plot_saved_artist <- stream_recent %>%
  mutate(
    `In my playlists` = ifelse(artist_name %in% saved_artist, "Yes", "No")
  ) %>%
  group_by(`In my playlists`, year = as.factor(year(date))) %>%
  summarise(n = n(), .groups = "drop") %>%
  ggplot(aes(x = year, y = n, fill = `In my playlists`)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = pal, na.value = "grey80") +
  scale_y_continuous(labels = pct_labels) +
  theme_minimal() +
  ggtitle("Were the artists (streaming history) in the current playlists?") +
  ylab("")

# Display plot
plot_saved_artist

# Save plot
save_plot(plot_saved_artist, "plot_saved_artist")
```