import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import re

# Page configuration
st.set_page_config(
    page_title="Analyse des tendances YouTube 2025",
    page_icon="📊",
    layout="wide"
)
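
# Expected input files (inferred from load_data() below; the separators and
# column names are assumptions based on how this script reads them, not a
# documented schema):
#   - youtube_channels_2025.csv: comma-separated video-level data with columns
#     such as Title, Channel_name, Published_date, Views, Likes, Comments,
#     Channel_subscribers, duration_seconds, like_rate, views_per_subscriber
#     and category_id.
#   - category.csv: semicolon-separated, with at least the columns 'ID' and
#     'Category name' used for the merge.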

# Data-loading function (cached so the CSVs are only re-read when they change;
# st.cache_data requires a recent Streamlit version)
@st.cache_data
def load_data():
    # Load the raw data
    df = pd.read_csv('youtube_channels_2025.csv')
    categories = pd.read_csv('category.csv', sep=';')

    # Convert numeric columns
    numeric_cols = ['Views', 'Likes', 'Comments', 'Channel_subscribers', 'Channel_views',
                    'Channel_total_videos', 'duration_seconds', 'like_rate', 'comment_rate',
                    'views_per_subscriber', 'vues_par_jour']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Parse the publication date
    df['Published_date'] = pd.to_datetime(df['Published_date'], errors='coerce')

    # Extract day of week and hour of publication
    df['day_of_week'] = df['Published_date'].dt.day_name()
    df['hour_of_publication'] = df['Published_date'].dt.hour

    # Convert duration to minutes
    df['duration_minutes'] = df['duration_seconds'] / 60

    # Attach category names
    df = df.merge(categories, left_on='category_id', right_on='ID', how='left')

    # Drop rows missing key fields
    df = df.dropna(subset=['Views', 'Likes', 'Category name'])

    return df
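
# A quick way to sanity-check the loading step outside of Streamlit (a minimal
# sketch, assuming both CSV files sit in the working directory):
#   df_check = load_data()
#   print(df_check[['Title', 'Channel_name', 'Views', 'Category name']].head())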

# Helper to build a histogram
def create_histogram(df, column, title, color='#1f77b4', nbins=20):
    fig = px.histogram(df, x=column, nbins=nbins, title=title,
                       color_discrete_sequence=[color])
    fig.update_layout(bargap=0.1)
    return fig

# Helper to build a bar chart
def create_bar_chart(df, x_col, y_col, title, color='#1f77b4'):
    fig = px.bar(df, x=x_col, y=y_col, title=title,
                 color_discrete_sequence=[color])
    fig.update_xaxes(tickangle=45)  # update_xaxes (plural) is the Plotly figure method
    return fig

# Helper to build a scatter plot
def create_scatter_plot(df, x_col, y_col, size_col, hover_data, title, color_col=None):
    fig = px.scatter(df, x=x_col, y=y_col, size=size_col,
                     hover_data=hover_data, title=title,
                     color=color_col)
    return fig

# Helper to build a time series of daily averages
def create_time_series(df, date_col, value_col, title):
    daily_stats = df.groupby(df[date_col].dt.date)[value_col].mean().reset_index()
    fig = px.line(daily_stats, x=date_col, y=value_col, title=title)
    return fig
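
# Example of how the helpers above are used further down (a sketch mirroring
# the calls in the "Visualisations" section below):
#   fig = create_histogram(filtered_df, 'Views', 'Distribution des vues', nbins=30)
#   st.plotly_chart(fig, use_container_width=True)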

# Load the data
try:
    df = load_data()
except FileNotFoundError as e:
    st.error(f"Erreur lors du chargement des fichiers : {e}")
    st.error("Assurez-vous que les fichiers 'youtube_channels_2025.csv' et 'category.csv' sont présents.")
    st.stop()

# Sidebar filters
st.sidebar.title("🎛️ Filtres")

# Category filter
if 'Category name' in df.columns:
    categories = sorted(df['Category name'].dropna().unique())
    selected_categories = st.sidebar.multiselect(
        "📂 Sélectionner les catégories",
        categories,
        default=categories[:5] if len(categories) >= 5 else categories
    )
else:
    selected_categories = []

# Publication-date filter
if 'Published_date' in df.columns:
    min_date = df['Published_date'].min().date()
    max_date = df['Published_date'].max().date()
    date_range = st.sidebar.date_input(
        "📅 Période de publication",
        [min_date, max_date],
        min_value=min_date,
        max_value=max_date
    )
else:
    date_range = []

# YouTube channel filter
if 'Channel_name' in df.columns:
    channels = sorted(df['Channel_name'].dropna().unique())
    selected_channels = st.sidebar.multiselect(
        "📺 Sélectionner les chaînes",
        channels,
        default=[]
    )
else:
    selected_channels = []

# Day-of-week filter
if 'day_of_week' in df.columns:
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    selected_days = st.sidebar.multiselect(
        "📆 Jours de la semaine",
        days,
        default=days
    )
else:
    selected_days = []

# Apply the filters
filtered_df = df.copy()

if selected_categories and 'Category name' in df.columns:
    filtered_df = filtered_df[filtered_df['Category name'].isin(selected_categories)]

if len(date_range) == 2 and 'Published_date' in df.columns:
    start_date, end_date = date_range
    filtered_df = filtered_df[
        (filtered_df['Published_date'].dt.date >= start_date) &
        (filtered_df['Published_date'].dt.date <= end_date)
    ]

if selected_channels and 'Channel_name' in df.columns:
    filtered_df = filtered_df[filtered_df['Channel_name'].isin(selected_channels)]

if selected_days and 'day_of_week' in df.columns:
    filtered_df = filtered_df[filtered_df['day_of_week'].isin(selected_days)]
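
# Note: if the selected filters exclude every row, filtered_df is empty; the
# mean-based metrics below then display NaN and the charts render empty.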

# Main title
st.title("📊 Analyse des tendances YouTube 2025")
st.markdown("*Explorez les données des chaînes YouTube qui ont été en tendance*")

# Key metrics
st.header("📈 Indicateurs clés")

col1, col2, col3, col4, col5 = st.columns(5)

with col1:
    st.metric("Nombre de vidéos", f"{len(filtered_df):,}")

with col2:
    if 'Views' in filtered_df.columns:
        avg_views = filtered_df['Views'].mean()
        st.metric("Vues moyennes", f"{avg_views:,.0f}")

with col3:
    if 'like_rate' in filtered_df.columns:
        avg_like_rate = filtered_df['like_rate'].mean() * 100
        st.metric("Taux de likes moyen", f"{avg_like_rate:.2f}%")

with col4:
    if 'duration_minutes' in filtered_df.columns:
        avg_duration = filtered_df['duration_minutes'].mean()
        st.metric("Durée moyenne", f"{avg_duration:.1f} min")

with col5:
    if 'views_per_subscriber' in filtered_df.columns:
        avg_vps = filtered_df['views_per_subscriber'].mean()
        st.metric("Vues/Abonnés", f"{avg_vps:.3f}")

# Main visualizations
st.header("🔍 Visualisations")

# First row of charts
col1, col2 = st.columns(2)

with col1:
    if 'Views' in filtered_df.columns:
        views_hist = create_histogram(filtered_df, 'Views', 'Distribution des vues', color='#FF0000', nbins=30)
        st.plotly_chart(views_hist, use_container_width=True)

with col2:
    if 'duration_minutes' in filtered_df.columns:
        duration_hist = create_histogram(filtered_df, 'duration_minutes', 'Distribution des durées (minutes)', color='#00BFD6', nbins=25)
        st.plotly_chart(duration_hist, use_container_width=True)

# Second row of charts
col1, col2 = st.columns(2)

with col1:
    if 'Category name' in filtered_df.columns and 'like_rate' in filtered_df.columns:
        # Only aggregate columns that actually exist (referencing a missing
        # column in .agg() would raise a KeyError regardless of the function)
        agg_dict = {'like_rate': 'mean'}
        if 'comment_rate' in filtered_df.columns:
            agg_dict['comment_rate'] = 'mean'
        category_engagement = filtered_df.groupby('Category name').agg(agg_dict).reset_index()
        category_engagement['like_rate'] = category_engagement['like_rate'] * 100
        likes_by_category = create_bar_chart(
            category_engagement,
            'Category name',
            'like_rate',
            'Taux de likes par catégorie (%)',
            color='#FF9900'
        )
        st.plotly_chart(likes_by_category, use_container_width=True)

with col2:
    if 'day_of_week' in filtered_df.columns and 'Views' in filtered_df.columns:
        daily_stats = filtered_df.groupby('day_of_week')['Views'].mean().reset_index()
        # Re-order the days of the week
        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        daily_stats['day_of_week'] = pd.Categorical(daily_stats['day_of_week'], categories=day_order, ordered=True)
        daily_stats = daily_stats.sort_values('day_of_week')
        daily_views = create_bar_chart(
            daily_stats,
            'day_of_week',
            'Views',
            'Vues moyennes par jour de la semaine',
            color='#00CC96'
        )
        st.plotly_chart(daily_views, use_container_width=True)

# Scatter plot
if all(col in filtered_df.columns for col in ['Views', 'like_rate', 'Channel_subscribers']):
    scatter_plot = create_scatter_plot(
        filtered_df,
        'Views',
        'like_rate',
        'Channel_subscribers',
        ['Title', 'Channel_name', 'Category name'],
        'Relation entre vues et taux de likes (taille = nb d\'abonnés)',
        'Category name'
    )
    st.plotly_chart(scatter_plot, use_container_width=True)

# Time-based analysis
if 'Published_date' in filtered_df.columns and 'Views' in filtered_df.columns:
    st.header("📅 Analyse temporelle")
    col1, col2 = st.columns(2)

    with col1:
        time_series = create_time_series(filtered_df, 'Published_date', 'Views', 'Évolution des vues moyennes dans le temps')
        st.plotly_chart(time_series, use_container_width=True)

    with col2:
        if 'hour_of_publication' in filtered_df.columns:
            hourly_stats = filtered_df.groupby('hour_of_publication')['Views'].mean().reset_index()
            hourly_chart = create_bar_chart(
                hourly_stats,
                'hour_of_publication',
                'Views',
                'Vues moyennes par heure de publication',
                color='#9467bd'
            )
            st.plotly_chart(hourly_chart, use_container_width=True)

# Top videos
st.header("🏆 Top 10 des vidéos les plus vues")

if 'Views' in filtered_df.columns:
    top_videos = filtered_df.nlargest(10, 'Views')[
        ['Title', 'Channel_name', 'Category name', 'Views', 'Likes', 'Comments', 'duration_minutes']
    ].copy()
    # Rename the columns for display
    top_videos = top_videos.rename(columns={
        'Title': 'Titre',
        'Channel_name': 'Chaîne',
        'Category name': 'Catégorie',
        'Views': 'Vues',
        'Likes': 'Likes',
        'Comments': 'Commentaires',
        'duration_minutes': 'Durée (min)'
    })
    st.dataframe(top_videos, use_container_width=True)

# Per-channel statistics
st.header("📺 Top chaînes")

# Every column used in the aggregation below must be present
if all(col in filtered_df.columns for col in ['Channel_name', 'Views', 'Channel_subscribers',
                                              'like_rate', 'views_per_subscriber']):
    channel_stats = filtered_df.groupby('Channel_name').agg({
        'Views': ['count', 'mean', 'sum'],
        'Channel_subscribers': 'first',
        'like_rate': 'mean',
        'views_per_subscriber': 'mean'
    }).round(2)
    # Flatten the multi-index columns
    channel_stats.columns = ['Nb_vidéos', 'Vues_moyennes', 'Vues_totales', 'Abonnés', 'Taux_likes_moyen', 'Vues_par_abonné']
    channel_stats = channel_stats.reset_index()
    channel_stats = channel_stats.sort_values('Vues_totales', ascending=False).head(10)
    st.dataframe(channel_stats, use_container_width=True)

# Download the filtered data
st.header("📥 Télécharger les données filtrées")

csv = filtered_df.to_csv(index=False).encode('utf-8')
st.download_button(
    label="Télécharger en CSV",
    data=csv,
    file_name="youtube_trends_filtered_2025.csv",
    mime="text/csv",
)

# About the application (sidebar)
st.sidebar.markdown("---")
st.sidebar.info("""
**À propos de cette application**

Cette application analyse les tendances YouTube de 2025 basées sur les chaînes qui ont été en tendance.

**Fonctionnalités :**
- Filtrage par catégorie, date, chaîne et jour
- Indicateurs clés de performance
- Visualisations interactives
- Analyse temporelle
- Export des données

Créé avec Streamlit et Plotly.
""")

# Global dataset statistics at the bottom of the page
with st.expander("📊 Statistiques globales du dataset"):
    st.write(f"**Nombre total de vidéos :** {len(df):,}")
    st.write(f"**Nombre de chaînes uniques :** {df['Channel_name'].nunique() if 'Channel_name' in df.columns else 'N/A'}")
    st.write(f"**Nombre de catégories :** {df['Category name'].nunique() if 'Category name' in df.columns else 'N/A'}")
    st.write(f"**Période couverte :** {df['Published_date'].min().strftime('%d/%m/%Y') if 'Published_date' in df.columns else 'N/A'} - {df['Published_date'].max().strftime('%d/%m/%Y') if 'Published_date' in df.columns else 'N/A'}")