Spaces:
Sleeping
Sleeping
Choi jun hyeok
commited on
Commit
ยท
be91dcc
1
Parent(s):
ec5ae24
update prompt
Browse files- analysis.py +346 -0
- analysis2.py +233 -0
- analysis3.py +260 -0
- analysis4.py +197 -0
- app.py +2 -2
- train_and_save_models.py +313 -190
analysis.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
์ ๋ฌธ๊ณผ๋ฐฉ์ก ๋
์ ๋ฐ์ดํฐ ์ฌ์ธต ํ์์ ๋ฐ์ดํฐ ๋ถ์ (Advanced EDA)
|
| 4 |
+
|
| 5 |
+
์ด ์คํฌ๋ฆฝํธ๋ ๋ค์ 4๊ฐ์ ๋ฐ์ดํฐ์
์ ํ์ฉํ์ฌ ์ ๋ฌธ๊ณผ๋ฐฉ์ก ๋
์ ๋ฐ์ดํฐ๋ฅผ ์ฌ์ธต ๋ถ์ํฉ๋๋ค.
|
| 6 |
+
1. article_metrics_monthly.csv: ๊ธฐ์ฌ๋ณ ์๊ฐ ์งํ (์กฐํ์, ์ข์์, ๋๊ธ)
|
| 7 |
+
2. contents.csv: ๊ธฐ์ฌ ์ฝํ
์ธ ์ ๋ณด (์นดํ
๊ณ ๋ฆฌ, ์ ๋ชฉ, ํ๊ทธ ๋ฑ)
|
| 8 |
+
3. demographics_merged.csv: ๊ธฐ์ฌ๋ณ ์ธ๊ตฌํต๊ณํ์ ๋
์ ๋ฐ์ดํฐ
|
| 9 |
+
4. referrer.csv: ๊ธฐ์ฌ๋ณ ์ ์
๊ฒฝ๋ก ๋ฐ์ดํฐ
|
| 10 |
+
|
| 11 |
+
์ฃผ์ ๋ถ์ ๋ด์ฉ:
|
| 12 |
+
- ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ๋ฐ ํผ์ฒ ์์ง๋์ด๋ง
|
| 13 |
+
- ๊ธฐ์ฌ ํต์ฌ ์งํ(์กฐํ์, ์ข์์, ๋๊ธ) ๋ถํฌ ๋ฐ ์๊ด๊ด๊ณ ๋ถ์
|
| 14 |
+
- ์ฝํ
์ธ ์นดํ
๊ณ ๋ฆฌ๋ณ ์ฑ๊ณผ ๋ฐ ๋
์ ์ฐธ์ฌ๋ ์ฌ์ธต ๋ถ์
|
| 15 |
+
- ํ๊ทธ ๋ถ์ (Word Cloud ํฌํจ)
|
| 16 |
+
- ์ธ๊ตฌํต๊ณ(์ฐ๋ น/์ฑ๋ณ) ๊ทธ๋ฃน๋ณ ์ ํธ ์นดํ
๊ณ ๋ฆฌ ๋ถ์ (ํํธ๋งต)
|
| 17 |
+
- ์ ์
๊ฒฝ๋ก๋ณ ์ฑ๊ณผ ๋ฐ ํจ์จ์ฑ ๋ถ์
|
| 18 |
+
- ์ข
ํฉ ์ธ์ฌ์ดํธ ๋์ถ ๋ฐ ๋ฆฌํฌํธ ์๋ ์์ฑ
|
| 19 |
+
|
| 20 |
+
์คํ ๋ฐฉ๋ฒ:
|
| 21 |
+
- ์คํฌ๋ฆฝํธ๋ฅผ ์คํํ๊ธฐ ์ , DATA_DIR ๊ฒฝ๋ก๋ฅผ ์ค์ ๋ฐ์ดํฐ๊ฐ ์๋ ํด๋๋ก ์์ ํ์ธ์.
|
| 22 |
+
- ์คํ ์ ์คํฌ๋ฆฝํธ์ ๋์ผํ ์์น์ 'output' ํด๋๊ฐ ์์ฑ๋๋ฉฐ, ๋ชจ๋ ์๊ฐํ ์๋ฃ์ ์ต์ข
์ธ์ฌ์ดํธ ๋ณด๊ณ ์๊ฐ ์ ์ฅ๋ฉ๋๋ค.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
# 1. ๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ํฌํธ
|
| 26 |
+
import pandas as pd
|
| 27 |
+
import numpy as np
|
| 28 |
+
import matplotlib.pyplot as plt
|
| 29 |
+
import seaborn as sns
|
| 30 |
+
from datetime import datetime
|
| 31 |
+
import warnings
|
| 32 |
+
import os
|
| 33 |
+
from wordcloud import WordCloud
|
| 34 |
+
|
| 35 |
+
warnings.filterwarnings('ignore')
|
| 36 |
+
|
| 37 |
+
# 2. ๊ธฐ๋ณธ ์ค์ ๋ฐ ์ ์ญ ๋ณ์
|
| 38 |
+
def setup_environment():
    """Set up the analysis environment (paths and plotting style).

    Returns:
        tuple[str, str]: (DATA_DIR, OUTPUT_DIR) — input data folder and
        the folder where all figures/reports are written.
    """
    # === Path configuration (adjust to your environment) ===
    DATA_DIR = r'Broadcast_paper\data_csv'
    OUTPUT_DIR = r'./output_analysis'

    # Create the output folder. exist_ok=True closes the TOCTOU race between
    # the exists() check and makedirs() (the original raised if another
    # process created the folder in between).
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"'{OUTPUT_DIR}' ํด๋๋ฅผ ์์ฑํ์ต๋๋ค.")

    # === Plot styling: Korean font + render minus signs correctly ===
    # NOTE(review): 'Malgun Gothic' is a Windows font — confirm availability
    # when running on Linux/macOS, otherwise labels render as boxes.
    plt.rc('font', family='Malgun Gothic')
    plt.rcParams['axes.unicode_minus'] = False
    sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')

    print("๋ถ์ ํ๊ฒฝ ์ค์  ์๋ฃ!")
    return DATA_DIR, OUTPUT_DIR
|
| 56 |
+
|
| 57 |
+
# 3. ๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ
|
| 58 |
+
def load_and_preprocess_data(data_dir):
    """Load the four CSV data sets, clean them, and build the merged table.

    Args:
        data_dir: Folder containing article_metrics_monthly.csv, contents.csv,
            demographics_merged.csv and referrer.csv.

    Returns:
        dict: {'metrics', 'contents', 'demo', 'referrer', 'merged'} — the
        cleaned frames plus a per-article frame with total views/likes/comments
        and an 'engagement_rate' column (percent; NaN when views are 0).
    """
    print("\n[๋จ๊ณ 1] ๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ ์์...")

    # Load the raw CSVs
    df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
    df_contents = pd.read_csv(f'{data_dir}/contents.csv')
    df_demo = pd.read_csv(f'{data_dir}/demographics_merged.csv')
    df_referrer = pd.read_csv(f'{data_dir}/referrer.csv')

    # --- Preprocessing ---
    # 1. df_metrics
    # BUG FIX: `df['comments'].fillna(0, inplace=True)` is chained assignment
    # (operates on a column view) — deprecated in pandas 2.x and silently
    # ineffective under copy-on-write. Assign the result back instead.
    df_metrics['period'] = pd.to_datetime(df_metrics['period'])
    df_metrics['comments'] = df_metrics['comments'].fillna(0)  # missing comment counts -> 0

    # 2. df_contents: drop rows missing key fields, derive date features
    df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)
    df_contents['date'] = pd.to_datetime(df_contents['date'])
    df_contents['publish_month'] = df_contents['date'].dt.to_period('M')
    df_contents['publish_dayofweek'] = df_contents['date'].dt.day_name()
    df_contents['content_length'] = df_contents['content'].str.len()

    # 3. df_demo: drop the aggregate ('์ ์ฒด' = "all ages") rows so that
    # group-level sums are not double counted
    df_demo_filtered = df_demo[df_demo['age_group'] != '์ ์ฒด'].copy()

    # 4. Integration: aggregate monthly metrics to per-article totals
    article_total_metrics = df_metrics.groupby('article_id').agg({
        'views_total': 'sum',
        'likes': 'sum',
        'comments': 'sum'
    }).reset_index()

    # Merge content info with per-article totals; articles without metrics get 0
    df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
    df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)

    # Engagement rate (%) = (likes + comments) / views * 100.
    # Views of 0 are mapped to NaN first to avoid division-by-zero artifacts.
    df_merged['engagement_rate'] = (
        (df_merged['likes'] + df_merged['comments']) / df_merged['views_total'].replace(0, np.nan)
    ) * 100

    print("๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ ์๋ฃ!")

    return {
        "metrics": df_metrics,
        "contents": df_contents,
        "demo": df_demo_filtered,
        "referrer": df_referrer,
        "merged": df_merged
    }
|
| 110 |
+
|
| 111 |
+
# 4. ์์ธ ๋ถ์ ๋ฐ ์๊ฐํ ํจ์๋ค
|
| 112 |
+
def analyze_metrics_overview(df_merged, output_dir):
    """Visualize the overall article-metric landscape.

    Left panel: histogram (+KDE) of total views, clipped at the 95th
    percentile so the long tail does not flatten the plot.
    Right panel: correlation heatmap of the core metrics.
    Saves metrics_overview.png into output_dir.
    """
    print("\n[๋จ๊ณ 2] ๊ธฐ์ฌ ์งํ ์ ๋ฐ ๋ถ์...")

    fig, (ax_dist, ax_corr) = plt.subplots(1, 2, figsize=(18, 7))

    # Distribution of total views per article
    sns.histplot(data=df_merged, x='views_total', bins=50, ax=ax_dist, kde=True)
    ax_dist.set_title('๊ธฐ์ฌ๋ณ ์ด ์กฐํ์ ๋ถํฌ', fontsize=16)
    ax_dist.set_xlabel('์ด ์กฐํ์')
    ax_dist.set_ylabel('๊ธฐ์ฌ ์')
    # Exclude the top 5% outliers from the visible range
    ax_dist.set_xlim(0, df_merged['views_total'].quantile(0.95))

    # Pairwise correlations between the main engagement metrics
    metric_cols = ['views_total', 'likes', 'comments', 'content_length']
    sns.heatmap(df_merged[metric_cols].corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=ax_corr)
    ax_corr.set_title('์ฃผ์ ์งํ ๊ฐ ์๊ด๊ด๊ณ', fontsize=16)

    plt.tight_layout()
    plt.savefig(f'{output_dir}/metrics_overview.png')
    plt.close()
    print(" - ๊ธฐ์ฌ ์งํ ๋ถํฌ ๋ฐ ์๊ด๊ด๊ณ ๋ถ์ ์๋ฃ. (metrics_overview.png ์ ์ฅ)")
|
| 134 |
+
|
| 135 |
+
def analyze_content_features(df_merged, output_dir):
|
| 136 |
+
"""์ฝํ
์ธ ํน์ง(์นดํ
๊ณ ๋ฆฌ, ํ๊ทธ, ๊ธ์ ์, ๋ฐํ ์์ผ)์ ๋ฐ๋ฅธ ์ฑ๊ณผ ๋ถ์"""
|
| 137 |
+
print("\n[๋จ๊ณ 3] ์ฝํ
์ธ ํน์ง๋ณ ์ฑ๊ณผ ๋ถ์...")
|
| 138 |
+
|
| 139 |
+
# ์นดํ
๊ณ ๋ฆฌ๋ณ ํ๊ท ์งํ
|
| 140 |
+
category_performance = df_merged.groupby('category').agg({
|
| 141 |
+
'views_total': 'mean',
|
| 142 |
+
'likes': 'mean',
|
| 143 |
+
'comments': 'mean',
|
| 144 |
+
'engagement_rate': 'mean'
|
| 145 |
+
}).sort_values('views_total', ascending=False)
|
| 146 |
+
|
| 147 |
+
fig, ax = plt.subplots(figsize=(14, 10))
|
| 148 |
+
category_performance['views_total'].sort_values().plot(kind='barh', ax=ax, color='skyblue')
|
| 149 |
+
ax.set_title('์นดํ
๊ณ ๋ฆฌ๋ณ ํ๊ท ์กฐํ์', fontsize=16)
|
| 150 |
+
ax.set_xlabel('ํ๊ท ์กฐํ์')
|
| 151 |
+
ax.set_ylabel('์นดํ
๊ณ ๋ฆฌ')
|
| 152 |
+
plt.tight_layout()
|
| 153 |
+
plt.savefig(f'{output_dir}/category_avg_views.png')
|
| 154 |
+
plt.close()
|
| 155 |
+
print(" - ์นดํ
๊ณ ๋ฆฌ๋ณ ํ๊ท ์กฐํ์ ๋ถ์ ์๋ฃ. (category_avg_views.png ์ ์ฅ)")
|
| 156 |
+
|
| 157 |
+
# ํ๊ทธ ๋ถ์ ๋ฐ Word Cloud
|
| 158 |
+
tags = df_merged['tag'].dropna().str.split(',').explode().str.strip()
|
| 159 |
+
top_tags = tags.value_counts().head(50)
|
| 160 |
+
|
| 161 |
+
wordcloud = WordCloud(
|
| 162 |
+
font_path='malgun',
|
| 163 |
+
width=1000,
|
| 164 |
+
height=600,
|
| 165 |
+
background_color='white',
|
| 166 |
+
colormap='viridis'
|
| 167 |
+
).generate_from_frequencies(top_tags)
|
| 168 |
+
|
| 169 |
+
plt.figure(figsize=(15, 9))
|
| 170 |
+
plt.imshow(wordcloud, interpolation='bilinear')
|
| 171 |
+
plt.axis('off')
|
| 172 |
+
plt.title('์์ 50๊ฐ ํ๊ทธ Word Cloud', fontsize=20)
|
| 173 |
+
plt.tight_layout()
|
| 174 |
+
plt.savefig(f'{output_dir}/tags_wordcloud.png')
|
| 175 |
+
plt.close()
|
| 176 |
+
print(" - ํ๊ทธ Word Cloud ์์ฑ ์๋ฃ. (tags_wordcloud.png ์ ์ฅ)")
|
| 177 |
+
|
| 178 |
+
# ๋ฐํ ์์ผ๋ณ ๊ธฐ์ฌ ์ ๋ฐ ํ๊ท ์กฐํ์
|
| 179 |
+
fig, axes = plt.subplots(1, 2, figsize=(18, 7))
|
| 180 |
+
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
|
| 181 |
+
|
| 182 |
+
sns.countplot(data=df_merged, y='publish_dayofweek', order=day_order, ax=axes[0], palette='pastel')
|
| 183 |
+
axes[0].set_title('์์ผ๋ณ ๋ฐํ ๊ธฐ์ฌ ์', fontsize=16)
|
| 184 |
+
axes[0].set_xlabel('๊ธฐ์ฌ ์')
|
| 185 |
+
axes[0].set_ylabel('์์ผ')
|
| 186 |
+
|
| 187 |
+
sns.barplot(data=df_merged, y='publish_dayofweek', x='views_total', order=day_order, ax=axes[1], palette='pastel', ci=None)
|
| 188 |
+
axes[1].set_title('์์ผ๋ณ ํ๊ท ์กฐํ์', fontsize=16)
|
| 189 |
+
axes[1].set_xlabel('ํ๊ท ์กฐํ์')
|
| 190 |
+
axes[1].set_ylabel('')
|
| 191 |
+
|
| 192 |
+
plt.tight_layout()
|
| 193 |
+
plt.savefig(f'{output_dir}/dayofweek_performance.png')
|
| 194 |
+
plt.close()
|
| 195 |
+
print(" - ๋ฐํ ์์ผ๋ณ ์ฑ๊ณผ ๋ถ์ ์๋ฃ. (dayofweek_performance.png ์ ์ฅ)")
|
| 196 |
+
|
| 197 |
+
def analyze_demographics(df_demo, df_merged, output_dir):
|
| 198 |
+
"""์ธ๊ตฌํต๊ณํ์ ํน์ฑ(์ฐ๋ น/์ฑ๋ณ)์ ๋ฐ๋ฅธ ์ฝํ
์ธ ์๋น ํจํด ๋ถ์"""
|
| 199 |
+
print("\n[๋จ๊ณ 4] ์ธ๊ตฌํต๊ณ ๊ทธ๋ฃน๋ณ ์ ํธ๋ ๋ถ์...")
|
| 200 |
+
|
| 201 |
+
# ๊ธฐ์ฌ ID๋ฅผ ๊ธฐ์ค์ผ๋ก ์ธ๊ตฌํต๊ณ ๋ฐ์ดํฐ์ ์ฝํ
์ธ ๋ฐ์ดํฐ ๋ณํฉ
|
| 202 |
+
df_demo_content = pd.merge(df_demo, df_merged[['article_id', 'category']], on='article_id', how='left')
|
| 203 |
+
|
| 204 |
+
# ์ฐ๋ น๋ ๋ฐ ์ฑ๋ณ์ ๋ฐ๋ฅธ ์นดํ
๊ณ ๋ฆฌ๋ณ ์กฐํ์ ์ง๊ณ
|
| 205 |
+
demo_category_views = df_demo_content.groupby(['age_group', 'gender', 'category'])['views'].sum().reset_index()
|
| 206 |
+
|
| 207 |
+
# ํํธ๋งต ์์ฑ์ ์ํ ํผ๋ฒ ํ
์ด๋ธ
|
| 208 |
+
# ์ฌ์ฑ ๋
์
|
| 209 |
+
female_pivot = demo_category_views[demo_category_views['gender'] == '์ฌ'].pivot_table(
|
| 210 |
+
index='category', columns='age_group', values='views', aggfunc='sum'
|
| 211 |
+
).fillna(0)
|
| 212 |
+
|
| 213 |
+
# ๋จ์ฑ ๋
์
|
| 214 |
+
male_pivot = demo_category_views[demo_category_views['gender'] == '๋จ'].pivot_table(
|
| 215 |
+
index='category', columns='age_group', values='views', aggfunc='sum'
|
| 216 |
+
).fillna(0)
|
| 217 |
+
|
| 218 |
+
# ์๊ฐํ
|
| 219 |
+
fig, axes = plt.subplots(2, 1, figsize=(20, 24))
|
| 220 |
+
|
| 221 |
+
sns.heatmap(female_pivot, cmap='Reds', annot=True, fmt='.0f', linewidths=.5, ax=axes[0])
|
| 222 |
+
axes[0].set_title('์ฌ์ฑ ์ฐ๋ น๋๋ณ ์ ํธ ์นดํ
๊ณ ๋ฆฌ (์ด ์กฐํ์ ๊ธฐ์ค)', fontsize=18)
|
| 223 |
+
axes[0].set_xlabel('์ฐ๋ น๋')
|
| 224 |
+
axes[0].set_ylabel('์นดํ
๊ณ ๋ฆฌ')
|
| 225 |
+
|
| 226 |
+
sns.heatmap(male_pivot, cmap='Blues', annot=True, fmt='.0f', linewidths=.5, ax=axes[1])
|
| 227 |
+
axes[1].set_title('๋จ์ฑ ์ฐ๋ น๋๋ณ ์ ํธ ์นดํ
๊ณ ๋ฆฌ (์ด ์กฐํ์ ๊ธฐ์ค)', fontsize=18)
|
| 228 |
+
axes[1].set_xlabel('์ฐ๋ น๋')
|
| 229 |
+
axes[1].set_ylabel('์นดํ
๊ณ ๋ฆฌ')
|
| 230 |
+
|
| 231 |
+
plt.tight_layout()
|
| 232 |
+
plt.savefig(f'{output_dir}/demographic_category_preference_heatmap.png')
|
| 233 |
+
plt.close()
|
| 234 |
+
print(" - ์ธ๊ตฌํต๊ณ ๊ทธ๋ฃน๋ณ ์ ํธ ์นดํ
๊ณ ๋ฆฌ ํํธ๋งต ๋ถ์ ์๋ฃ. (demographic_category_preference_heatmap.png ์ ์ฅ)")
|
| 235 |
+
|
| 236 |
+
def analyze_referrer(df_referrer, df_merged, output_dir):
|
| 237 |
+
"""์ ์
๊ฒฝ๋ก๋ณ ๊ธฐ์ฌ๋ ๋ฐ ํจ์จ์ฑ ๋ถ์"""
|
| 238 |
+
print("\n[๋จ๊ณ 5] ์ ์
๊ฒฝ๋ก๋ณ ํจ์จ์ฑ ๋ถ์...")
|
| 239 |
+
|
| 240 |
+
# ์ ์
๊ฒฝ๋ก ๋ฐ์ดํฐ์ ๊ธฐ์ฌ ์งํ ๋ณํฉ
|
| 241 |
+
df_referrer_merged = pd.merge(df_referrer, df_merged[['article_id', 'views_total', 'engagement_rate']], on='article_id', how='left')
|
| 242 |
+
|
| 243 |
+
# ์ฃผ์ ์ ์
๊ฒฝ๋ก(์์ 10๊ฐ) ์ถ์ถ
|
| 244 |
+
top_10_referrers = df_referrer_merged.groupby('referrer')['share'].sum().nlargest(10).index
|
| 245 |
+
df_top_referrers = df_referrer_merged[df_referrer_merged['referrer'].isin(top_10_referrers)]
|
| 246 |
+
|
| 247 |
+
# ์ ์
๊ฒฝ๋ก๋ณ ํ๊ท ์ฐธ์ฌ๋ ๊ณ์ฐ
|
| 248 |
+
referrer_engagement = df_top_referrers.groupby('referrer')['engagement_rate'].mean().sort_values(ascending=False)
|
| 249 |
+
|
| 250 |
+
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
|
| 251 |
+
|
| 252 |
+
# ์ ์
๊ฒฝ๋ก๋ณ ์ด ๊ธฐ์ฌ๋
|
| 253 |
+
df_top_referrers.groupby('referrer')['share'].sum().sort_values().plot(kind='barh', ax=axes[0], color='c')
|
| 254 |
+
axes[0].set_title('์์ 10๊ฐ ์ ์
๊ฒฝ๋ก๋ณ ์ด ๊ธฐ์ฌ๋(Share)', fontsize=16)
|
| 255 |
+
axes[0].set_xlabel('์ด Share')
|
| 256 |
+
axes[0].set_ylabel('์ ์
๊ฒฝ๋ก')
|
| 257 |
+
|
| 258 |
+
# ์ ์
๊ฒฝ๋ก๋ณ ํ๊ท ์ฐธ์ฌ๋
|
| 259 |
+
referrer_engagement.sort_values().plot(kind='barh', ax=axes[1], color='m')
|
| 260 |
+
axes[1].set_title('์์ 10๊ฐ ์ ์
๊ฒฝ๋ก๋ณ ํ๊ท ์ฐธ์ฌ๋(%)', fontsize=16)
|
| 261 |
+
axes[1].set_xlabel('ํ๊ท ์ฐธ์ฌ๋ (%)')
|
| 262 |
+
axes[1].set_ylabel('')
|
| 263 |
+
|
| 264 |
+
plt.tight_layout()
|
| 265 |
+
plt.savefig(f'{output_dir}/referrer_performance.png')
|
| 266 |
+
plt.close()
|
| 267 |
+
print(" - ์ฃผ์ ์ ์
๊ฒฝ๋ก๋ณ ๊ธฐ์ฌ๋ ๋ฐ ์ฐธ์ฌ๋ ๋ถ์ ์๋ฃ. (referrer_performance.png ์ ์ฅ)")
|
| 268 |
+
|
| 269 |
+
# 5. ์ข
ํฉ ์ธ์ฌ์ดํธ ์์ฑ
|
| 270 |
+
def generate_insights_report(data, output_dir):
    """Write the comprehensive insight report to a UTF-8 text file.

    Args:
        data: dict from load_and_preprocess_data(); only data['merged'] is
            read here (unique article count interpolated into the report).
        output_dir: folder receiving comprehensive_analysis_report.txt.

    NOTE(review): apart from the article count and timestamp, every finding
    below is hard-coded prose describing one specific past run of the
    analysis — it is NOT recomputed from the dataframes. Confirm the claims
    still hold before re-publishing the report on new data.
    """
    print("\n[๋จ๊ณ 6] ์ข
ํฉ ์ธ์ฌ์ดํธ ๋ณด๊ณ ์ ์์ฑ...")

    # Build the report body (Korean, markdown-style text)
    report = f"""
# ์ ๋ฌธ๊ณผ๋ฐฉ์ก ๋
์ ๋ฐ์ดํฐ ์ฌ์ธต ๋ถ์ ๋ณด๊ณ ์
์์ฑ์ผ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## 1. ๋ถ์ ๊ฐ์
- ๋ณธ ๋ณด๊ณ ์๋ ๊ธฐ์ฌ ์ฑ๊ณผ ์งํ, ์ฝํ
์ธ  ํน์ฑ, ๋
์ ์ธ๊ตฌํต๊ณ, ์ ์
 ๊ฒฝ๋ก ๋ฐ์ดํฐ๋ฅผ ์ข
ํฉํ์ฌ ๋
์ ํ๋ ํจํด์ ๋ถ์ํ๊ณ , ์ด๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ์ฝํ
์ธ  ์ ๋ต ๊ฐ์  ๋ฐฉ์์ ์ ์ํ๋ ๊ฒ์ ๋ชฉํ๋ก ํฉ๋๋ค.
- ์ด {data['merged']['article_id'].nunique():,}๊ฐ์ ๊ธฐ์ฌ์ ๊ด๋ จ ๋ฐ์ดํฐ๋ฅผ ๋ถ์ํ์ต๋๋ค.

## 2. ์ฃผ์ ๋ถ์ ๊ฒฐ๊ณผ (Key Findings)

### 2.1. ์ฝํ
์ธ  ์ฑ๊ณผ
- **์ฑ๊ณผ ๋ถํฌ**: ๋๋ถ๋ถ์ ๊ธฐ์ฌ๋ ์์์ ์กฐํ์๋ฅผ ๊ธฐ๋กํ๋ฉฐ, ์์์ 'ํํธ ๊ธฐ์ฌ'๊ฐ ์ ์ฒด ์กฐํ์๋ฅผ ๊ฒฌ์ธํ๋ ๋กฑํ
์ผ(Long-tail) ๋ถํฌ๋ฅผ ๋ณด์
๋๋ค. (metrics_overview.png ์ฐธ๊ณ )
- **ํต์ฌ ์นดํ
๊ณ ๋ฆฌ**: '๋ฏธ๋์ด ไบบ์ฌ์ด๋', '์์ด๋์ด์ค', '๋ฏธ๋์ดยทAIํธ๋ ๋' ์นดํ
๊ณ ๋ฆฌ๊ฐ ํ๊ท  ์กฐํ์ ์ต์์๊ถ์ ์ฐจ์งํ์ต๋๋ค. ์ด๋ค ์นดํ
๊ณ ๋ฆฌ๊ฐ ๋
์์ ๋์ ๊ด์ฌ์ ์ ๋ํ๋ ํต์ฌ ์ฝํ
์ธ ์์ ์์ฌํฉ๋๋ค. (category_avg_views.png ์ฐธ๊ณ )
- **์ฃผ์ ํ๊ทธ**: '#์ธ๋ก ', '#๊ธฐ์', '#๋ด์ค', '#๋ฏธ๋์ด', '#์ ๋๋ฆฌ์ฆ' ๋ฑ ์ธ๋ก  ๋ณธ์ง๊ณผ ๊ด๋ จ๋ ํค์๋๊ฐ ๊ฐ์ฅ ๋น๋ฒํ๊ฒ ์ฌ์ฉ๋์์ต๋๋ค. '#์ธ๊ณต์ง๋ฅ', '#AI', '#ํ
ํฌ' ๋ฑ ๊ธฐ์  ๊ด๋ จ ํ๊ทธ๋ ์์๊ถ์ ์์นํ์ฌ ๊ธฐ์  ํธ๋ ๋์ ๋ํ ๋์ ๊ด์ฌ์ ๋ณด์ฌ์ค๋๋ค. (tags_wordcloud.png ์ฐธ๊ณ )

### 2.2. ๋
์ ํน์ฑ
- **์ฃผ์ ๋
์์ธต**: 10๋ ํ๋ฐ์์ 30๋ ์ด๋ฐ์ ์ ์ ์ธต์ด ์ฝํ
์ธ  ์๋น์ ํต์ฌ ๊ทธ๋ฃน์
๋๋ค. ํนํ 19-24์ธ ์ฌ์ฑ ๊ทธ๋ฃน์ ํ๋์ด ๋๋๋ฌ์ง๋๋ค.
- **์ฑ๋ณ/์ฐ๋ น๋ณ ์ ํธ๋**:
  - **์ฌ์ฑ**: 10๋-20๋ ์ด๋ฐ์ '์ปค๋ฒ์คํ ๋ฆฌ', '๋ฏธ๋์ดํฌ๋ผ'์, 20๋ ํ๋ฐ-30๋๋ '์ทจ์ฌ๊ธฐยท์ ์๊ธฐ', '๋ฏธ๋์ด ไบบ์ฌ์ด๋' ๋ฑ ์ฌ์ธต์ ์ธ ์ฝํ
์ธ ์ ๋์ ๋ฐ์์ ๋ณด์
๋๋ค.
  - **๋จ์ฑ**: 20๋-30๋ ๊ทธ๋ฃน์ด ์ ๋ฐ์ ์ธ ์๋น๋ฅผ ์ฃผ๋ํ๋ฉฐ, ํนํ '์ปค๋ฒ์คํ ๋ฆฌ', '์ง์ค์ ๊ฒ'๊ณผ ๊ฐ์ ์์ฌ/๊ธฐํ ๊ธฐ์ฌ์ ๋ํ ๊ด์ฌ์ด ๋์ต๋๋ค.
  - (demographic_category_preference_heatmap.png ์ฐธ๊ณ )

### 2.3. ์ ์
 ๊ฒฝ๋ก ํจ์จ์ฑ
- **์ฃผ์ ์ ์
 ์ฑ๋**: 'Google'๊ณผ '๋ค์ด๋ฒ' ๊ด๋ จ ์ฑ๋(ํตํฉ๊ฒ์, ๋ธ๋ก๊ทธ ๋ฑ)์ด ์ ์ฒด ํธ๋ํฝ์ ์๋์ ์ธ ๋น์ค์ ์ฐจ์งํฉ๋๋ค. ๊ฒ์ ์์ง ์ต์ ํ(SEO)์ ์ค์์ฑ์ด ๋งค์ฐ ํฝ๋๋ค.
- **๊ณ ํ์ง ํธ๋ํฝ**: '๋ค์ด๋ฒ ๋ธ๋ก๊ทธ๊ฒ์'์ ๋์ ํธ๋ํฝ ๊ธฐ์ฌ๋์ ํจ๊ป ์ํธํ ๋
์ ์ฐธ์ฌ๋๋ฅผ ๋ณด์ฌ์ฃผ๋ ํจ์จ์ ์ธ ์ฑ๋์
๋๋ค. ๋ฐ๋ฉด, 'Google'์ ๊ฐ์ฅ ๋ง์ ํธ๋ํฝ์ ์ ์
์ํค์ง๋ง, ํ๊ท  ์ฐธ์ฌ๋๋ ์๋์ ์ผ๋ก ๋ฎ์ ๋์ ๋ฒ์์ ์ผ๋ฐ ๋
์ ์ ์
์ด ๋ง์ ๊ฒ์ผ๋ก ์ถ์ ๋ฉ๋๋ค. (referrer_performance.png ์ฐธ๊ณ )

## 3. ์ ๋ต์  ์ ์ธ (Strategic Recommendations)

1. **์ฝํ
์ธ  ๊ฐ์ธํ ๋ฐ ํ๊ฒํ
 ๊ฐํ**:
   - **ํต์ฌ ๋
์์ธต(19-34์ธ) ์ง์ค**: ์ด๋ค์ด ์ ํธํ๋ '๋ฏธ๋์ด ไบบ์ฌ์ด๋', '๋ฏธ๋์ดยทAIํธ๋ ๋'์ ๊ฐ์ ์ฌ์ธต ๋ถ์ ๋ฐ ํธ๋ ๋ ๊ด๋ จ ์ฝํ
์ธ ๋ฅผ ๊ฐํํ๊ณ , ๊ด๋ จ ์ ๊ท ๊ธฐํ์ ๋ฐ๊ตดํด์ผ ํฉ๋๋ค.
   - **์ ์ฌ ๋
์์ธต(40๋ ์ด์) ๊ณต๋ต**: 40๋ ์ด์ ๋จ๋
๊ฐ ๊ณตํต์ ์ผ๋ก ๊ด์ฌ์ ๋ณด์ด๋ '์ง์ค์ ๊ฒ', '๋ฏธ๋์ดํ์ฅ' ์นดํ
๊ณ ๋ฆฌ ์ฝํ
์ธ ๋ฅผ ํ์ฉํ์ฌ ์ด ์ฐ๋ น๋์ ํนํ๋ ์ฃผ์ (์: ๋ฏธ๋์ด ๋ฆฌํฐ๋ฌ์, ๊ฐ์ง๋ด์ค ํ๋ณ)๋ก ํ์ฅํ๋ ์ ๋ต์ ๊ณ ๋ คํ  ์ ์์ต๋๋ค.

2. **๊ฒ์์์ง ์ต์ ํ(SEO) ๊ณ ๋ํ**:
   - **์ฝํ
์ธ -ํ๊ทธ ์ฐ๊ณ**: Word Cloud ๋ถ์์์ ๋์ถ๋ '#AI', '#๋์งํธ', '#ํ๋ซํผ' ๋ฑ์ ์ธ๊ธฐ ๊ธฐ์  ํ๊ทธ์ '์ปค๋ฒ์คํ ๋ฆฌ', '์ง์ค์ ๊ฒ'๊ณผ ๊ฐ์ ์ธ๊ธฐ ์นดํ
๊ณ ๋ฆฌ๋ฅผ ์กฐํฉํ ์ฝํ
์ธ ๋ฅผ ๊ธฐํํ์ฌ ๊ฒ์ ๋
ธ์ถ ๊ฐ๋ฅ์ฑ์ ๊ทน๋ํํด์ผ ํฉ๋๋ค.
   - **๋ธ๋ก๊ทธ ์ฑ๋ ํ์ฉ**: '๋ค์ด๋ฒ ๋ธ๋ก๊ทธ'๊ฐ ์์ง์ ๋
์๋ฅผ ์ ์
์ํค๋ ํต์ฌ ์ฑ๋์์ด ํ์ธ๋์์ต๋๋ค. ์นด๋๋ด์ค๋ ๊ธฐ์ฌ ์์ฝ๋ณธ ๋ฑ ๋ธ๋ก๊ทธ ํ๋ซํผ์ ์ต์ ํ๋ 2์ฐจ ์ฝํ
์ธ ๋ฅผ ์ ์ํ์ฌ ๋ฐฐํฌํ๋ ์ ๋ต์ด ์ ํจํฉ๋๋ค.

3. **๋
์ ์ฐธ์ฌ๋ ์ฆ์ง ์ ๋ต**:
   - **์ฐธ์ฌ๋ ๋์ ์นดํ
๊ณ ๋ฆฌ ๋ฒค์น๋งํน**: '๊ธ๋ก๋ฒ ๋ฏธ๋์ด ํ์ฅ', '๋ฏธ๋์ด ๋ฆฌ๋ทฐ' ๋ฑ ์ฐธ์ฌ๋๊ฐ ๋์ ์นดํ
๊ณ ๋ฆฌ์ ํ์(์: ์ ๋ฌธ๊ฐ ์ธํฐ๋ทฐ, ํน์  ์ฌ๋ก ์ฌ์ธต ๋ถ์, ๋ช
ํํ ์ฃผ์ฅ ์ ์)์ ๋ค๋ฅธ ๊ธฐ์ฌ์ ์ ์ฉํด ๋ณผ ์ ์์ต๋๋ค.
   - **์ธํฐ๋ํฐ๋ธ ์์ ๋์
**: ๊ธฐ์ฌ ๋ง๋ฏธ์ ๊ด๋ จ ์ฃผ์ ์ ๋ํ ๋
์ ์๊ฒฌ์ ๋ฌป๋ ์ง๋ฌธ์ ์ถ๊ฐํ๊ฑฐ๋, ํฌํ ๊ธฐ๋ฅ์ ํ์ฉํ์ฌ ๋๊ธ ๋ฐ ์ํธ์์ฉ์ ์ ๋ํ๋ ๋ฐฉ์์ ๊ฒํ ํด์ผ ํฉ๋๋ค.
"""
    # Save the report as a UTF-8 text file under output_dir
    report_path = f'{output_dir}/comprehensive_analysis_report.txt'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f" - ์ข
ํฉ ์ธ์ฌ์ดํธ ๋ณด๊ณ ์ ์์ฑ ์๋ฃ. ({report_path} ์ ์ฅ)")
|
| 321 |
+
|
| 322 |
+
# 6. ๋ฉ์ธ ์คํ ํจ์
|
| 323 |
+
def main():
|
| 324 |
+
"""์คํฌ๋ฆฝํธ์ ๋ฉ์ธ ์คํ ๋ก์ง"""
|
| 325 |
+
print("===== ์ ๋ฌธ๊ณผ๋ฐฉ์ก ๋
์ ๋ฐ์ดํฐ ์ฌ์ธต ๋ถ์ ์คํฌ๋ฆฝํธ ์คํ =====")
|
| 326 |
+
|
| 327 |
+
# 1. ํ๊ฒฝ ์ค์
|
| 328 |
+
data_dir, output_dir = setup_environment()
|
| 329 |
+
|
| 330 |
+
# 2. ๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ
|
| 331 |
+
all_data = load_and_preprocess_data(data_dir)
|
| 332 |
+
|
| 333 |
+
# 3. ์์ธ ๋ถ์ ๋ฐ ์๊ฐํ ์คํ
|
| 334 |
+
analyze_metrics_overview(all_data['merged'], output_dir)
|
| 335 |
+
analyze_content_features(all_data['merged'], output_dir)
|
| 336 |
+
analyze_demographics(all_data['demo'], all_data['merged'], output_dir)
|
| 337 |
+
analyze_referrer(all_data['referrer'], all_data['merged'], output_dir)
|
| 338 |
+
|
| 339 |
+
# 4. ์ข
ํฉ ์ธ์ฌ์ดํธ ๋ณด๊ณ ์ ์์ฑ
|
| 340 |
+
generate_insights_report(all_data, output_dir)
|
| 341 |
+
|
| 342 |
+
print("\n===== ๋ชจ๋ ๋ถ์์ด ์ฑ๊ณต์ ์ผ๋ก ์๋ฃ๋์์ต๋๋ค. =====")
|
| 343 |
+
print(f"๊ฒฐ๊ณผ๋ฌผ์ '{output_dir}' ํด๋์์ ํ์ธํ์ค ์ ์์ต๋๋ค.")
|
| 344 |
+
|
| 345 |
+
if __name__ == '__main__':
|
| 346 |
+
main()
|
analysis2.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
์ ๋ฌธ๊ณผ๋ฐฉ์ก ๋
์ ๋ฐ์ดํฐ ์ฌ์ธต EDA (AI ๋ชจ๋ธ ํ๋น์ฑ ๊ฒ์ฆ ๊ด์ ์ถ๊ฐ)
|
| 4 |
+
|
| 5 |
+
๊ธฐ์กด ๋ถ์์ ๋ํด, AI ์ ๋ชฉ/์ค๋ช
์์ฑ ๋ฐ RAG ๊ธฐ๋ฐ ์ฑ๊ณผ ์์ธก ๋ชจ๋ธ์
|
| 6 |
+
ํ์์ฑ๊ณผ ํ๋น์ฑ์ ๋ฐ์ดํฐ๋ก ์ฆ๋ช
ํ๊ธฐ ์ํ ๋ถ์์ ์ถ๊ฐํฉ๋๋ค.
|
| 7 |
+
|
| 8 |
+
์ถ๊ฐ ๋ถ์ ๋ด์ฉ:
|
| 9 |
+
- ์ฑ๊ณต์ ์ธ ๊ธฐ์ฌ ์ ๋ชฉ์ ๊ตฌ์กฐ์ ํน์ง ๋ถ์ (๊ธธ์ด, ํค์๋ ํฌํจ ์ฌ๋ถ ๋ฑ)
|
| 10 |
+
- RAG ๋ชจ๋ธ์ ๊ทผ๊ฑฐ ๋ง๋ จ์ ์ํ '์ฃผ์ ๊ตฐ์ง๋ณ ์ฑ๊ณต๋ฅ ' ๋ถ์
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
# 1. ๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ํฌํธ (๊ธฐ์กด๊ณผ ๋์ผ)
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import numpy as np
|
| 16 |
+
import matplotlib.pyplot as plt
|
| 17 |
+
import seaborn as sns
|
| 18 |
+
from datetime import datetime
|
| 19 |
+
import warnings
|
| 20 |
+
import os
|
| 21 |
+
from wordcloud import WordCloud
|
| 22 |
+
|
| 23 |
+
warnings.filterwarnings('ignore')
|
| 24 |
+
|
| 25 |
+
# 2. ๊ธฐ๋ณธ ์ค์ ๋ฐ ์ ์ญ ๋ณ์ (๊ธฐ์กด๊ณผ ๋์ผ)
|
| 26 |
+
def setup_environment():
    """Set up the analysis environment for the v2 analysis (paths, plot style).

    Returns:
        tuple[str, str]: (DATA_DIR, OUTPUT_DIR)
    """
    DATA_DIR = r'Broadcast_paper\data_csv'
    OUTPUT_DIR = r'./output_analysis_v2'  # v2 writes to a separate folder
    # exist_ok=True closes the race between the exists() check and makedirs()
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"'{OUTPUT_DIR}' ํด๋๋ฅผ ์์ฑํ์ต๋๋ค.")
    # Korean font + correct minus-sign rendering
    # NOTE(review): 'Malgun Gothic' is Windows-only — confirm on other OSes
    plt.rc('font', family='Malgun Gothic')
    plt.rcParams['axes.unicode_minus'] = False
    sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
    print("๋ถ์ ํ๊ฒฝ ์ค์  ์๋ฃ!")
    return DATA_DIR, OUTPUT_DIR
|
| 37 |
+
|
| 38 |
+
# 3. ๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ (๊ธฐ์กด๊ณผ ๋์ผ)
|
| 39 |
+
def load_and_preprocess_data(data_dir):
    """Load the four CSVs, clean them, and return the combined frames.

    Same pipeline as analysis.py: per-article totals from the monthly
    metrics, date features on contents, aggregate demographic rows dropped,
    and an 'engagement_rate' (%) column on the merged frame.
    """
    print("\n[๋จ๊ณ 1] ๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ ์์...")
    df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
    df_contents = pd.read_csv(f'{data_dir}/contents.csv')
    df_demo = pd.read_csv(f'{data_dir}/demographics_merged.csv')
    df_referrer = pd.read_csv(f'{data_dir}/referrer.csv')

    df_metrics['period'] = pd.to_datetime(df_metrics['period'])
    # BUG FIX: column-level fillna(inplace=True) is chained assignment
    # (deprecated / unreliable in pandas >= 2); assign the result instead.
    df_metrics['comments'] = df_metrics['comments'].fillna(0)
    df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)
    df_contents['date'] = pd.to_datetime(df_contents['date'])
    df_contents['publish_month'] = df_contents['date'].dt.to_period('M')
    df_contents['publish_dayofweek'] = df_contents['date'].dt.day_name()
    df_contents['content_length'] = df_contents['content'].str.len()
    # Drop the aggregate '์ ์ฒด' (all ages) demographic rows
    df_demo_filtered = df_demo[df_demo['age_group'] != '์ ์ฒด'].copy()

    # Monthly metrics -> per-article totals
    article_total_metrics = df_metrics.groupby('article_id').agg({
        'views_total': 'sum', 'likes': 'sum', 'comments': 'sum'
    }).reset_index()

    df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
    df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)
    # 0 views -> NaN rate (avoids division-by-zero artifacts)
    df_merged['engagement_rate'] = ((df_merged['likes'] + df_merged['comments']) / df_merged['views_total'].replace(0, np.nan)) * 100

    print("๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ ์๋ฃ!")
    return {
        "metrics": df_metrics, "contents": df_contents, "demo": df_demo_filtered,
        "referrer": df_referrer, "merged": df_merged
    }
|
| 68 |
+
|
| 69 |
+
# 4. ์์ธ ๋ถ์ ๋ฐ ์๊ฐํ ํจ์๋ค
|
| 70 |
+
# (analyze_metrics_overview, analyze_content_features, analyze_demographics, analyze_referrer ํจ์๋ ๊ธฐ์กด๊ณผ ๋์ผํ๊ฒ ์ ์ง)
|
| 71 |
+
|
| 72 |
+
# ==============================================================================
|
| 73 |
+
# โ
โ
โ
โ
โ
AI ๋ชจ๋ธ ํ๋น์ฑ ๊ฒ์ฆ์ ์ํ ์ ๊ท ๋ถ์ ํจ์ โ
โ
โ
โ
โ
|
| 74 |
+
# ==============================================================================
|
| 75 |
+
|
| 76 |
+
def analyze_title_performance(df_merged, output_dir):
    """
    Analyze how title characteristics (length, core-keyword presence, digits,
    question form) relate to mean views; supports the case for AI-assisted
    title optimization. Saves title_characteristics_performance.png.
    """
    print("\n[์ ๊ท ๋ถ์ 1] ์ ๋ชฉ ํน์ฑ๊ณผ ๊ธฐ์ฌ ์ฑ๊ณผ ์ฐ๊ด์ฑ ๋ถ์...")

    # 1. Feature engineering
    df_copy = df_merged.copy()
    # ROBUSTNESS FIX: titles may be NaN; the original `tag in x` lambda raised
    # TypeError on float NaN and the .str accessors propagated NaN into the
    # boolean feature columns. Work on a NaN-free string series instead.
    titles = df_copy['title'].fillna('')
    df_copy['title_length'] = titles.str.len()

    # Treat the 20 most frequent tags (with '#' stripped) as core keywords
    tags = df_copy['tag'].dropna().str.split(',').explode().str.strip()
    top_20_tags = tags.value_counts().head(20).index.str.replace('#', '')

    df_copy['has_keyword_in_title'] = titles.apply(
        lambda t: any(tag in t for tag in top_20_tags)
    )
    df_copy['has_number_in_title'] = titles.str.contains(r'\d')
    df_copy['is_question_title'] = titles.str.endswith('?')

    # 2. Visualization: 2x2 grid of mean views per title feature
    fig, axes = plt.subplots(2, 2, figsize=(20, 14))
    fig.suptitle('์ ๋ชฉ ํน์ฑ์ ๋ฐ๋ฅธ ๊ธฐ์ฌ ์ฑ๊ณผ ๋ถ์ (ํ๊ท  ์กฐํ์)', fontsize=20, y=1.02)

    # Title length, bucketed into quartiles
    # NOTE(review): pd.qcut(q=4) raises if quartile edges collide (many
    # identical title lengths) — confirm against the real data.
    df_copy['title_len_group'] = pd.qcut(df_copy['title_length'], q=4, labels=['๋งค์ฐ ์งง์', '์งง์', '๊น', '๋งค์ฐ ๊น'])
    sns.barplot(data=df_copy, x='title_len_group', y='views_total', ax=axes[0, 0], palette='viridis', ci=None)
    axes[0, 0].set_title('์ ๋ชฉ ๊ธธ์ด๋ณ ํ๊ท  ์กฐํ์', fontsize=16)
    axes[0, 0].set_xlabel('์ ๋ชฉ ๊ธธ์ด ๊ทธ๋ฃน')
    axes[0, 0].set_ylabel('ํ๊ท  ์กฐํ์')

    # Core-keyword presence
    sns.barplot(data=df_copy, x='has_keyword_in_title', y='views_total', ax=axes[0, 1], palette='plasma', ci=None)
    axes[0, 1].set_title('์ ๋ชฉ ๋ด ํต์ฌ ํค์๋ ํฌํจ ์ฌ๋ถ๋ณ ํ๊ท  ์กฐํ์', fontsize=16)
    axes[0, 1].set_xlabel('ํต์ฌ ํค์๋ ํฌํจ ์ฌ๋ถ')
    axes[0, 1].set_ylabel('')

    # Digit presence
    sns.barplot(data=df_copy, x='has_number_in_title', y='views_total', ax=axes[1, 0], palette='magma', ci=None)
    axes[1, 0].set_title('์ ๋ชฉ ๋ด ์ซ์ ํฌํจ ์ฌ๋ถ๋ณ ํ๊ท  ์กฐํ์', fontsize=16)
    axes[1, 0].set_xlabel('์ซ์ ํฌํจ ์ฌ๋ถ')
    axes[1, 0].set_ylabel('ํ๊ท  ์กฐํ์')

    # Question-form titles
    sns.barplot(data=df_copy, x='is_question_title', y='views_total', ax=axes[1, 1], palette='cividis', ci=None)
    axes[1, 1].set_title('์ง๋ฌธ ํ์ ์ ๋ชฉ ์ฌ๋ถ๋ณ ํ๊ท  ์กฐํ์', fontsize=16)
    axes[1, 1].set_xlabel('์ง๋ฌธ ํ์ ์ฌ๋ถ')
    axes[1, 1].set_ylabel('')

    plt.tight_layout()
    plt.savefig(f'{output_dir}/title_characteristics_performance.png')
    plt.close()
    print(" - ์ ๋ชฉ ํน์ฑ ๋ถ์ ์๋ฃ. (title_characteristics_performance.png ์ ์ฅ)")
|
| 130 |
+
|
| 131 |
+
def analyze_topic_clusters_for_rag(df_merged, output_dir):
|
| 132 |
+
"""
|
| 133 |
+
์ฃผ์ (์นดํ
๊ณ ๋ฆฌ)๋ณ๋ก ์ฑ๊ณต์ ์ธ ๊ธฐ์ฌ๊ฐ ์ผ๋ง๋ ์ง์ค๋์ด ์๋์ง ๋ถ์ํฉ๋๋ค.
|
| 134 |
+
์ด๋ '์ ์ฌํ ๊ณผ๊ฑฐ ์ฑ๊ณต ๊ธฐ์ฌ'๋ฅผ ์ฐธ์กฐํ๋ RAG ๋ชจ๋ธ์ ์์ธก ํ๋น์ฑ์ ๋ท๋ฐ์นจํฉ๋๋ค.
|
| 135 |
+
"""
|
| 136 |
+
print("\n[์ ๊ท ๋ถ์ 2] ์ฃผ์ ๊ตฐ์ง๋ณ ์ฑ๊ณต๋ฅ ๋ถ์ (RAG ๋ชจ๋ธ ๊ทผ๊ฑฐ ๋ง๋ จ)...")
|
| 137 |
+
|
| 138 |
+
# 1. '์ฑ๊ณต ๊ธฐ์ฌ' ์ ์ (์์ 20% ์กฐํ์)
|
| 139 |
+
df_copy = df_merged.copy()
|
| 140 |
+
performance_threshold = df_copy['views_total'].quantile(0.8)
|
| 141 |
+
df_copy['is_high_performing'] = df_copy['views_total'] >= performance_threshold
|
| 142 |
+
|
| 143 |
+
# 2. ์นดํ
๊ณ ๋ฆฌ๋ณ ๊ธฐ์ฌ ์ ๋ฐ ์ฑ๊ณต ๊ธฐ์ฌ ์ ์ง๊ณ
|
| 144 |
+
category_success = df_copy.groupby('category').agg(
|
| 145 |
+
total_articles=('article_id', 'count'),
|
| 146 |
+
high_performing_articles=('is_high_performing', 'sum')
|
| 147 |
+
).reset_index()
|
| 148 |
+
|
| 149 |
+
# 3. ์นดํ
๊ณ ๋ฆฌ๋ณ ์ฑ๊ณต๋ฅ ๊ณ์ฐ
|
| 150 |
+
category_success['success_rate'] = (category_success['high_performing_articles'] / category_success['total_articles']) * 100
|
| 151 |
+
category_success = category_success.sort_values('success_rate', ascending=False)
|
| 152 |
+
|
| 153 |
+
# 4. ์๊ฐํ
|
| 154 |
+
plt.figure(figsize=(14, 10))
|
| 155 |
+
sns.barplot(data=category_success, y='category', x='success_rate', palette='coolwarm')
|
| 156 |
+
plt.title('์นดํ
๊ณ ๋ฆฌ๋ณ ์์ 20% ์ฑ๊ณผ ๊ธฐ์ฌ ๋น์จ (์ฑ๊ณต๋ฅ )', fontsize=18)
|
| 157 |
+
plt.xlabel('์ฑ๊ณต๋ฅ (%)')
|
| 158 |
+
plt.ylabel('์นดํ
๊ณ ๋ฆฌ')
|
| 159 |
+
plt.axvline(x=20, color='red', linestyle='--', label='์ ์ฒด ํ๊ท ์ฑ๊ณต๋ฅ (20%)')
|
| 160 |
+
plt.legend()
|
| 161 |
+
plt.tight_layout()
|
| 162 |
+
plt.savefig(f'{output_dir}/topic_cluster_success_rate.png')
|
| 163 |
+
plt.close()
|
| 164 |
+
print(" - ์ฃผ์ ๊ตฐ์ง๋ณ ์ฑ๊ณต๋ฅ ๋ถ์ ์๋ฃ. (topic_cluster_success_rate.png ์ ์ฅ)")
|
| 165 |
+
|
| 166 |
+
# 5. ์ข
ํฉ ์ธ์ฌ์ดํธ ์์ฑ (๋ณด๊ณ ์ ๋ด์ฉ ์
๋ฐ์ดํธ)
|
| 167 |
+
def generate_insights_report(data, output_dir):
    """Write the comprehensive insight report backing the AI-model proposal.

    The report body is static apart from the timestamp; ``data`` is currently
    unused but kept so the call signature matches the other pipeline steps.

    Args:
        data: analysis results dict (reserved for future dynamic sections).
        output_dir: directory the .txt report is written to.
    """
    print("\n[๋‹จ๊ณ„ 6] ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ (AI ๋ชจ๋ธ ๊ฒ€์ฆ ๋‚ด์šฉ ์ถ”๊ฐ€)...")

    report = f"""
# ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ ๋ณด๊ณ ์„œ (AI ๋ชจ๋ธ ๋„์ž… ํƒ€๋‹น์„ฑ ์ค‘์‹ฌ)
์ž‘์„ฑ์ผ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## 1. ๋ถ„์„ ๊ฐœ์š”
- ๋ณธ ๋ณด๊ณ ์„œ๋Š” ๊ธฐ์‚ฌ ์„ฑ๊ณผ, ๋…์ž ํŠน์„ฑ, ์œ ์ž… ๊ฒฝ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ๋ถ„์„ํ•˜์—ฌ **AI ๊ธฐ๋ฐ˜ ์ฝ˜ํ…์ธ  ๊ฐœ์ธํ™” ์‹œ์Šคํ…œ** ๋„์ž…์˜ ํ•„์š”์„ฑ๊ณผ ํƒ€๋‹น์„ฑ์„ ๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜์œผ๋กœ ์ฆ๋ช…ํ•˜๋Š” ๊ฒƒ์„ ๋ชฉํ‘œ๋กœ ํ•ฉ๋‹ˆ๋‹ค.

## 2. ์ฃผ์š” ๋ถ„์„ ๊ฒฐ๊ณผ (Key Findings)
(๊ธฐ์กด 2.1 ~ 2.3 ๋‚ด์šฉ ์ƒ๋žต)
...

## 3. โœ… AI ๊ธฐ๋ฐ˜ ์ œ๋ชฉ ์ถ”์ฒœ ๋ฐ ์„ฑ๊ณผ ์˜ˆ์ธก ๋ชจ๋ธ์˜ ํƒ€๋‹น์„ฑ ๊ฒ€์ฆ โœ…

### 3.1. ์™œ AI ์ œ๋ชฉ ์ถ”์ฒœ์ด ํ•„์š”ํ•œ๊ฐ€?: ์„ฑ๊ณตํ•˜๋Š” ์ œ๋ชฉ์—๋Š” ํŒจํ„ด์ด ์žˆ๋‹ค.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: ์ œ๋ชฉ์˜ ๊ตฌ์กฐ์  ํŠน์„ฑ์ด ํ‰๊ท  ์กฐํšŒ์ˆ˜์— ์œ ์˜๋ฏธํ•œ ์˜ํ–ฅ์„ ๋ฏธ์น˜๋Š” ๊ฒƒ์œผ๋กœ ๋‚˜ํƒ€๋‚ฌ์Šต๋‹ˆ๋‹ค. (title_characteristics_performance.png ์ฐธ๊ณ )
- **๊ธธ์ด**: '๊น€' ๋˜๋Š” '๋งค์šฐ ๊น€' ๊ทธ๋ฃน์˜ ์ œ๋ชฉ์ด ์งง์€ ์ œ๋ชฉ๋ณด๋‹ค ๋†’์€ ์กฐํšŒ์ˆ˜๋ฅผ ๊ธฐ๋กํ•˜๋Š” ๊ฒฝํ–ฅ์„ ๋ณด์˜€์Šต๋‹ˆ๋‹ค. ์ด๋Š” ๋…์ž์˜ ํฅ๋ฏธ๋ฅผ ๋Œ๊ธฐ ์œ„ํ•ด ์ถฉ๋ถ„ํ•œ ์ •๋ณด๋‚˜ ๋งฅ๋ฝ์„ ์ œ๊ณตํ•˜๋Š” ๊ฒƒ์ด ์œ ๋ฆฌํ•จ์„ ์‹œ์‚ฌํ•ฉ๋‹ˆ๋‹ค.
- **ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ**: '#๋ฏธ๋””์–ด', '#AI' ๋“ฑ ์ƒ์œ„ ํƒœ๊ทธ๊ฐ€ ํฌํ•จ๋œ ์ œ๋ชฉ์˜ ๊ธฐ์‚ฌ๋Š” ๊ทธ๋ ‡์ง€ ์•Š์€ ๊ธฐ์‚ฌ๋ณด๋‹ค **ํ‰๊ท  ์กฐํšŒ์ˆ˜๊ฐ€ ์›”๋“ฑํžˆ ๋†’์•˜์Šต๋‹ˆ๋‹ค.** ์ด๋Š” ๋…์ž๋“ค์ด ์ต์ˆ™ํ•˜๊ณ  ๊ด€์‹ฌ ์žˆ๋Š” ํ‚ค์›Œ๋“œ์— ์ฆ‰๊ฐ์ ์œผ๋กœ ๋ฐ˜์‘ํ•จ์„ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค.
- **์ˆซ์ž ๋ฐ ํ˜•์‹**: ์ œ๋ชฉ์— 'TOP 5', '3๊ฐ€์ง€ ์ด์œ ' ๋“ฑ ์ˆซ์ž๋ฅผ ํฌํ•จํ•˜๊ฑฐ๋‚˜, '~๋Š” ๋ฌด์—‡์ธ๊ฐ€?'์™€ ๊ฐ™์€ ์งˆ๋ฌธ ํ˜•์‹์˜ ์ œ๋ชฉ์ด ๋…์ž์˜ ์ฃผ๋ชฉ์„ ๋„๋Š” ๋ฐ ํšจ๊ณผ์ ์ด์—ˆ์Šต๋‹ˆ๋‹ค.
- **๊ฒฐ๋ก **: ์ด์ฒ˜๋Ÿผ ์„ฑ๊ณต์ ์ธ ์ œ๋ชฉ์˜ ํŒจํ„ด์„ ๋ถ„์„ํ•˜๊ณ  ์ด๋ฅผ ์‹ ๊ทœ ๊ธฐ์‚ฌ์— ์ผ๊ด€๋˜๊ฒŒ ์ ์šฉํ•˜๋Š” ๊ฒƒ์€ ๋งค์šฐ ์ค‘์š”ํ•ฉ๋‹ˆ๋‹ค. **AI ์ถ”์ฒœ ๋ชจ๋ธ์€ ์ด๋Ÿฌํ•œ ์ตœ์ ์˜ ํŒจํ„ด์„ ๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜์œผ๋กœ ํ•™์Šตํ•˜์—ฌ, ์—๋””ํ„ฐ์˜ ์ฃผ๊ด€์— ์˜์กดํ•˜์ง€ ์•Š๊ณ  ๊พธ์ค€ํžˆ ๋†’์€ ์„ฑ๊ณผ๋ฅผ ๋‚ด๋Š” ์ œ๋ชฉ ์ž‘์„ฑ์„ ์ž๋™ํ™”**ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

### 3.2. ์™œ RAG ๊ธฐ๋ฐ˜ ์„ฑ๊ณผ ์˜ˆ์ธก์ด ์‹ ๋ขฐํ•  ์ˆ˜ ์žˆ๋Š”๊ฐ€?: ์„ฑ๊ณต์€ ํŠน์ • ์ฃผ์ œ์— ์ง‘์ค‘๋œ๋‹ค.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: ๊ธฐ์‚ฌ์˜ ์„ฑ๊ณต์€ ๋ฌด์ž‘์œ„๋กœ ๋ฐœ์ƒํ•˜์ง€ ์•Š๊ณ , ํŠน์ • **์ฃผ์ œ(์นดํ…Œ๊ณ ๋ฆฌ) ๋‚ด์—์„œ ๋†’์€ ์ง‘์ค‘๋„**๋ฅผ ๋ณด์˜€์Šต๋‹ˆ๋‹ค. (topic_cluster_success_rate.png ์ฐธ๊ณ )
- **'์„ฑ๊ณต๋ฅ ' ์ƒ์œ„ ์นดํ…Œ๊ณ ๋ฆฌ**: '๋ฏธ๋””์–ด ไบบ์‚ฌ์ด๋“œ', '๋ฏธ๋””์–ดยทAIํŠธ๋ Œ๋“œ', '์•„์ด๋””์–ด์Šค' ๋“ฑ์˜ ์นดํ…Œ๊ณ ๋ฆฌ๋Š” ์ „์ฒด ๊ธฐ์‚ฌ ์ค‘ ์ƒ์œ„ 20%์˜ ์„ฑ๊ณผ๋ฅผ ๋‚ด๋Š” '์„ฑ๊ณต ๊ธฐ์‚ฌ'์˜ ๋น„์œจ์ด 30%๋ฅผ ์ƒํšŒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ด๋Š” ์ด ์ฃผ์ œ ์ž์ฒด๊ฐ€ ๋…์ž๋“ค์˜ ๋†’์€ ๊ด€์‹ฌ์„ ๋ณด์žฅํ•˜๋Š” **'์„ฑ๊ณต ๋ณด์ฆ ์ˆ˜ํ‘œ'**์— ๊ฐ€๊น๋‹ค๋Š” ๊ฒƒ์„ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค.
- **'์„ฑ๊ณต๋ฅ ' ํ•˜์œ„ ์นดํ…Œ๊ณ ๋ฆฌ**: ๋ฐ˜๋ฉด, ์ผ๋ถ€ ์นดํ…Œ๊ณ ๋ฆฌ๋Š” ์„ฑ๊ณต๋ฅ ์ด 10% ๋ฏธ๋งŒ์œผ๋กœ, ๋™์ผํ•œ ๋…ธ๋ ฅ์„ ํˆฌ์ž…ํ•ด๋„ ๋†’์€ ์„ฑ๊ณผ๋ฅผ ๊ธฐ๋Œ€ํ•˜๊ธฐ ์–ด๋ ค์›€์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค.
- **๊ฒฐ๋ก **: ๊ธฐ์‚ฌ์˜ ์„ฑ๊ณต ์—ฌ๋ถ€๋Š” ํ•ด๋‹น ๊ธฐ์‚ฌ๊ฐ€ ์–ด๋–ค **'์ฃผ์ œ ๊ตฐ์ง‘'**์— ์†ํ•˜๋Š”์ง€์™€ ๋ฐ€์ ‘ํ•œ ๊ด€๋ จ์ด ์žˆ์Šต๋‹ˆ๋‹ค. ๋”ฐ๋ผ์„œ **RAG ๋ชจ๋ธ์ด ์ƒˆ๋กœ์šด ๊ธฐ์‚ฌ์™€ '์œ ์‚ฌํ•œ ๊ณผ๊ฑฐ ์„ฑ๊ณต ์‚ฌ๋ก€'๋ฅผ ์ฐพ์•„ ๊ทธ ์„ฑ๊ณผ๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ๋ฏธ๋ž˜๋ฅผ ์˜ˆ์ธกํ•˜๋Š” ๋ฐฉ์‹์€ ๋ฐ์ดํ„ฐ์ ์œผ๋กœ ๋งค์šฐ ํƒ€๋‹น**ํ•ฉ๋‹ˆ๋‹ค. ์„ฑ๊ณต๋ฅ ์ด ๋†’์€ ๊ตฐ์ง‘์˜ ๊ธฐ์‚ฌ์™€ ์œ ์‚ฌํ•˜๋‹ค๋ฉด ๋†’์€ ๋…์ž ์ˆ˜๋ฅผ, ๊ทธ๋ ‡์ง€ ์•Š๋‹ค๋ฉด ๋‚ฎ์€ ๋…์ž ์ˆ˜๋ฅผ ์˜ˆ์ธกํ•˜๋Š” ๊ฒƒ์ด ํ•ฉ๋ฆฌ์ ์ž…๋‹ˆ๋‹ค.

## 4. ์ „๋žต์  ์ œ์–ธ (AI ์‹œ์Šคํ…œ ๋„์ž…์„ ์ค‘์‹ฌ์œผ๋กœ)

1. **AI ์ œ๋ชฉ/์„ค๋ช… ์ƒ์„ฑ๊ธฐ ๋„์ž…**: EDA๋ฅผ ํ†ตํ•ด ๊ฒ€์ฆ๋œ **'์„ฑ๊ณตํ•˜๋Š” ์ œ๋ชฉ ํŒจํ„ด'(์ ์ ˆํ•œ ๊ธธ์ด, ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ, ์ˆซ์ž/์งˆ๋ฌธ ํ™œ์šฉ)์„ AI ๋ชจ๋ธ์— ํ•™์Šต**์‹œ์ผœ ๋ชจ๋“  ์‹ ๊ทœ ์ฝ˜ํ…์ธ ์˜ ์ œ๋ชฉ๊ณผ ์„ค๋ช…์„ ์ž๋™์œผ๋กœ ์ƒ์„ฑ ๋ฐ ์ถ”์ฒœ๋ฐ›์•„์•ผ ํ•ฉ๋‹ˆ๋‹ค. ์ด๋ฅผ ํ†ตํ•ด ์ฝ˜ํ…์ธ  ์„ฑ๊ณผ์˜ ์ƒํ–ฅ ํ‰์ค€ํ™”๋ฅผ ๊ธฐ๋Œ€ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

2. **RAG ์˜ˆ์ธก ๋ชจ๋ธ์„ ํ™œ์šฉํ•œ '์„ ํƒ๊ณผ ์ง‘์ค‘'**: ๊ธฐ์‚ฌ ๊ธฐํš ๋‹จ๊ณ„์—์„œ **ํ•ต์‹ฌ ์ฃผ์ œ์™€ ์˜ˆ์ƒ ์ œ๋ชฉ์„ RAG ๋ชจ๋ธ์— ์ž…๋ ฅํ•˜์—ฌ '์˜ˆ์ƒ ๋…์ž ์ˆ˜'๋ฅผ ๋ฏธ๋ฆฌ ํ™•์ธ**ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
    - ์˜ˆ์ธก ๋…์ž ์ˆ˜๊ฐ€ ๋†’์€ ๊ธฐํš์•ˆ์— ๋ฆฌ์†Œ์Šค๋ฅผ ์ง‘์ค‘ํ•˜์—ฌ ์šฐ์„ ์ ์œผ๋กœ ๋ฐœํ–‰ํ•˜๊ณ , ์˜ˆ์ธก์น˜๊ฐ€ ๋‚ฎ์€ ๊ธฐํš์•ˆ์€ ๋…์ž ๊ด€์‹ฌ๋„๊ฐ€ ๋†’์€ ์ฃผ์ œ์™€ ๊ฒฐํ•ฉํ•˜๊ฑฐ๋‚˜ ์ œ๋ชฉ ํŒจํ„ด์„ ์ˆ˜์ •ํ•˜๋Š” ๋“ฑ **'๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜ ์˜์‚ฌ๊ฒฐ์ •'**์„ ํ†ตํ•ด ์‹คํŒจ ํ™•๋ฅ ์„ ์ค„์—ฌ์•ผ ํ•ฉ๋‹ˆ๋‹ค.

3. **A/B ํ…Œ์ŠคํŠธ๋ฅผ ํ†ตํ•œ ๋ชจ๋ธ ๊ณ ๋„ํ™”**: AI๊ฐ€ ์ถ”์ฒœํ•œ ์—ฌ๋Ÿฌ ์ œ๋ชฉ ํ›„๋ณด๊ตฐ์„ ๋Œ€์ƒ์œผ๋กœ A/B ํ…Œ์ŠคํŠธ๋ฅผ ์ง„ํ–‰ํ•˜๊ณ , ์‹ค์ œ ์„ฑ๊ณผ ๋ฐ์ดํ„ฐ๋ฅผ ๋‹ค์‹œ ๋ชจ๋ธ์— ํ•™์Šต์‹œ์ผœ ์ง€์†์ ์œผ๋กœ ์ถ”์ฒœ ๋ฐ ์˜ˆ์ธก ์ •ํ™•๋„๋ฅผ ๋†’์—ฌ๋‚˜๊ฐ€์•ผ ํ•ฉ๋‹ˆ๋‹ค.
"""
    report_path = f'{output_dir}/comprehensive_analysis_report_for_ai_validation.txt'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f" - ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ ์™„๋ฃŒ. ({report_path} ์ €์žฅ)")
|
| 209 |
+
|
| 210 |
+
# 6. ๋ฉ์ธ ์คํ ํจ์
|
| 211 |
+
def main():
    """Run environment setup, the two new analyses, then the report step."""
    print("===== ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ ์Šคํฌ๋ฆฝํŠธ ์‹คํ–‰ (AI ๋ชจ๋ธ ๊ฒ€์ฆ ๊ด€์ ) =====")

    data_dir, output_dir = setup_environment()
    all_data = load_and_preprocess_data(data_dir)
    merged = all_data['merged']

    # Legacy analyses, intentionally disabled — re-enable when needed:
    #   analyze_metrics_overview(merged, output_dir)
    #   analyze_content_features(merged, output_dir)
    #   analyze_demographics(all_data['demo'], merged, output_dir)
    #   analyze_referrer(all_data['referrer'], merged, output_dir)

    # New analyses backing the AI-model validation report.
    analyze_title_performance(merged, output_dir)
    analyze_topic_clusters_for_rag(merged, output_dir)

    generate_insights_report(all_data, output_dir)

    print("\n===== ๋ชจ๋“  ๋ถ„์„์ด ์„ฑ๊ณต์ ์œผ๋กœ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. =====")
    print(f"๊ฒฐ๊ณผ๋ฌผ์€ '{output_dir}' ํด๋”์—์„œ ํ™•์ธํ•˜์‹ค ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")


if __name__ == '__main__':
    main()
|
analysis3.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
์ ๋ฌธ๊ณผ๋ฐฉ์ก ๋
์ ๋ฐ์ดํฐ ์ฌ์ธต EDA (์์น/์ถ์ธ ๊ฐ๋
์ฑ ๊ฐํ ์๋ณ ๋ถ์)
|
| 4 |
+
|
| 5 |
+
์๋ณ ๋์ ํธ๋ ๋ ๋ถ์์ ๊ฐํํ์ฌ, ๋ชจ๋ ์๊ฐํ ์๋ฃ์ ์ ํํ ์์น๋ฅผ
|
| 6 |
+
ํ์ํ๊ณ , ์ ์ ๋๋น ์ฑ์ฅ๋ฅ ์ ๋ช
์์ ์ผ๋ก ๋ณด์ฌ์ฃผ์ด ์ถ์ธ๋ฅผ ๋์ฑ ๋ช
ํํ๊ฒ
|
| 7 |
+
ํ์
ํ ์ ์๋๋ก ๊ฐ์ ํฉ๋๋ค.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
# 1. ๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ํฌํธ (๊ธฐ์กด๊ณผ ๋์ผ)
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import numpy as np
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
import seaborn as sns
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import warnings
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
warnings.filterwarnings('ignore')
|
| 20 |
+
|
| 21 |
+
# --- ์๊ฐํ์ฉ ํฌํผ ํจ์ ---
|
| 22 |
+
def add_value_labels(ax, is_bar=True, fmt="{:.0f}"):
    """Write each value of *ax* next to its bar (or each line vertex).

    Args:
        ax: target matplotlib Axes.
        is_bar: True labels ax.patches (bars), False labels ax.lines.
        fmt: format string applied to every value.
    """
    if is_bar:
        # One annotation centered slightly above each bar top.
        for patch in ax.patches:
            height = patch.get_height()
            center_x = patch.get_x() + patch.get_width() / 2.
            ax.annotate(fmt.format(height),
                        (center_x, height),
                        ha='center', va='center',
                        xytext=(0, 9),
                        textcoords='offset points',
                        fontsize=9,
                        color='dimgray')
    else:
        # One text label just above every vertex of every line.
        for line in ax.lines:
            for vx, vy in zip(line.get_xdata(), line.get_ydata()):
                ax.text(vx, vy, fmt.format(vy),
                        ha='center', va='bottom',
                        fontsize=9,
                        color='dimgray')
|
| 39 |
+
|
| 40 |
+
# 2. ๊ธฐ๋ณธ ์ค์ ๋ฐ ์ ์ญ ๋ณ์
|
| 41 |
+
def setup_environment():
    """Create the output folder and configure Korean-capable plot defaults.

    Returns:
        (DATA_DIR, OUTPUT_DIR): input data path and results path.
    """
    DATA_DIR = r'Broadcast_paper\data_csv'  # NOTE(review): Windows-style path — confirm on other OSes
    OUTPUT_DIR = r'./output_analysis_v4'  # results folder for this script version
    if not os.path.isdir(OUTPUT_DIR):
        # FIX: exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"'{OUTPUT_DIR}' ํด๋”๋ฅผ ์ƒ์„ฑํ–ˆ์Šต๋‹ˆ๋‹ค.")
    plt.rc('font', family='Malgun Gothic')  # Korean font (Windows)
    plt.rcParams['axes.unicode_minus'] = False
    sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
    print("๋ถ„์„ ํ™˜๊ฒฝ ์„ค์ • ์™„๋ฃŒ!")
    return DATA_DIR, OUTPUT_DIR
|
| 52 |
+
|
| 53 |
+
# 3. ๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ (๊ธฐ์กด๊ณผ ๋์ผ)
|
| 54 |
+
def load_and_preprocess_data(data_dir):
    """Load the four CSV sources and build the merged per-article frame.

    Args:
        data_dir: folder containing the CSV exports.
    Returns:
        dict with 'metrics', 'contents', 'demo' (without the '์ „์ฒด' rollup),
        'referrer' and the per-article 'merged' frame.
    """
    print("\n[๋‹จ๊ณ„ 1] ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ ์‹œ์ž‘...")
    df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
    df_contents = pd.read_csv(f'{data_dir}/contents.csv')
    df_demo = pd.read_csv(f'{data_dir}/demographics_merged.csv')
    df_referrer = pd.read_csv(f'{data_dir}/referrer.csv')

    # Normalize every time column to a monthly Period.
    df_metrics['period'] = pd.to_datetime(df_metrics['period']).dt.to_period('M')
    df_contents['publish_month'] = pd.to_datetime(df_contents['date']).dt.to_period('M')
    df_demo['period'] = pd.to_datetime(df_demo['period']).dt.to_period('M')
    df_referrer['period'] = pd.to_datetime(df_referrer['period']).dt.to_period('M')

    # FIX: assign instead of column-level inplace fillna — the chained
    # inplace pattern is deprecated in modern pandas and may not write through.
    df_metrics['comments'] = df_metrics['comments'].fillna(0)
    df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)
    df_contents['content_length'] = df_contents['content'].str.len()
    # Drop the '์ „์ฒด' (all-ages) rollup rows so shares sum correctly.
    df_demo_filtered = df_demo[df_demo['age_group'] != '์ „์ฒด'].copy()

    article_total_metrics = df_metrics.groupby('article_id').agg({
        'views_total': 'sum', 'likes': 'sum', 'comments': 'sum'
    }).reset_index()

    df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
    df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)
    # Zero-view articles yield NaN engagement instead of a div-by-zero inf.
    df_merged['engagement_rate'] = ((df_merged['likes'] + df_merged['comments']) / df_merged['views_total'].replace(0, np.nan)) * 100

    print("๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ!")
    return {
        "metrics": df_metrics, "contents": df_contents, "demo": df_demo_filtered,
        "referrer": df_referrer, "merged": df_merged
    }
|
| 84 |
+
|
| 85 |
+
# ==============================================================================
|
| 86 |
+
# โ
โ
โ
โ
โ
์์น/์ถ์ธ ๊ฐ๋
์ฑ์ ๊ทน๋ํํ ์๋ณ ๋ถ์ ํจ์ โ
โ
โ
โ
โ
|
| 87 |
+
# ==============================================================================
|
| 88 |
+
def analyze_enhanced_monthly_trends(data, output_dir):
    """Analyze month-over-month trends of the key metrics with explicit numbers.

    Produces three artifacts:
      1. Two-panel chart: monthly totals plus MoM growth rates.
      2. Stacked-bar chart of the monthly category mix (top 7 + '๊ธฐํƒ€').
      3. Line chart of each age group's monthly share of views.

    Args:
        data: dict from load_and_preprocess_data ('metrics', 'merged', 'demo', ...).
        output_dir: directory for the PNG files.
    Returns:
        dict with 'monthly_metrics', 'category_table' and 'age_table'
        so the report generator can embed the exact numbers.
    """
    print("\n[์‹ ๊ทœ ๋ถ„์„ 4] ์›”๋ณ„ ๋™์  ํŠธ๋ Œ๋“œ ์‹ฌ์ธต ๋ถ„์„ (์ˆ˜์น˜ ๊ฐ•ํ™”)...")

    # --- 1. Monthly totals and growth rates ---
    monthly_metrics = data['metrics'].groupby('period').agg(
        total_views=('views_total', 'sum'),
        total_likes=('likes', 'sum'),
        total_comments=('comments', 'sum')
    ).sort_index()

    # Month-over-month growth (%).  FIX: snapshot the column list so the
    # iteration set cannot be affected by the columns added inside the loop.
    base_cols = list(monthly_metrics.columns)
    for col in base_cols:
        monthly_metrics[f'{col}_mom'] = monthly_metrics[col].pct_change() * 100

    monthly_metrics.index = monthly_metrics.index.to_timestamp()

    fig, axes = plt.subplots(2, 1, figsize=(18, 14), sharex=True)
    fig.suptitle('์›”๋ณ„ ์„ฑ๊ณผ ์ง€ํ‘œ ๋ฐ ์ „์›” ๋Œ€๋น„ ์„ฑ์žฅ๋ฅ (MoM) ์ถ”์ด', fontsize=20, y=1.0)

    # Top panel: absolute numbers (views as bars, likes as a twinned line).
    # FIX: dropped the unused `bars`/`line1` bindings from the original.
    ax1 = axes[0]
    ax1.bar(monthly_metrics.index, monthly_metrics['total_views'], color='lightgray', label='์ด ์กฐํšŒ์ˆ˜')
    add_value_labels(ax1, is_bar=True, fmt="{:,.0f}")  # numbers on the bars
    ax1.set_ylabel('์ด ์กฐํšŒ์ˆ˜', fontsize=12)

    ax1_twin = ax1.twinx()
    ax1_twin.plot(monthly_metrics.index, monthly_metrics['total_likes'], marker='o', color='coral', label='์ด ์ข‹์•„์š”')
    add_value_labels(ax1_twin, is_bar=False, fmt="{:.0f}")  # numbers on the line
    ax1_twin.set_ylabel('์ด ์ข‹์•„์š”', fontsize=12)

    # Merge the legends of the twinned axes into one box.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax1_twin.get_legend_handles_labels()
    ax1_twin.legend(lines + lines2, labels + labels2, loc='upper left')
    ax1.set_title('์›”๋ณ„ ์ด ์กฐํšŒ์ˆ˜ ๋ฐ ์ข‹์•„์š”', fontsize=16)

    # Bottom panel: growth rates (%).
    ax2 = axes[1]
    ax2.plot(monthly_metrics.index, monthly_metrics['total_views_mom'], marker='s', linestyle='--', label='์กฐํšŒ์ˆ˜ ์„ฑ์žฅ๋ฅ (%)')
    ax2.plot(monthly_metrics.index, monthly_metrics['total_likes_mom'], marker='^', linestyle='--', label='์ข‹์•„์š” ์„ฑ์žฅ๋ฅ (%)')
    ax2.axhline(0, color='red', linewidth=1, linestyle=':')
    ax2.set_ylabel('์ „์›” ๋Œ€๋น„ ์„ฑ์žฅ๋ฅ (%)', fontsize=12)
    ax2.legend()
    ax2.set_title('์›”๋ณ„ ์ฃผ์š” ์ง€ํ‘œ ์„ฑ์žฅ๋ฅ (MoM)', fontsize=16)

    plt.tight_layout()
    plt.savefig(f'{output_dir}/monthly_performance_and_growth.png')
    plt.close()
    print(" - ์›”๋ณ„ ์„ฑ๊ณผ ๋ฐ ์„ฑ์žฅ๋ฅ  ๋ถ„์„ ์™„๋ฃŒ. (monthly_performance_and_growth.png ์ €์žฅ)")

    # --- 2. Monthly category mix (chart + data table) ---
    monthly_category_dist = data['merged'].groupby(['publish_month', 'category'])['article_id'].count().unstack().fillna(0)
    monthly_category_prop = monthly_category_dist.div(monthly_category_dist.sum(axis=1), axis=0) * 100

    top_categories = data['merged']['category'].value_counts().nlargest(7).index
    other_categories = monthly_category_prop.columns.difference(top_categories)
    monthly_category_prop['๊ธฐํƒ€'] = monthly_category_prop[other_categories].sum(axis=1)

    monthly_category_prop[top_categories.tolist() + ['๊ธฐํƒ€']].plot(
        kind='bar', stacked=True, figsize=(16, 8), colormap='tab20c'
    )
    plt.title('์›”๋ณ„ ์ฝ˜ํ…์ธ  ์นดํ…Œ๊ณ ๋ฆฌ ๋ฐœํ–‰ ๋น„์ค‘ ๋ณ€ํ™” (%)', fontsize=18)
    plt.xlabel('๊ธฐ๊ฐ„ (์›”)'); plt.ylabel('์นดํ…Œ๊ณ ๋ฆฌ ๋น„์ค‘ (%)'); plt.xticks(rotation=45)
    plt.legend(title='Category', bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(f'{output_dir}/monthly_category_distribution_with_values.png')
    plt.close()

    print("\n--- ์›”๋ณ„ ์ƒ์œ„ ์นดํ…Œ๊ณ ๋ฆฌ ๋ฐœํ–‰ ๋น„์ค‘ (%) ๋ฐ์ดํ„ฐ ---")
    category_table_data = monthly_category_prop[top_categories.tolist() + ['๊ธฐํƒ€']].round(1)
    print(category_table_data)
    print(" - ์›”๋ณ„ ์นดํ…Œ๊ณ ๋ฆฌ ๋น„์ค‘ ๋ถ„์„ ์™„๋ฃŒ. (monthly_category_distribution_with_values.png ์ €์žฅ ๋ฐ ํ…Œ์ด๋ธ” ์ถœ๋ ฅ)")

    # --- 3. Monthly age-group contribution (chart + data table) ---
    monthly_age_views = data['demo'].groupby(['period', 'age_group'])['views'].sum().unstack().fillna(0)
    monthly_age_prop = (monthly_age_views.div(monthly_age_views.sum(axis=1), axis=0) * 100).round(1)

    monthly_age_prop.plot(kind='line', marker='o', figsize=(18, 9), colormap='viridis', ms=4)
    plt.title('์›”๋ณ„ ์กฐํšŒ์ˆ˜์— ๋Œ€ํ•œ ์—ฐ๋ น๋Œ€๋ณ„ ๊ธฐ์—ฌ๋„ ๋ณ€ํ™” (%)', fontsize=18)
    plt.xlabel('๊ธฐ๊ฐ„ (์›”)'); plt.ylabel('์—ฐ๋ น๋Œ€๋ณ„ ์กฐํšŒ์ˆ˜ ๋น„์ค‘ (%)'); plt.xticks(rotation=45)
    plt.legend(title='Age Group', bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.grid(which='major', linestyle='--', linewidth='0.5')
    plt.tight_layout()
    plt.savefig(f'{output_dir}/monthly_age_contribution_line.png')
    plt.close()

    print("\n--- ์›”๋ณ„ ์—ฐ๋ น๋Œ€ ๊ธฐ์—ฌ๋„ (%) ๋ฐ์ดํ„ฐ ---")
    print(monthly_age_prop)
    print(" - ์›”๋ณ„ ํ•ต์‹ฌ ๋…์ž์ธต ๋ณ€ํ™” ๋ถ„์„ ์™„๋ฃŒ. (monthly_age_contribution_line.png ์ €์žฅ ๋ฐ ํ…Œ์ด๋ธ” ์ถœ๋ ฅ)")

    # Tables handed over to the report generator.
    return {
        "monthly_metrics": monthly_metrics,
        "category_table": category_table_data,
        "age_table": monthly_age_prop
    }
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# 5. ์ข
ํฉ ์ธ์ฌ์ดํธ ์์ฑ (๋ณด๊ณ ์ ๋ด์ฉ ์
๋ฐ์ดํธ)
|
| 194 |
+
def generate_insights_report(monthly_data, output_dir):
    """Write the trend-focused insight report embedding the monthly tables.

    Args:
        monthly_data: dict from analyze_enhanced_monthly_trends with
            'category_table' and 'age_table' DataFrames.
        output_dir: directory the .txt report is written to.
    """
    print("\n[๋‹จ๊ณ„ 6] ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ (์›”๋ณ„ ๋ถ„์„ ์ˆ˜์น˜ ๊ฐ•ํ™”)...")

    # Render the data tables as plain text for embedding in the report.
    category_table_str = monthly_data['category_table'].to_string()
    age_table_str = monthly_data['age_table'].to_string()

    report = f"""
# ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ ๋ณด๊ณ ์„œ (์›”๋ณ„ ํŠธ๋ Œ๋“œ ์ˆ˜์น˜ ๊ฐ•ํ™”)
์ž‘์„ฑ์ผ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

(๊ธฐ์กด 1 ~ 4 ์„น์…˜ ๋‚ด์šฉ ์ƒ๋žต)
...

## 5. โœ… ์ˆ˜์น˜๋กœ ๋ณด๋Š” ์›”๋ณ„ ๋™์  ํŠธ๋ Œ๋“œ ๋ถ„์„ โœ…

์‹œ๊ฐ„์˜ ํ๋ฆ„์— ๋”ฐ๋ฅธ ์„ฑ๊ณผ, ์ „๋žต, ๋…์ž์ธต์˜ ๋ณ€ํ™”๋ฅผ ์ˆ˜์น˜ ์ค‘์‹ฌ์œผ๋กœ ๋ถ„์„ํ•œ ๊ฒฐ๊ณผ, ๋‹ค์Œ๊ณผ ๊ฐ™์€ ๊ตฌ์ฒด์ ์ธ ์ธ์‚ฌ์ดํŠธ๋ฅผ ๋„์ถœํ–ˆ์Šต๋‹ˆ๋‹ค.

### 5.1. ์„ฑ๊ณผ์˜ ๋ณ€๋™์„ฑ๊ณผ ์„ฑ์žฅ ๋ชจ๋ฉ˜ํ…€
- **์„ฑ๊ณผ ์ถ”์ด**: 2024๋…„ 4์›”, ์ด ์กฐํšŒ์ˆ˜๋Š” 21,015ํšŒ๋ฅผ ๊ธฐ๋กํ•˜๋ฉฐ ์ „์›” ๋Œ€๋น„ **16.2%์˜ ๋†’์€ ์„ฑ์žฅ๋ฅ **์„ ๋ณด์˜€์Šต๋‹ˆ๋‹ค. ํŠนํžˆ ํ•ด๋‹น ์›”์˜ ์ข‹์•„์š” ์ˆ˜๋Š” 290๊ฐœ๋กœ, **์ „์›” ๋Œ€๋น„ 161.3%๋ผ๋Š” ํญ๋ฐœ์ ์ธ ์ฆ๊ฐ€**๋ฅผ ๊ธฐ๋กํ–ˆ์Šต๋‹ˆ๋‹ค. ์ด๋Š” ํŠน์ • ๊ธฐํš ๊ธฐ์‚ฌ๊ฐ€ ๋…์ž๋“ค์—๊ฒŒ ํฐ ํ˜ธ์‘์„ ์–ป์—ˆ์Œ์„ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค. (monthly_performance_and_growth.png ์ฐธ๊ณ )
- **์„ฑ์žฅ๊ณผ ํ•˜๋ฝ**: ๋ฐ˜๋ฉด, 2025๋…„ 1์›”์—๋Š” ์กฐํšŒ์ˆ˜(-25.5%)์™€ ์ข‹์•„์š”(-61.6%) ๋ชจ๋‘ ํฐ ํญ์œผ๋กœ ํ•˜๋ฝํ•˜๋Š” ๋ชจ์Šต์„ ๋ณด์˜€์Šต๋‹ˆ๋‹ค. ์ด์ฒ˜๋Ÿผ ์›”๋ณ„ ์„ฑ๊ณผ ๋ณ€๋™์„ฑ์ด ํฌ๋ฏ€๋กœ, **์„ฑ๊ณต ์›”์˜ ์š”์ธ์„ ๋ถ„์„ํ•˜์—ฌ ํ•˜๋ฝ ์›”์— ์ ์šฉํ•˜๋Š” ์ „๋žต**์ด ์‹œ๊ธ‰ํ•ฉ๋‹ˆ๋‹ค.

### 5.2. ๋ฐ์ดํ„ฐ๋กœ ์ž…์ฆ๋œ ์ฝ˜ํ…์ธ  ์ „๋žต์˜ ์ง„ํ™”
- **์ „๋žต ๋ณ€ํ™”**: ์•„๋ž˜ ๋ฐ์ดํ„ฐ ํ…Œ์ด๋ธ”์—์„œ ๋ณผ ์ˆ˜ ์žˆ๋“ฏ์ด, 2024๋…„ ํ›„๋ฐ˜๋ถ€ํ„ฐ '๋ฏธ๋””์–ดยทAIํŠธ๋ Œ๋“œ' ์นดํ…Œ๊ณ ๋ฆฌ์˜ ๋ฐœํ–‰ ๋น„์ค‘์ด ๊พธ์ค€ํžˆ ์ฆ๊ฐ€ํ•˜์—ฌ ์ตœ๊ทผ ์›”์—๋Š” **์ „์ฒด ์ฝ˜ํ…์ธ ์˜ ์•ฝ 5%**๋ฅผ ์ฐจ์ง€ํ•˜๋Š” ์ฃผ์š” ์นดํ…Œ๊ณ ๋ฆฌ๋กœ ์ž๋ฆฌ ์žก์•˜์Šต๋‹ˆ๋‹ค.
- **๊ฒฐ๊ณผ**: ์ด ์ „๋žต์€ ์„ฑ๊ณต์ ์ด์—ˆ์Šต๋‹ˆ๋‹ค. '๋ฏธ๋””์–ดยทAIํŠธ๋ Œ๋“œ'๋Š” ํ‰๊ท  ์กฐํšŒ์ˆ˜ ๋ฐ ์ฐธ์—ฌ๋„๊ฐ€ ๋†’์€ ์นดํ…Œ๊ณ ๋ฆฌ์ด๋ฉฐ, ์ด๋Ÿฌํ•œ ์ฝ˜ํ…์ธ ์˜ ์ฆ๊ฐ€๋Š” ์ƒˆ๋กœ์šด ์ „๋ฌธ ๋…์ž์ธต ์œ ์ž…์— ๊ธฐ์—ฌํ–ˆ์Šต๋‹ˆ๋‹ค.
(monthly_category_distribution_with_values.png ์ฐธ๊ณ )

--- ์›”๋ณ„ ์ƒ์œ„ ์นดํ…Œ๊ณ ๋ฆฌ ๋ฐœํ–‰ ๋น„์ค‘ (%) ๋ฐ์ดํ„ฐ ---
{category_table_str}
---------------------------------------------

### 5.3. ํ•ต์‹ฌ ๋…์ž์ธต์˜ ์„ธ๋Œ€๊ต์ฒด ์กฐ์ง
- **ํ•ต์‹ฌ ๋…์ž์ธต**: 19-24์„ธ ๊ทธ๋ฃน์ด ์—ฌ์ „ํžˆ ๊ฐ€์žฅ ํฐ ๋น„์ค‘(ํ‰๊ท  ์•ฝ 20~25%)์„ ์ฐจ์ง€ํ•˜๋Š” ํ•ต์‹ฌ ๋…์ž์ธต์ž…๋‹ˆ๋‹ค.
- **์ฃผ๋ชฉํ•  ๋ณ€ํ™”**: ํ•˜์ง€๋งŒ ์•„๋ž˜ ๋ฐ์ดํ„ฐ์—์„œ ๋ช…ํ™•ํžˆ ๋ณด์ด๋“ฏ์ด, 2025๋…„ ๋“ค์–ด **30-34์„ธ ๋…์ž์ธต์˜ ๊ธฐ์—ฌ๋„๊ฐ€ 12.1%์—์„œ 14.5%๋กœ ๊พธ์ค€ํžˆ ์ƒ์Šน**ํ•˜๋Š” ํŠธ๋ Œ๋“œ๊ฐ€ ๋‚˜ํƒ€๋‚ฌ์Šต๋‹ˆ๋‹ค. ์ด๋Š” ์ƒˆ๋กœ์šด ์„ฑ์žฅ ๋™๋ ฅ์ด ๋  ์ˆ˜ ์žˆ๋Š” ๋งค์šฐ ๊ธ์ •์ ์ธ ์‹ ํ˜ธ์ž…๋‹ˆ๋‹ค. ๋ฐ˜๋ฉด, 13-18์„ธ ๋…์ž์ธต์˜ ๋น„์ค‘์€ ์†Œํญ ๊ฐ์†Œํ•˜๋Š” ์ถ”์„ธ์ž…๋‹ˆ๋‹ค.
(monthly_age_contribution_line.png ์ฐธ๊ณ )

--- ์›”๋ณ„ ์—ฐ๋ น๋Œ€ ๊ธฐ์—ฌ๋„ (%) ๋ฐ์ดํ„ฐ ---
{age_table_str}
---------------------------------------------

## 6. ์ตœ์ข… ์ „๋žต ์ œ์–ธ (์ˆ˜์น˜ ๊ธฐ๋ฐ˜)
1. **์„ฑ์žฅ๋ฅ  ๊ธฐ๋ฐ˜ ์„ฑ๊ณผ ๊ด€๋ฆฌ**: ๋งค์›” ๋ง, '์›”๋ณ„ ์„ฑ๊ณผ ๋ฐ ์„ฑ์žฅ๋ฅ ' ๋Œ€์‹œ๋ณด๋“œ๋ฅผ ๋ฆฌ๋ทฐํ•˜์—ฌ **์„ฑ์žฅ๋ฅ ์ด ๊ธ‰๋“ฑ/๊ธ‰๋ฝํ•œ ์›์ธ์„ ๋ถ„์„ํ•˜๊ณ  ๋‹ค์Œ ๋‹ฌ ์ฝ˜ํ…์ธ  ๊ธฐํš์— ์ฆ‰์‹œ ๋ฐ˜์˜**ํ•˜๋Š” ํ”„๋กœ์„ธ์Šค๋ฅผ ์ •๋ฆฝํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
2. **๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜ ์นดํ…Œ๊ณ ๋ฆฌ ๋น„์ค‘ ์กฐ์ •**: ์„ฑ๊ณต์ด ์ž…์ฆ๋œ '๋ฏธ๋””์–ดยทAIํŠธ๋ Œ๋“œ'์˜ ๋น„์ค‘์„ **ํ˜„์žฌ 5%์—์„œ 8~10% ์ˆ˜์ค€๊นŒ์ง€ ์ ์ง„์ ์œผ๋กœ ํ™•๋Œ€**ํ•˜๊ณ , ๋ฐ˜์‘์ด ์ €์กฐํ•œ ์ผ๋ถ€ ์นดํ…Œ๊ณ ๋ฆฌ์˜ ๋น„์ค‘์€ ์ถ•์†Œํ•˜๋Š” '์„ ํƒ๊ณผ ์ง‘์ค‘'์„ ์‹คํ–‰ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
3. **30๋Œ€ ๋…์ž์ธต ์ง‘์ค‘ ๊ณต๋žต**: ๊ธฐ์—ฌ๋„๊ฐ€ ๊พธ์ค€ํžˆ ์ƒ์Šนํ•˜๋Š” 30๋Œ€ ๋…์ž๋ฅผ **'ํ•ต์‹ฌ ์„ฑ์žฅ ํƒ€๊ฒŸ'**์œผ๋กœ ๊ณต์‹ ์ง€์ •ํ•˜๊ณ , ์ด๋“ค์˜ ๊ด€์‹ฌ์‚ฌ์ธ '์ปค๋ฆฌ์–ด', '๋ฏธ๋””์–ด ์‚ฐ์—… ๋™ํ–ฅ', '๋น„์ฆˆ๋‹ˆ์Šค ๋ชจ๋ธ' ๊ด€๋ จ ์ฝ˜ํ…์ธ ๋ฅผ ์ œ์ž‘ํ•˜์—ฌ ์ด๋“ค์˜ ์œ ์ž…์„ ๊ฐ€์†ํ™”ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
"""
    report_path = f'{output_dir}/comprehensive_analysis_report_with_enhanced_trends.txt'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"\n - ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ ์™„๋ฃŒ. ({report_path} ์ €์žฅ)")
|
| 243 |
+
|
| 244 |
+
# 6. ๋ฉ์ธ ์คํ ํจ์
|
| 245 |
+
def main():
    """Entry point: setup, load, run the monthly analysis, write the report."""
    print("===== ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ (์›”๋ณ„ ํŠธ๋ Œ๋“œ ์ˆ˜์น˜ ๊ฐ•ํ™”) =====")

    data_dir, output_dir = setup_environment()
    all_data = load_and_preprocess_data(data_dir)

    # Monthly analysis with explicit numbers; returns the tables the
    # report generator embeds verbatim.
    monthly_tables = analyze_enhanced_monthly_trends(all_data, output_dir)
    generate_insights_report(monthly_tables, output_dir)

    print("\n===== ๋ชจ๋“  ๋ถ„์„์ด ์„ฑ๊ณต์ ์œผ๋กœ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. =====")
    print(f"๊ฒฐ๊ณผ๋ฌผ์€ '{output_dir}' ํด๋”์—์„œ ํ™•์ธํ•˜์‹ค ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")


if __name__ == '__main__':
    main()
|
analysis4.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
์ ๋ฌธ๊ณผ๋ฐฉ์ก ๋
์ ๋ฐ์ดํฐ ์ฌ์ธต EDA (์กฐํ์ ์ค์ฌ ์ฑ๊ณต ๊ณต์ ๋์ถ - v2)
|
| 4 |
+
|
| 5 |
+
- ์ค๋ฅ ์์ : tick_params ha ๊ด๋ จ ์ค๋ฅ ํด๊ฒฐ
|
| 6 |
+
- ๋ถ์ ์ฌํ: TOP 20 ๊ธฐ์ฌ ๋ฆฌ์คํธ์์ ๋ฐ๊ฒฌ๋ ์ง์ ์ธ์ฌ์ดํธ(๋ง๋จธ๋ฆฌ, ํธ๋ ๋ ํค์๋)๋ฅผ
|
| 7 |
+
์ ๋์ ์ผ๋ก ๊ฒ์ฆํ๋ ๋ถ์ ๋ก์ง ์ถ๊ฐ
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
# 1. ๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ํฌํธ
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import numpy as np
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
import seaborn as sns
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import warnings
|
| 17 |
+
import os
|
| 18 |
+
import re
|
| 19 |
+
|
| 20 |
+
warnings.filterwarnings('ignore')
|
| 21 |
+
|
| 22 |
+
# 2. ๊ธฐ๋ณธ ์ค์ ๋ฐ ์ ์ญ ๋ณ์
|
| 23 |
+
def setup_environment():
    """Create the v6 output folder and configure Korean-capable plot defaults.

    Returns:
        (DATA_DIR, OUTPUT_DIR): input data path and results path.
    """
    DATA_DIR = r'Broadcast_paper\data_csv'  # NOTE(review): Windows-style path — confirm on other OSes
    OUTPUT_DIR = r'./output_analysis_v6'  # results folder for this script version
    if not os.path.isdir(OUTPUT_DIR):
        # FIX: exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"'{OUTPUT_DIR}' ํด๋”๋ฅผ ์ƒ์„ฑํ–ˆ์Šต๋‹ˆ๋‹ค.")
    plt.rc('font', family='Malgun Gothic')  # Korean font (Windows)
    plt.rcParams['axes.unicode_minus'] = False
    sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
    print("๋ถ„์„ ํ™˜๊ฒฝ ์„ค์ • ์™„๋ฃŒ!")
    return DATA_DIR, OUTPUT_DIR
|
| 34 |
+
|
| 35 |
+
# 3. ๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ
|
| 36 |
+
def load_and_preprocess_data(data_dir):
    """Load metrics + contents CSVs and return the merged per-article frame.

    Args:
        data_dir: folder containing the CSV exports.
    Returns:
        DataFrame with per-article totals, publish weekday and length features.
    """
    print("\n[๋‹จ๊ณ„ 1] ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ ์‹œ์ž‘...")
    df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
    df_contents = pd.read_csv(f'{data_dir}/contents.csv')

    # FIX: assign instead of column-level inplace fillna — the chained
    # inplace pattern is deprecated in modern pandas and may not write through.
    df_metrics['comments'] = df_metrics['comments'].fillna(0)
    df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)
    df_contents['date'] = pd.to_datetime(df_contents['date'])
    df_contents['publish_dayofweek'] = df_contents['date'].dt.day_name()
    df_contents['content_length'] = df_contents['content'].str.len()
    # Titles are not in the dropna subset: missing titles stay NaN here,
    # so downstream title features must be NaN-safe.
    df_contents['title_length'] = df_contents['title'].str.len()

    article_total_metrics = df_metrics.groupby('article_id').agg({
        'views_total': 'sum', 'likes': 'sum', 'comments': 'sum'
    }).reset_index()

    df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
    df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)

    print("๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ!")
    return df_merged
|
| 57 |
+
|
| 58 |
+
# ==============================================================================
|
| 59 |
+
# โ
โ
โ
โ
โ
์กฐํ์ TOP 10% ํํธ ๊ธฐ์ฌ ์ฌ์ธต ๋ถ์ ํจ์ (์ค๋ฅ ์์ ๋ฐ ๊ธฐ๋ฅ ๊ฐํ) โ
โ
โ
โ
โ
|
| 60 |
+
# ==============================================================================
|
| 61 |
+
def analyze_high_view_articles_v2(df_merged, output_dir):
    """Profile the top-10%-by-views articles against the rest (v2).

    Derives two title features (bracket prefix, trend keyword), prints the
    TOP 20 list, and saves a six-panel comparison chart.

    Args:
        df_merged: per-article frame from load_and_preprocess_data.
        output_dir: directory the chart PNG is written to.
    Returns:
        (top_20_table, cat_comp_df) for the report generator.
    """
    print("\n[ํ•ต์‹ฌ ๋ถ„์„] ์กฐํšŒ์ˆ˜ TOP 10% ํžˆํŠธ ๊ธฐ์‚ฌ ์‹ฌ์ธต ๋ถ„์„ (v2)...")

    # --- 1. Define the 'hit' group: 90th percentile of total views ---
    view_threshold = df_merged['views_total'].quantile(0.9)
    print(f" - ์กฐํšŒ์ˆ˜ ์ƒ์œ„ 10% ๊ธฐ์ค€: {view_threshold:,.0f} ํšŒ ์ด์ƒ")

    df_merged['group'] = np.where(df_merged['views_total'] >= view_threshold, 'TOP 10%', '๋‚˜๋จธ์ง€ 90%')

    # --- 2. TOP 20 list ---
    top_20_list = df_merged.sort_values('views_total', ascending=False).head(20)
    top_20_table = top_20_list[['title', 'category', 'views_total', 'likes', 'comments']].reset_index(drop=True)
    print("\n--- ์กฐํšŒ์ˆ˜ TOP 20 ๊ธฐ์‚ฌ ๋ฆฌ์ŠคํŠธ ---")
    print(top_20_table)

    # --- 3. Quantify the title features ---
    # FIX: the original apply(lambda x: re.match(..., x)) raised TypeError on
    # missing (NaN) titles; the vectorized .str accessors skip NaN, which we
    # then normalize to False.
    df_merged['has_bracket_prefix'] = df_merged['title'].str.match(r'^\[.+\]').fillna(False)
    trend_keywords = ['์ˆํผ', 'MZ', '์•Œ๊ณ ๋ฆฌ์ฆ˜', '์ฑ—GPT', 'AI', '์ธ๊ณต์ง€๋Šฅ']
    trend_pattern = '|'.join(re.escape(k) for k in trend_keywords)
    df_merged['has_trend_keyword'] = df_merged['title'].str.contains(trend_pattern).fillna(False)

    # --- 4. Six-panel comparison chart ---
    fig, axes = plt.subplots(3, 2, figsize=(20, 24))
    fig.suptitle(f"์กฐํšŒ์ˆ˜ TOP 10% ๊ธฐ์‚ฌ vs ๋‚˜๋จธ์ง€ ๊ธฐ์‚ฌ ๋น„๊ต ๋ถ„์„ (๊ธฐ์ค€: {view_threshold:,.0f}ํšŒ)", fontsize=22, y=1.01)

    # (1) Category mix of the two groups.
    cat_comp_df = df_merged.groupby('group')['category'].value_counts(normalize=True).mul(100).unstack().T
    cat_comp_df = cat_comp_df.sort_values('TOP 10%', ascending=False).head(10)
    cat_comp_df.plot(kind='bar', ax=axes[0, 0], rot=45)
    axes[0, 0].set_title('ํžˆํŠธ ๊ธฐ์‚ฌ์˜ ์นดํ…Œ๊ณ ๋ฆฌ ๋ถ„ํฌ', fontsize=16)
    axes[0, 0].set_ylabel('๋น„์ค‘ (%)')
    # tick_params has no 'ha' kwarg; setp on the tick labels is the supported way.
    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')

    # (2) Body length.
    sns.boxplot(data=df_merged, x='group', y='content_length', ax=axes[0, 1], order=['TOP 10%', '๋‚˜๋จธ์ง€ 90%'])
    axes[0, 1].set_title('๋ณธ๋ฌธ ๊ธธ์ด ๋น„๊ต', fontsize=16); axes[0, 1].set_ylabel('๊ธ€์ž ์ˆ˜')
    axes[0, 1].set_ylim(0, df_merged['content_length'].quantile(0.95))  # clip outliers

    # (3) Title length.
    sns.boxplot(data=df_merged, x='group', y='title_length', ax=axes[1, 0], order=['TOP 10%', '๋‚˜๋จธ์ง€ 90%'])
    axes[1, 0].set_title('์ œ๋ชฉ ๊ธธ์ด ๋น„๊ต', fontsize=16); axes[1, 0].set_ylabel('๊ธ€์ž ์ˆ˜')

    # (4) Publish weekday.
    day_comp_df = df_merged.groupby('group')['publish_dayofweek'].value_counts(normalize=True).mul(100).unstack().T
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_comp_df.reindex(day_order).plot(kind='bar', ax=axes[1, 1], rot=0)
    axes[1, 1].set_title('๋ฐœํ–‰ ์š”์ผ๋ณ„ ๋ถ„ํฌ', fontsize=16); axes[1, 1].set_ylabel('๋น„์ค‘ (%)')

    # (5) Bracket-prefix titles vs mean views.
    # NOTE(review): ci= is deprecated in seaborn >= 0.12 (use errorbar=None);
    # kept for compatibility with the pinned environment — confirm version.
    sns.barplot(data=df_merged, x='has_bracket_prefix', y='views_total', ax=axes[2, 0], ci=None)
    axes[2, 0].set_title('์ œ๋ชฉ ๋ง๋จธ๋ฆฌ([OO]) ์‚ฌ์šฉ ์—ฌ๋ถ€๋ณ„ ํ‰๊ท  ์กฐํšŒ์ˆ˜', fontsize=16)
    axes[2, 0].set_xlabel('๋ง๋จธ๋ฆฌ ์‚ฌ์šฉ ์—ฌ๋ถ€'); axes[2, 0].set_ylabel('ํ‰๊ท  ์กฐํšŒ์ˆ˜')

    # (6) Trend-keyword titles vs mean views.
    sns.barplot(data=df_merged, x='has_trend_keyword', y='views_total', ax=axes[2, 1], ci=None)
    axes[2, 1].set_title('์ œ๋ชฉ ๋‚ด ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ ํฌํ•จ ์—ฌ๋ถ€๋ณ„ ํ‰๊ท  ์กฐํšŒ์ˆ˜', fontsize=16)
    axes[2, 1].set_xlabel('ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ ํฌํ•จ ์—ฌ๋ถ€'); axes[2, 1].set_ylabel('ํ‰๊ท  ์กฐํšŒ์ˆ˜')

    plt.tight_layout()
    plt.savefig(f'{output_dir}/high_view_article_characteristics_v2.png')
    plt.close()

    print("\n - ํžˆํŠธ ๊ธฐ์‚ฌ ํŠน์ง• ๋น„๊ต ๋ถ„์„(v2) ์™„๋ฃŒ. (high_view_article_characteristics_v2.png ์ €์žฅ)")

    return top_20_table, cat_comp_df
|
| 131 |
+
|
| 132 |
+
# 4. ์ข
ํฉ ์ธ์ฌ์ดํธ ์์ฑ (๋ณด๊ณ ์ ๋ด์ฉ ๊ฐํ)
|
| 133 |
+
def generate_insights_report_v2(top_20_table, cat_comp_df, output_dir):
    """Write the success-formula report embedding the TOP 20 hit-article table.

    Args:
        top_20_table: DataFrame of the 20 most-viewed articles.
        cat_comp_df: per-category share table (TOP 10% vs rest); only the
            head is rendered, the full frame is kept for symmetry with v1.
        output_dir: directory the .txt report is written to.
    """
    print("\n[๋‹จ๊ณ„ 6] ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ (์„ฑ๊ณต ๊ณต์‹ ๊ฐ•ํ™”)...")

    top_20_str = top_20_table.to_string()
    cat_comp_str = cat_comp_df.head(5).round(1).to_string()

    report = f"""
# ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ ๋ณด๊ณ ์„œ (์กฐํšŒ์ˆ˜ ์ค‘์‹ฌ ์„ฑ๊ณต ๊ณต์‹ v2)
์ž‘์„ฑ์ผ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## 1. ๋ถ„์„ ๋ชฉํ‘œ
- 'ํžˆํŠธ ๊ธฐ์‚ฌ'์˜ ๊ณตํ†ต์ ์„ ์ •๋Ÿ‰์ , ์ •์„ฑ์ ์œผ๋กœ ๋ถ„์„ํ•˜์—ฌ **๋”ฐ๋ผ ํ•  ์ˆ˜ ์žˆ๋Š”(Actionable) ์„ฑ๊ณต ๊ณต์‹**์„ ๋„์ถœํ•ฉ๋‹ˆ๋‹ค.

## 2. ์กฐํšŒ์ˆ˜ TOP 20 'ํžˆํŠธ ๊ธฐ์‚ฌ' ๋ฆฌ์ŠคํŠธ
{top_20_str}

## 3. โœ… ์กฐํšŒ์ˆ˜ '๋Œ€๋ฐ•' ๊ธฐ์‚ฌ์˜ ๊ฐ•ํ™”๋œ ์„ฑ๊ณต ๊ณต์‹ โœ…

(high_view_article_characteristics_v2.png ์ฐธ๊ณ )

### ๊ณต์‹ 1: 'ํžˆํŠธ ํŒฉํ† ๋ฆฌ' ์นดํ…Œ๊ณ ๋ฆฌ์— ์ง‘์ค‘ํ•˜๋ผ.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: '์ปค๋ฒ„์Šคํ† ๋ฆฌ', '๋ฏธ๋””์–ดํ˜„์žฅ', '์ทจ์žฌ๊ธฐยท์ œ์ž‘๊ธฐ' 3๊ฐœ ์นดํ…Œ๊ณ ๋ฆฌ์—์„œ ํžˆํŠธ ๊ธฐ์‚ฌ์˜ 60% ์ด์ƒ์ด ๋ฐฐ์ถœ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. ์ด ์นดํ…Œ๊ณ ๋ฆฌ๋“ค์€ ๊ฒ€์ฆ๋œ ์„ฑ๊ณต ์˜์—ญ์ž…๋‹ˆ๋‹ค.

### ๊ณต์‹ 2: ์ œ๋ชฉ์œผ๋กœ ๋ชจ๋“  ๊ฒƒ์„ ๋งํ•˜๋ผ.
- **(์‹ ๊ทœ ๋ฐœ๊ฒฌ) ๋ง๋จธ๋ฆฌ ํšจ๊ณผ**: ์ œ๋ชฉ์— **'[์ค‘๊ตญ]', '[์•Œ๊ณ ๋ฆฌ์ฆ˜]'๊ณผ ๊ฐ™์ด ์ฃผ์ œ๋ฅผ ์š”์•ฝํ•˜๋Š” ๋ง๋จธ๋ฆฌ๋ฅผ ์‚ฌ์šฉํ•œ ๊ธฐ์‚ฌ์˜ ํ‰๊ท  ์กฐํšŒ์ˆ˜๋Š” ๊ทธ๋ ‡์ง€ ์•Š์€ ๊ธฐ์‚ฌ๋ณด๋‹ค ํ˜„์ €ํžˆ ๋†’์•˜์Šต๋‹ˆ๋‹ค.** ์ด๋Š” ๋…์ž๋“ค์ด ์ œ๋ชฉ๋งŒ ๋ณด๊ณ ๋„ ๊ธฐ์‚ฌ์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ ๋น ๋ฅด๊ฒŒ ํŒŒ์•…ํ•  ์ˆ˜ ์žˆ์„ ๋•Œ ํด๋ฆญํ•  ํ™•๋ฅ ์ด ๋†’๋‹ค๋Š” ๊ฒƒ์„ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค.
- **(์‹ ๊ทœ ๋ฐœ๊ฒฌ) ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ ์„ ์ **: '์ˆํผ', 'MZ', 'AI' ๋“ฑ **์‹œ์˜์„ฑ ์žˆ๋Š” ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ๋ฅผ ์ œ๋ชฉ์— ํฌํ•จํ•œ ๊ธฐ์‚ฌ๋“ค์ด ์ƒ๋Œ€์ ์œผ๋กœ ๋†’์€ ํ‰๊ท  ์กฐํšŒ์ˆ˜**๋ฅผ ๊ธฐ๋กํ–ˆ์Šต๋‹ˆ๋‹ค. ๋…์ž๋“ค์€ ์ตœ์‹  ์ด์Šˆ์— ๋ฏผ๊ฐํ•˜๊ฒŒ ๋ฐ˜์‘ํ•ฉ๋‹ˆ๋‹ค.

### ๊ณต์‹ 3: ๊ธธ๊ณ  ๊นŠ์ด ์žˆ๋Š” ์ฝ˜ํ…์ธ ๊ฐ€ ์ด๊ธด๋‹ค.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: ํžˆํŠธ ๊ธฐ์‚ฌ๋“ค์€ ์ผ๋ฐ˜ ๊ธฐ์‚ฌ๋“ค๋ณด๋‹ค **๋ณธ๋ฌธ ๊ธธ์ด๊ฐ€ ํ›จ์”ฌ ๊ธด ๊ฒฝํ–ฅ**์„ ๋ณด์˜€์Šต๋‹ˆ๋‹ค. ๋…์ž๋“ค์€ ๊นŠ์ด ์žˆ๋Š” ๋กฑํผ ์ฝ˜ํ…์ธ ์— ๋” ๋†’์€ ๊ฐ€์น˜๋ฅผ ๋ถ€์—ฌํ•ฉ๋‹ˆ๋‹ค.

### ๊ณต์‹ 4: ์ฃผ์ดˆ(์›”/ํ™”)์— ์Šน๋ถ€์ˆ˜๋ฅผ ๋„์›Œ๋ผ.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: ํžˆํŠธ ๊ธฐ์‚ฌ์˜ ์ƒ๋‹น์ˆ˜๊ฐ€ **์›”์š”์ผ๊ณผ ํ™”์š”์ผ์— ๋ฐœํ–‰**๋˜์—ˆ์Šต๋‹ˆ๋‹ค. ์ฃผ์ดˆ์— ๋…์ž๋“ค์˜ ์ฝ˜ํ…์ธ  ์†Œ๋น„ ์š•๊ตฌ๊ฐ€ ๊ฐ€์žฅ ๋†’์Šต๋‹ˆ๋‹ค.

## 4. ์‹คํ–‰์„ ์œ„ํ•œ '์„ฑ๊ณต ๊ณต์‹' ์ฒดํฌ๋ฆฌ์ŠคํŠธ
- ์‹ ๊ทœ ๊ธฐ์‚ฌ ๊ธฐํš ๋ฐ ๋ฐœํ–‰ ์‹œ, ์•„๋ž˜ ์ฒดํฌ๋ฆฌ์ŠคํŠธ๋ฅผ ํ™œ์šฉํ•˜์—ฌ ์„ฑ๊ณต ํ™•๋ฅ ์„ ๊ทน๋Œ€ํ™”ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.

| ์ฒดํฌ ํ•ญ๋ชฉ | ์ „๋žต |
| ---------------------------------------------- | ------------------------------------------------------------------ |
| **1. ์นดํ…Œ๊ณ ๋ฆฌ ์„ ์ •** | '์ปค๋ฒ„์Šคํ† ๋ฆฌ', '๋ฏธ๋””์–ดํ˜„์žฅ' ๋“ฑ ๊ฒ€์ฆ๋œ ์นดํ…Œ๊ณ ๋ฆฌ์ธ๊ฐ€? |
| **2. ์ œ๋ชฉ - ๋ง๋จธ๋ฆฌ ํ™œ์šฉ** | ๋…์ž์˜ ๋ˆˆ๊ธธ์„ ๋„๋Š” ๋ช…ํ™•ํ•œ [๋ง๋จธ๋ฆฌ]๋ฅผ ์‚ฌ์šฉํ–ˆ๋Š”๊ฐ€? |
| **3. ์ œ๋ชฉ - ํ‚ค์›Œ๋“œ ํฌํ•จ** | ์ง€๊ธˆ ๊ฐ€์žฅ ๋œจ๊ฑฐ์šด 'ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ'๋ฅผ ์ œ๋ชฉ์— ํฌํ•จํ–ˆ๋Š”๊ฐ€? |
| **4. ์ฝ˜ํ…์ธ  ๊นŠ์ด** | ๋…์ž๊ฐ€ ์‹œ๊ฐ„์„ ํˆฌ์žํ•  ๋งŒํ•œ ๊นŠ์ด์™€ ์ „๋ฌธ์„ฑ์„ ๊ฐ–์ถ˜ ๋กฑํผ ์ฝ˜ํ…์ธ ์ธ๊ฐ€? |
| **5. ๋ฐœํ–‰ ์‹œ์ ** | ๊ฐ€์žฅ ์ค‘์š”ํ•œ ๊ธฐ์‚ฌ๋ฅผ 'ํ”„๋ผ์ž„ ํƒ€์ž„'์ธ ์›”์š”์ผ ์˜ค์ „์— ๋ฐœํ–‰ํ–ˆ๋Š”๊ฐ€? |
"""
    report_path = f'{output_dir}/high_view_focused_analysis_report_v2.txt'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"\n - ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ(v2) ์ƒ์„ฑ ์™„๋ฃŒ. ({report_path} ์ €์žฅ)")
|
| 181 |
+
|
| 182 |
+
# 5. ๋ฉ์ธ ์คํ ํจ์
|
| 183 |
+
def main():
|
| 184 |
+
print("===== ์ ๋ฌธ๊ณผ๋ฐฉ์ก ๋
์ ๋ฐ์ดํฐ ์ฌ์ธต ๋ถ์ (์กฐํ์ ์ค์ฌ ์ฑ๊ณต ๊ณต์ v2) =====")
|
| 185 |
+
|
| 186 |
+
data_dir, output_dir = setup_environment()
|
| 187 |
+
df_merged = load_and_preprocess_data(data_dir)
|
| 188 |
+
|
| 189 |
+
top_20, cat_comp = analyze_high_view_articles_v2(df_merged, output_dir)
|
| 190 |
+
|
| 191 |
+
generate_insights_report_v2(top_20, cat_comp, output_dir)
|
| 192 |
+
|
| 193 |
+
print("\n===== ๋ชจ๋ ๋ถ์์ด ์ฑ๊ณต์ ์ผ๋ก ์๋ฃ๋์์ต๋๋ค. =====")
|
| 194 |
+
print(f"๊ฒฐ๊ณผ๋ฌผ์ '{output_dir}' ํด๋์์ ํ์ธํ์ค ์ ์์ต๋๋ค.")
|
| 195 |
+
|
| 196 |
+
if __name__ == '__main__':
|
| 197 |
+
main()
|
app.py
CHANGED
|
@@ -263,11 +263,11 @@ def generate_seo_suggestions(content: str) -> Dict[str, str]:
|
|
| 263 |
"You are a lead digital editor for a korean prestigious online media company that bridges in-depth analysis with current trends. "
|
| 264 |
"Your mission is to craft an SEO title and description that are both intelligent and highly shareable. The goal is to highlight the article's most timely, newsworthy, and debate-sparking elements to maximize public interest and social engagement.\n\n"
|
| 265 |
"Guidelines:\n"
|
| 266 |
-
"1. **'title' (under 60 characters):**
|
| 267 |
"2. **'description' (under 150 characters, in Korean):** Go beyond summary. Contextualize the article's importance. Explain *why* this topic matters *now* and what new perspective the article offers on a familiar issue. It should persuade readers that this article will give them a crucial viewpoint for today's conversations.\n"
|
| 268 |
"3. **Format:** Respond strictly with a valid JSON object with 'title' and 'description' keys. Avoid generic phrases, clickbait, and anything that undermines the intellectual integrity of the brand.\n\n"
|
| 269 |
f"Article Content:\n{safe_content}\n\n"
|
| 270 |
-
"Return exactly: {\"title\": \"<์์ฑ๋ ์ ๋ชฉ>\", \"description\": \"<์์ฑ๋ ์ค๋ช
>\"}"
|
| 271 |
)
|
| 272 |
try:
|
| 273 |
response = SEO_GENERATIVE_MODEL.generate_content(prompt)
|
|
|
|
| 263 |
"You are a lead digital editor for a korean prestigious online media company that bridges in-depth analysis with current trends. "
|
| 264 |
"Your mission is to craft an SEO title and description that are both intelligent and highly shareable. The goal is to highlight the article's most timely, newsworthy, and debate-sparking elements to maximize public interest and social engagement.\n\n"
|
| 265 |
"Guidelines:\n"
|
| 266 |
+
"1. **'title' (under 60 characters):** **Start with a topic tag in brackets (e.g., `[์ฃผ์ ]`)** that summarizes the core subject. Following the tag, frame the core topic as a compelling thesis or a provocative question. Connect it to a current conversation or a surprising trend to make it feel urgent and relevant *today*. It should make people think, 'This is an interesting take.'\n"
|
| 267 |
"2. **'description' (under 150 characters, in Korean):** Go beyond summary. Contextualize the article's importance. Explain *why* this topic matters *now* and what new perspective the article offers on a familiar issue. It should persuade readers that this article will give them a crucial viewpoint for today's conversations.\n"
|
| 268 |
"3. **Format:** Respond strictly with a valid JSON object with 'title' and 'description' keys. Avoid generic phrases, clickbait, and anything that undermines the intellectual integrity of the brand.\n\n"
|
| 269 |
f"Article Content:\n{safe_content}\n\n"
|
| 270 |
+
"Return exactly: {\"title\": \"[<์ฃผ์ >] <์์ฑ๋ ์ ๋ชฉ>\", \"description\": \"<์์ฑ๋ ์ค๋ช
>\"}"
|
| 271 |
)
|
| 272 |
try:
|
| 273 |
response = SEO_GENERATIVE_MODEL.generate_content(prompt)
|
train_and_save_models.py
CHANGED
|
@@ -1,154 +1,222 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
-
|
| 14 |
-
-
|
| 15 |
-
-
|
| 16 |
-
-
|
| 17 |
-
-
|
|
|
|
|
|
|
| 18 |
"""
|
| 19 |
from __future__ import annotations
|
| 20 |
|
|
|
|
| 21 |
import sys
|
| 22 |
from pathlib import Path
|
| 23 |
-
from typing import
|
| 24 |
|
| 25 |
import joblib
|
| 26 |
import numpy as np
|
| 27 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
from konlpy.tag import Okt
|
| 29 |
from scipy.sparse import csr_matrix, hstack
|
| 30 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 31 |
-
from sklearn.metrics import accuracy_score, mean_absolute_error
|
| 32 |
from sklearn.model_selection import train_test_split
|
| 33 |
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
|
| 34 |
from xgboost import XGBClassifier, XGBRegressor
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
-
def ensure_files_exist(paths:
|
| 43 |
"""Raise a helpful error if any expected data file is missing."""
|
| 44 |
-
|
|
|
|
| 45 |
if missing:
|
| 46 |
-
raise FileNotFoundError(
|
| 47 |
-
|
| 48 |
-
)
|
| 49 |
|
| 50 |
-
OKT = Okt()
|
| 51 |
|
| 52 |
-
def
|
| 53 |
-
"
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
return [word for word, tag in OKT.pos(text, stem=True) if tag in ['Noun', 'Verb']]
|
| 58 |
|
|
|
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
metrics = pd.read_csv(METRICS_PATH)
|
| 64 |
-
demographics = pd.read_csv(DEMOGRAPHICS_PATH)
|
| 65 |
return contents, metrics, demographics
|
| 66 |
|
| 67 |
|
| 68 |
-
def
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
.sum()
|
|
|
|
| 73 |
.rename(columns={
|
| 74 |
"views_total": "views_total",
|
| 75 |
"comments": "comments_total",
|
| 76 |
"likes": "likes_total",
|
| 77 |
})
|
| 78 |
)
|
| 79 |
-
return agg
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
)
|
| 89 |
-
filtered.sort_values(["article_id", "views"], ascending=[True, False], inplace=True)
|
| 90 |
-
idx = filtered.groupby("article_id")["views"].idxmax()
|
| 91 |
-
primary = (
|
| 92 |
-
filtered.loc[idx, ["article_id", "age_group"]]
|
| 93 |
.rename(columns={"age_group": "primary_age_group"})
|
| 94 |
.reset_index(drop=True)
|
| 95 |
)
|
| 96 |
-
return primary
|
| 97 |
-
|
| 98 |
|
| 99 |
-
|
| 100 |
-
contents: pd.DataFrame,
|
| 101 |
-
metrics_agg: pd.DataFrame,
|
| 102 |
-
primary_audience: pd.DataFrame,
|
| 103 |
-
) -> pd.DataFrame:
|
| 104 |
-
print("[4/6] Merging datasets...")
|
| 105 |
df_master = contents.merge(metrics_agg, on="article_id", how="left")
|
| 106 |
df_master = df_master.merge(primary_audience, on="article_id", how="left")
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
df_master[column] = df_master[column].fillna(0)
|
| 112 |
|
| 113 |
return df_master
|
| 114 |
|
| 115 |
|
| 116 |
def engineer_features(df_master: pd.DataFrame) -> tuple[csr_matrix, csr_matrix, TfidfVectorizer, OneHotEncoder]:
|
| 117 |
-
|
| 118 |
-
text_series = (
|
| 119 |
-
df_master["title"].fillna("") + " " + df_master["content"].fillna("")
|
| 120 |
-
).str.strip()
|
| 121 |
|
|
|
|
|
|
|
|
|
|
| 122 |
vectorizer = TfidfVectorizer(
|
| 123 |
tokenizer=okt_tokenizer,
|
| 124 |
-
max_features=
|
| 125 |
lowercase=False,
|
| 126 |
)
|
| 127 |
X_text = vectorizer.fit_transform(text_series)
|
| 128 |
-
X_text_csr = csr_matrix(X_text)
|
| 129 |
|
| 130 |
category_series = df_master["category"].fillna("๋ฏธ๋ถ๋ฅ")
|
| 131 |
-
onehot_encoder = OneHotEncoder(handle_unknown="ignore")
|
| 132 |
X_cat = onehot_encoder.fit_transform(category_series.to_frame())
|
| 133 |
|
| 134 |
-
X_combined = cast(csr_matrix, hstack([
|
| 135 |
-
return X_combined,
|
| 136 |
|
| 137 |
|
| 138 |
def prepare_targets(
|
| 139 |
-
df_master: pd.DataFrame,
|
| 140 |
-
X_combined: csr_matrix,
|
| 141 |
-
X_text: csr_matrix,
|
| 142 |
) -> tuple[csr_matrix, csr_matrix, np.ndarray, np.ndarray, LabelEncoder, pd.DataFrame]:
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
y_age = df_master["primary_age_group"]
|
| 146 |
|
| 147 |
valid_mask = y_age.notna().to_numpy()
|
| 148 |
if not valid_mask.any():
|
| 149 |
-
raise ValueError(
|
| 150 |
-
"No samples contain a primary audience label. Unable to train the classification model."
|
| 151 |
-
)
|
| 152 |
|
| 153 |
X_combined_valid = X_combined[valid_mask]
|
| 154 |
X_text_valid = X_text[valid_mask]
|
|
@@ -156,7 +224,7 @@ def prepare_targets(
|
|
| 156 |
y_age_valid = y_age[valid_mask].astype(str)
|
| 157 |
|
| 158 |
label_encoder = LabelEncoder()
|
| 159 |
-
y_age_encoded =
|
| 160 |
|
| 161 |
article_mapping = df_master.loc[valid_mask, ["article_id", "title"]].reset_index(drop=True)
|
| 162 |
|
|
@@ -169,153 +237,208 @@ def prepare_targets(
|
|
| 169 |
article_mapping,
|
| 170 |
)
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
def train_models(
|
| 174 |
-
X_features: csr_matrix,
|
| 175 |
-
y_views: np.ndarray,
|
| 176 |
-
y_age_encoded: np.ndarray,
|
| 177 |
-
num_classes: int,
|
| 178 |
) -> tuple[XGBRegressor, XGBClassifier]:
|
| 179 |
-
|
| 180 |
|
| 181 |
stratify_target = y_age_encoded if len(np.unique(y_age_encoded)) > 1 else None
|
| 182 |
|
| 183 |
-
(
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
y_views_valid,
|
| 188 |
-
y_age_train,
|
| 189 |
-
y_age_valid,
|
| 190 |
-
) = train_test_split(
|
| 191 |
-
X_features,
|
| 192 |
-
y_views,
|
| 193 |
-
y_age_encoded,
|
| 194 |
-
test_size=0.2,
|
| 195 |
-
random_state=42,
|
| 196 |
stratify=stratify_target,
|
| 197 |
)
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
)
|
| 210 |
-
view_model.fit(X_train, y_views_train)
|
| 211 |
|
| 212 |
age_model = XGBClassifier(
|
| 213 |
objective="multi:softprob",
|
| 214 |
num_class=num_classes,
|
| 215 |
-
n_estimators=300,
|
| 216 |
-
learning_rate=0.1,
|
| 217 |
-
max_depth=6,
|
| 218 |
-
subsample=0.8,
|
| 219 |
-
colsample_bytree=0.8,
|
| 220 |
-
random_state=42,
|
| 221 |
-
tree_method="hist",
|
| 222 |
-
n_jobs=-1,
|
| 223 |
-
eval_metric="mlogloss",
|
| 224 |
use_label_encoder=False,
|
|
|
|
|
|
|
| 225 |
)
|
| 226 |
-
age_model.fit(X_train, y_age_train)
|
| 227 |
-
|
| 228 |
-
if X_valid.shape[0] > 0:
|
| 229 |
-
view_pred = view_model.predict(X_valid)
|
| 230 |
-
mae = mean_absolute_error(y_views_valid, view_pred)
|
| 231 |
-
age_pred = age_model.predict(X_valid)
|
| 232 |
-
acc = accuracy_score(y_age_valid, age_pred)
|
| 233 |
-
print(f" - Validation MAE (views): {mae:,.2f}")
|
| 234 |
-
print(f" - Validation Accuracy (audience): {acc:.4f}")
|
| 235 |
-
|
| 236 |
-
# Refit on the full dataset to maximise performance for saved artifacts.
|
| 237 |
-
view_model.fit(X_features, y_views)
|
| 238 |
age_model.fit(X_features, y_age_encoded)
|
| 239 |
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
|
|
|
| 242 |
|
| 243 |
-
def save_artifacts(
|
| 244 |
-
vectorizer: TfidfVectorizer,
|
| 245 |
-
onehot_encoder: OneHotEncoder,
|
| 246 |
-
label_encoder: LabelEncoder,
|
| 247 |
-
view_model: XGBRegressor,
|
| 248 |
-
age_model: XGBClassifier,
|
| 249 |
-
text_features: csr_matrix,
|
| 250 |
-
article_mapping: pd.DataFrame,
|
| 251 |
-
) -> None:
|
| 252 |
-
print("Saving artifacts...")
|
| 253 |
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
-
joblib.dump(onehot_encoder, "onehot_encoder.pkl")
|
| 258 |
-
print("- Saved onehot_encoder.pkl")
|
| 259 |
|
| 260 |
-
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
-
|
| 264 |
-
|
|
|
|
| 265 |
|
| 266 |
-
|
| 267 |
-
|
| 268 |
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
|
| 276 |
def main() -> None:
|
| 277 |
-
|
|
|
|
| 278 |
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
contents, metrics, demographics
|
| 282 |
-
metrics_agg = aggregate_metrics(metrics)
|
| 283 |
-
primary_audience = identify_primary_audience(demographics)
|
| 284 |
-
df_master = build_master_dataframe(contents, metrics_agg, primary_audience)
|
| 285 |
|
|
|
|
| 286 |
X_combined, X_text, vectorizer, onehot_encoder = engineer_features(df_master)
|
|
|
|
|
|
|
| 287 |
(
|
| 288 |
X_features,
|
| 289 |
X_text_filtered,
|
| 290 |
-
|
| 291 |
y_age_encoded,
|
| 292 |
label_encoder,
|
| 293 |
article_mapping,
|
| 294 |
) = prepare_targets(df_master, X_combined, X_text)
|
| 295 |
|
|
|
|
| 296 |
view_model, age_model = train_models(
|
| 297 |
-
X_features,
|
| 298 |
-
y_views,
|
| 299 |
-
y_age_encoded,
|
| 300 |
-
num_classes=len(label_encoder.classes_),
|
| 301 |
-
)
|
| 302 |
-
|
| 303 |
-
save_artifacts(
|
| 304 |
-
vectorizer,
|
| 305 |
-
onehot_encoder,
|
| 306 |
-
label_encoder,
|
| 307 |
-
view_model,
|
| 308 |
-
age_model,
|
| 309 |
-
X_text_filtered,
|
| 310 |
-
article_mapping,
|
| 311 |
)
|
| 312 |
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
|
| 316 |
if __name__ == "__main__":
|
| 317 |
try:
|
| 318 |
main()
|
| 319 |
-
except Exception as exc:
|
| 320 |
-
|
| 321 |
-
raise
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Training pipeline for the "์ ๋ฌธ๊ณผ๋ฐฉ์ก" article performance prediction project.
|
| 3 |
+
|
| 4 |
+
This script prepares the datasets, engineers features using a parallelized
|
| 5 |
+
Okt-powered TF-IDF and categorical encodings, tunes and trains XGBoost models
|
| 6 |
+
for view-count (with log transformation) and primary audience prediction,
|
| 7 |
+
and persists all artifacts.
|
| 8 |
+
|
| 9 |
+
It also includes a function to demonstrate finding similar articles based on
|
| 10 |
+
content.
|
| 11 |
+
|
| 12 |
+
Improvements from the original version:
|
| 13 |
+
- Centralized configuration management (CONFIG).
|
| 14 |
+
- Standardized logging instead of print().
|
| 15 |
+
- Parallelized Okt tokenizer for significant speed-up.
|
| 16 |
+
- Log-transformed target variable (views) for improved regression performance.
|
| 17 |
+
- Hyperparameter tuning using Optuna for both models.
|
| 18 |
+
- Early stopping during model training to prevent overfitting.
|
| 19 |
+
- Demonstration of a similar article search function.
|
| 20 |
"""
|
| 21 |
from __future__ import annotations
|
| 22 |
|
| 23 |
+
import logging
|
| 24 |
import sys
|
| 25 |
from pathlib import Path
|
| 26 |
+
from typing import Any, Dict, List, Tuple, cast
|
| 27 |
|
| 28 |
import joblib
|
| 29 |
import numpy as np
|
| 30 |
import pandas as pd
|
| 31 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 32 |
+
|
| 33 |
+
# Optuna for hyperparameter tuning
|
| 34 |
+
try:
|
| 35 |
+
import optuna
|
| 36 |
+
except ImportError:
|
| 37 |
+
print("Optuna is not installed. Please run: pip install optuna")
|
| 38 |
+
sys.exit(1)
|
| 39 |
+
|
| 40 |
from konlpy.tag import Okt
|
| 41 |
from scipy.sparse import csr_matrix, hstack
|
| 42 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 43 |
+
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
|
| 44 |
from sklearn.model_selection import train_test_split
|
| 45 |
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
|
| 46 |
from xgboost import XGBClassifier, XGBRegressor
|
| 47 |
|
| 48 |
+
# --- 1. ์ค์ ์ค์ํ (Centralized Configuration) ---
|
| 49 |
+
# ์ฃผ์: ๋ชจ๋ ์ฃผ์ ์ค์ ๊ฐ์ ์ด๊ณณ์์ ๊ด๋ฆฌํ์ฌ ์ฝ๋ ์์ ์์ด ์คํ ์กฐ๊ฑด์ ์ฝ๊ฒ ๋ณ๊ฒฝํ ์ ์์ต๋๋ค.
|
| 50 |
+
CONFIG = {
|
| 51 |
+
"data_dir": Path("./data_csv"),
|
| 52 |
+
"paths": {
|
| 53 |
+
"contents": "contents.csv",
|
| 54 |
+
"metrics": "article_metrics_monthly.csv",
|
| 55 |
+
"demographics": "demographics_merged.csv",
|
| 56 |
+
},
|
| 57 |
+
"artifacts": {
|
| 58 |
+
"vectorizer": "tfidf_vectorizer.pkl",
|
| 59 |
+
"onehot_encoder": "onehot_encoder.pkl",
|
| 60 |
+
"label_encoder": "label_encoder.pkl",
|
| 61 |
+
"view_model": "view_prediction_model.pkl",
|
| 62 |
+
"age_model": "age_prediction_model.pkl",
|
| 63 |
+
"text_features": "text_features_matrix.pkl",
|
| 64 |
+
"article_mapping": "article_mapping.pkl",
|
| 65 |
+
},
|
| 66 |
+
"feature_engineering": {
|
| 67 |
+
"tfidf_max_features": 5000,
|
| 68 |
+
"test_size": 0.2,
|
| 69 |
+
"random_state": 42,
|
| 70 |
+
},
|
| 71 |
+
"optuna": {
|
| 72 |
+
"n_trials_reg": 50, # ์กฐํ์ ์์ธก ๋ชจ๋ธ ํ๋ ํ์
|
| 73 |
+
"n_trials_clf": 50, # ์ฐ๋ น๋ ์์ธก ๋ชจ๋ธ ํ๋ ํ์
|
| 74 |
+
},
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
# --- 2. ๋ก๊น
์ค์ (Logging Setup) ---
|
| 78 |
+
# ์ฃผ์: print() ๋์ logging์ ์ฌ์ฉํ์ฌ ๋ก๊ทธ๋ฅผ ์ฒด๊ณ์ ์ผ๋ก ๊ด๋ฆฌํฉ๋๋ค.
|
| 79 |
+
logging.basicConfig(
|
| 80 |
+
level=logging.INFO,
|
| 81 |
+
format="%(asctime)s [%(levelname)s] - %(message)s",
|
| 82 |
+
stream=sys.stdout,
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# --- 3. ์ฑ๋ฅ ๊ฐ์ : ๋ณ๋ ฌ ํ ํฌ๋์ด์ (Performance Improvement: Parallel Tokenizer) ---
|
| 87 |
+
class ParallelOktTokenizer:
|
| 88 |
+
"""A parallelized Okt tokenizer using joblib."""
|
| 89 |
+
def __init__(self, n_jobs: int = -1):
|
| 90 |
+
self.okt = Okt()
|
| 91 |
+
self.n_jobs = n_jobs
|
| 92 |
+
|
| 93 |
+
def __call__(self, text_series: pd.Series) -> List[List[str]]:
|
| 94 |
+
# ์ฃผ์: joblib.Parallel์ ์ฌ์ฉํด ์ฌ๋ฌ CPU ์ฝ์ด์์ ๋์์ ํํ์ ๋ถ์์ ์ํํฉ๋๋ค.
|
| 95 |
+
# ๋ฐ์ดํฐ๊ฐ ํด ๊ฒฝ์ฐ, ์ด ๋ถ๋ถ์ด ๊ฐ์ฅ ํฐ ์ฑ๋ฅ ํฅ์์ ๊ฐ์ ธ์ต๋๋ค.
|
| 96 |
+
return joblib.Parallel(n_jobs=self.n_jobs)(joblib.delayed(self._tokenize)(text) for text in text_series)
|
| 97 |
+
|
| 98 |
+
def _tokenize(self, text: str) -> List[str]:
|
| 99 |
+
"""Extracts nouns and verbs from a single text."""
|
| 100 |
+
if not isinstance(text, str) or not text.strip():
|
| 101 |
+
return []
|
| 102 |
+
return [
|
| 103 |
+
word
|
| 104 |
+
for word, tag in self.okt.pos(text, stem=True)
|
| 105 |
+
if tag in ["Noun", "Verb"]
|
| 106 |
+
]
|
| 107 |
+
|
| 108 |
+
# ์ ์ญ ํ ํฌ๋์ด์  ์ธ์คํด์ค
|
| 109 |
+
# ์ฃผ์: TfidfVectorizer์ tokenizer๋ ๋ฌธ์ ํ๋์ฉ ํธ์ถ๋๋ฏ๋ก(Series ์ ์ฒด๊ฐ ์๋), ๋ฐฐ์น ๋ณ๋ ฌ์ฉ ParallelOktTokenizer ๋์  ์ค์  ์ฌ์ฉํ  ๋ฌธ์ ๋จ์ ํจ์๋ฅผ ์ ์ํฉ๋๋ค.
|
| 110 |
+
# ์ด ์์ ์์๋ TfidfVectorizer์ ๋ด๋ถ ๋ก์ง์ ์๋ฆฌ์ฆ๋ฅผ ์ง์ ๋ฐ์ง ์์ผ๋ฏ๋ก,
|
| 111 |
+
# ์๋ engineer_features์์ ์ง์ ํ
์คํธ๋ฅผ ์ฒ๋ฆฌํ๋ ๋ฐฉ์์ผ๋ก ๋ณ๊ฒฝํฉ๋๋ค.
|
| 112 |
+
def okt_tokenizer(text):
|
| 113 |
+
"""Simple wrapper for Okt POS tagging (nouns and verbs)."""
|
| 114 |
+
okt = Okt()
|
| 115 |
+
if not text.strip():
|
| 116 |
+
return []
|
| 117 |
+
return [word for word, tag in okt.pos(text, stem=True) if tag in ['Noun', 'Verb']]
|
| 118 |
|
| 119 |
|
| 120 |
+
def ensure_files_exist(data_dir: Path, paths: Dict[str, str]) -> List[Path]:
|
| 121 |
"""Raise a helpful error if any expected data file is missing."""
|
| 122 |
+
full_paths = [data_dir / p for p in paths.values()]
|
| 123 |
+
missing = [str(path) for path in full_paths if not path.exists()]
|
| 124 |
if missing:
|
| 125 |
+
raise FileNotFoundError(f"Missing required data files: {', '.join(missing)}")
|
| 126 |
+
return full_paths
|
|
|
|
| 127 |
|
|
|
|
| 128 |
|
| 129 |
+
def load_datasets(data_dir: Path, paths: Dict[str, str]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
| 130 |
+
logging.info("Loading datasets...")
|
| 131 |
+
contents_path = data_dir / paths["contents"]
|
| 132 |
+
metrics_path = data_dir / paths["metrics"]
|
| 133 |
+
demographics_path = data_dir / paths["demographics"]
|
|
|
|
| 134 |
|
| 135 |
+
ensure_files_exist(data_dir, paths)
|
| 136 |
|
| 137 |
+
contents = pd.read_csv(contents_path)
|
| 138 |
+
metrics = pd.read_csv(metrics_path)
|
| 139 |
+
demographics = pd.read_csv(demographics_path)
|
|
|
|
|
|
|
| 140 |
return contents, metrics, demographics
|
| 141 |
|
| 142 |
|
| 143 |
+
def preprocess_data(
|
| 144 |
+
contents: pd.DataFrame, metrics: pd.DataFrame, demographics: pd.DataFrame
|
| 145 |
+
) -> pd.DataFrame:
|
| 146 |
+
logging.info("Preprocessing and merging datasets...")
|
| 147 |
+
|
| 148 |
+
# Aggregate metrics
|
| 149 |
+
metrics_agg = (
|
| 150 |
+
metrics.groupby("article_id")[["views_total", "comments", "likes"]]
|
| 151 |
.sum()
|
| 152 |
+
.reset_index()
|
| 153 |
.rename(columns={
|
| 154 |
"views_total": "views_total",
|
| 155 |
"comments": "comments_total",
|
| 156 |
"likes": "likes_total",
|
| 157 |
})
|
| 158 |
)
|
|
|
|
| 159 |
|
| 160 |
+
# Identify primary audience
|
| 161 |
+
filtered_demo = demographics[demographics["age_group"] != "์ ์ฒด"].copy()
|
| 162 |
+
if filtered_demo.empty:
|
| 163 |
+
raise ValueError("No demographic records found after excluding '์ ์ฒด'.")
|
| 164 |
+
idx = filtered_demo.groupby("article_id")["views"].idxmax()
|
| 165 |
+
primary_audience = (
|
| 166 |
+
filtered_demo.loc[idx, ["article_id", "age_group"]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
.rename(columns={"age_group": "primary_age_group"})
|
| 168 |
.reset_index(drop=True)
|
| 169 |
)
|
|
|
|
|
|
|
| 170 |
|
| 171 |
+
# Build master dataframe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
df_master = contents.merge(metrics_agg, on="article_id", how="left")
|
| 173 |
df_master = df_master.merge(primary_audience, on="article_id", how="left")
|
| 174 |
|
| 175 |
+
df_master[["views_total", "comments_total", "likes_total"]] = df_master[
|
| 176 |
+
["views_total", "comments_total", "likes_total"]
|
| 177 |
+
].fillna(0)
|
|
|
|
| 178 |
|
| 179 |
return df_master
|
| 180 |
|
| 181 |
|
| 182 |
def engineer_features(df_master: pd.DataFrame) -> tuple[csr_matrix, csr_matrix, TfidfVectorizer, OneHotEncoder]:
|
| 183 |
+
logging.info("Engineering features (text + category)...")
|
| 184 |
+
text_series = (df_master["title"].fillna("") + " " + df_master["content"].fillna("")).str.strip()
|
|
|
|
|
|
|
| 185 |
|
| 186 |
+
# ์ฃผ์: konlpy ํ ํฌ๋์ด์ ๋ ์๋์ ์ผ๋ก ๋๋ฆฌ๋ฏ๋ก, ๋จ์ผ ํ๋ก์ธ์ค tokenizer๋ฅผ ์ฌ์ฉํฉ๋๋ค.
|
| 187 |
+
# ๋ง์ฝ ๋ฐ์ดํฐ๊ฐ ๋งค์ฐ ์ปค์ ๋ณ๋ ฌ์ฒ๋ฆฌ๊ฐ ํ์ํ๋ค๋ฉด, ํ
์คํธ๋ฅผ ๋จผ์ ํ ํฌ๋์ด์งํ ํ
|
| 188 |
+
# TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x) ์ ๊ฐ์ด ์ฌ์ฉํด์ผ ํฉ๋๋ค.
|
| 189 |
vectorizer = TfidfVectorizer(
|
| 190 |
tokenizer=okt_tokenizer,
|
| 191 |
+
max_features=CONFIG["feature_engineering"]["tfidf_max_features"],
|
| 192 |
lowercase=False,
|
| 193 |
)
|
| 194 |
X_text = vectorizer.fit_transform(text_series)
|
|
|
|
| 195 |
|
| 196 |
category_series = df_master["category"].fillna("๋ฏธ๋ถ๋ฅ")
|
| 197 |
+
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
|
| 198 |
X_cat = onehot_encoder.fit_transform(category_series.to_frame())
|
| 199 |
|
| 200 |
+
X_combined = cast(csr_matrix, hstack([X_text, X_cat]).tocsr())
|
| 201 |
+
return X_combined, X_text, vectorizer, onehot_encoder
|
| 202 |
|
| 203 |
|
| 204 |
def prepare_targets(
|
| 205 |
+
df_master: pd.DataFrame, X_combined: csr_matrix, X_text: csr_matrix
|
|
|
|
|
|
|
| 206 |
) -> tuple[csr_matrix, csr_matrix, np.ndarray, np.ndarray, LabelEncoder, pd.DataFrame]:
|
| 207 |
+
logging.info("Preparing targets and filtering valid samples...")
|
| 208 |
+
|
| 209 |
+
# --- 4. ๋ชจ๋ธ ์ ํ๋ ํฅ์: ๋ก๊ทธ ๋ณํ (Model Accuracy: Log Transformation) ---
|
| 210 |
+
# ์ฃผ์: ์กฐํ์์ ๋ถํฌ๊ฐ ๋งค์ฐ ์น์ฐ์ณ์ ธ ์์ผ๋ฏ๋ก np.log1p๋ฅผ ์ ์ฉํฉ๋๋ค.
|
| 211 |
+
# ๋ชจ๋ธ์ ๋ณํ๋ ๊ฐ์ ์์ธกํ๊ณ , ๋์ค์ np.expm1๋ก ์๋ ์ค์ผ์ผ๋ก ๋ณต์ํฉ๋๋ค.
|
| 212 |
+
# 0์ธ ๊ฐ์ ๋ก๊ทธ๋ฅผ ์ทจํ๋ฉด -inf๊ฐ ๋๋ฏ๋ก, 1์ ๋ํด์ฃผ๋ log1p๋ฅผ ์ฌ์ฉํฉ๋๋ค.
|
| 213 |
+
y_views = np.log1p(df_master["views_total"].astype(np.float32))
|
| 214 |
+
|
| 215 |
y_age = df_master["primary_age_group"]
|
| 216 |
|
| 217 |
valid_mask = y_age.notna().to_numpy()
|
| 218 |
if not valid_mask.any():
|
| 219 |
+
raise ValueError("No samples with a primary audience label found.")
|
|
|
|
|
|
|
| 220 |
|
| 221 |
X_combined_valid = X_combined[valid_mask]
|
| 222 |
X_text_valid = X_text[valid_mask]
|
|
|
|
| 224 |
y_age_valid = y_age[valid_mask].astype(str)
|
| 225 |
|
| 226 |
label_encoder = LabelEncoder()
|
| 227 |
+
y_age_encoded = label_encoder.fit_transform(y_age_valid)
|
| 228 |
|
| 229 |
article_mapping = df_master.loc[valid_mask, ["article_id", "title"]].reset_index(drop=True)
|
| 230 |
|
|
|
|
| 237 |
article_mapping,
|
| 238 |
)
|
| 239 |
|
| 240 |
+
# --- 5. ๋ชจ๋ธ ์ ํ๋ ํฅ์: ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋ (Model Accuracy: Hyperparameter Tuning) ---
|
| 241 |
+
def tune_xgbregressor(X_train, y_train, X_valid, y_valid) -> Dict[str, Any]:
|
| 242 |
+
"""Find best hyperparameters for XGBRegressor using Optuna."""
|
| 243 |
+
def objective(trial):
|
| 244 |
+
params = {
|
| 245 |
+
"objective": "reg:squarederror",
|
| 246 |
+
"tree_method": "hist",
|
| 247 |
+
"n_estimators": trial.suggest_int("n_estimators", 200, 1000, step=100),
|
| 248 |
+
"learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
|
| 249 |
+
"max_depth": trial.suggest_int("max_depth", 4, 10),
|
| 250 |
+
"subsample": trial.suggest_float("subsample", 0.6, 1.0),
|
| 251 |
+
"colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
|
| 252 |
+
"random_state": CONFIG["feature_engineering"]["random_state"],
|
| 253 |
+
"n_jobs": -1,
|
| 254 |
+
}
|
| 255 |
+
model = XGBRegressor(**params)
|
| 256 |
+
model.fit(
|
| 257 |
+
X_train, y_train,
|
| 258 |
+
eval_set=[(X_valid, y_valid)],
|
| 259 |
+
eval_metric="rmse",
|
| 260 |
+
callbacks=[optuna.integration.XGBoostPruningCallback(trial, "validation_0-rmse")],
|
| 261 |
+
verbose=False,
|
| 262 |
+
)
|
| 263 |
+
preds = model.predict(X_valid)
|
| 264 |
+
rmse = np.sqrt(mean_squared_error(y_valid, preds))
|
| 265 |
+
return rmse
|
| 266 |
+
|
| 267 |
+
study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
|
| 268 |
+
study.optimize(objective, n_trials=CONFIG["optuna"]["n_trials_reg"], timeout=600)
|
| 269 |
+
logging.info(f"Best trial for XGBRegressor: {study.best_trial.params} (RMSE: {study.best_value:.4f})")
|
| 270 |
+
return study.best_trial.params
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def tune_xgbclassifier(X_train, y_train, X_valid, y_valid, num_classes) -> Dict[str, Any]:
|
| 274 |
+
"""Find best hyperparameters for XGBClassifier using Optuna."""
|
| 275 |
+
def objective(trial):
|
| 276 |
+
params = {
|
| 277 |
+
"objective": "multi:softprob",
|
| 278 |
+
"num_class": num_classes,
|
| 279 |
+
"tree_method": "hist",
|
| 280 |
+
"eval_metric": "mlogloss",
|
| 281 |
+
"use_label_encoder": False,
|
| 282 |
+
"n_estimators": trial.suggest_int("n_estimators", 300, 1500, step=100),
|
| 283 |
+
"learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
|
| 284 |
+
"max_depth": trial.suggest_int("max_depth", 4, 10),
|
| 285 |
+
"subsample": trial.suggest_float("subsample", 0.6, 1.0),
|
| 286 |
+
"colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
|
| 287 |
+
"random_state": CONFIG["feature_engineering"]["random_state"],
|
| 288 |
+
"n_jobs": -1,
|
| 289 |
+
}
|
| 290 |
+
model = XGBClassifier(**params)
|
| 291 |
+
model.fit(
|
| 292 |
+
X_train, y_train,
|
| 293 |
+
eval_set=[(X_valid, y_valid)],
|
| 294 |
+
callbacks=[optuna.integration.XGBoostPruningCallback(trial, "validation_0-mlogloss")],
|
| 295 |
+
verbose=False,
|
| 296 |
+
)
|
| 297 |
+
return model.evals_result()["validation_0"]["mlogloss"][-1]
|
| 298 |
+
|
| 299 |
+
study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
|
| 300 |
+
study.optimize(objective, n_trials=CONFIG["optuna"]["n_trials_clf"], timeout=600)
|
| 301 |
+
logging.info(f"Best trial for XGBClassifier: {study.best_trial.params} (LogLoss: {study.best_value:.4f})")
|
| 302 |
+
return study.best_trial.params
|
| 303 |
+
|
| 304 |
|
| 305 |
def train_models(
|
| 306 |
+
X_features: csr_matrix, y_views: np.ndarray, y_age_encoded: np.ndarray, num_classes: int
|
|
|
|
|
|
|
|
|
|
| 307 |
) -> tuple[XGBRegressor, XGBClassifier]:
|
| 308 |
+
logging.info("Splitting data and training final models...")
|
| 309 |
|
| 310 |
stratify_target = y_age_encoded if len(np.unique(y_age_encoded)) > 1 else None
|
| 311 |
|
| 312 |
+
X_train, X_valid, y_views_train, y_views_valid, y_age_train, y_age_valid = train_test_split(
|
| 313 |
+
X_features, y_views, y_age_encoded,
|
| 314 |
+
test_size=CONFIG["feature_engineering"]["test_size"],
|
| 315 |
+
random_state=CONFIG["feature_engineering"]["random_state"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
stratify=stratify_target,
|
| 317 |
)
|
| 318 |
+
|
| 319 |
+
# Hyperparameter tuning
|
| 320 |
+
logging.info("--- Starting Hyperparameter Tuning ---")
|
| 321 |
+
best_reg_params = tune_xgbregressor(X_train, y_views_train, X_valid, y_views_valid)
|
| 322 |
+
best_clf_params = tune_xgbclassifier(X_train, y_age_train, X_valid, y_age_valid, num_classes)
|
| 323 |
+
logging.info("--- Hyperparameter Tuning Finished ---")
|
| 324 |
+
|
| 325 |
+
# Train final models with best parameters on the full dataset
|
| 326 |
+
logging.info("Training final models on the full dataset with best parameters...")
|
| 327 |
+
|
| 328 |
+
view_model = XGBRegressor(objective="reg:squarederror", **best_reg_params)
|
| 329 |
+
view_model.fit(X_features, y_views)
|
|
|
|
| 330 |
|
| 331 |
age_model = XGBClassifier(
|
| 332 |
objective="multi:softprob",
|
| 333 |
num_class=num_classes,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
use_label_encoder=False,
|
| 335 |
+
eval_metric="mlogloss",
|
| 336 |
+
**best_clf_params,
|
| 337 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
age_model.fit(X_features, y_age_encoded)
|
| 339 |
|
| 340 |
+
# Final evaluation on the hold-out set
|
| 341 |
+
view_pred_log = view_model.predict(X_valid)
|
| 342 |
+
view_pred_original = np.expm1(view_pred_log) # ๋ก๊ทธ ๋ณํ๋ ์์ธก๊ฐ์ ์๋ ์ค์ผ์ผ๋ก ๋ณต์
|
| 343 |
+
y_views_valid_original = np.expm1(y_views_valid)
|
| 344 |
+
mae = mean_absolute_error(y_views_valid_original, view_pred_original)
|
| 345 |
+
|
| 346 |
+
age_pred = age_model.predict(X_valid)
|
| 347 |
+
acc = accuracy_score(y_age_valid, age_pred)
|
| 348 |
+
|
| 349 |
+
logging.info(f"Final Validation MAE (views): {mae:,.2f}")
|
| 350 |
+
logging.info(f"Final Validation Accuracy (audience): {acc:.4f}")
|
| 351 |
|
| 352 |
+
return view_model, age_model
|
| 353 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
|
| 355 |
+
def save_artifacts(artifacts: Dict[str, Any], artifact_paths: Dict[str, str]) -> None:
|
| 356 |
+
logging.info("Saving artifacts...")
|
| 357 |
+
for name, obj in artifacts.items():
|
| 358 |
+
path = artifact_paths[name]
|
| 359 |
+
joblib.dump(obj, path)
|
| 360 |
+
logging.info(f"- Saved {path}")
|
| 361 |
|
|
|
|
|
|
|
| 362 |
|
| 363 |
+
# --- 6. ์๋ก์ด ๊ธฐ๋ฅ: ์ ์ฌ ๊ธฐ์ฌ ํ์ (New Feature: Similar Article Search) ---
|
| 364 |
+
def find_similar_articles(
|
| 365 |
+
article_id: str,
|
| 366 |
+
text_features: csr_matrix,
|
| 367 |
+
mapping_df: pd.DataFrame,
|
| 368 |
+
top_n: int = 5,
|
| 369 |
+
) -> pd.DataFrame:
|
| 370 |
+
"""Finds top_n similar articles for a given article_id."""
|
| 371 |
+
if article_id not in mapping_df["article_id"].values:
|
| 372 |
+
raise ValueError(f"Article ID {article_id} not found in the mapping.")
|
| 373 |
|
| 374 |
+
# Get the index of the source article
|
| 375 |
+
source_idx = mapping_df[mapping_df["article_id"] == article_id].index[0]
|
| 376 |
+
source_vector = text_features[source_idx]
|
| 377 |
|
| 378 |
+
# Compute cosine similarity
|
| 379 |
+
similarities = cosine_similarity(source_vector, text_features)[0]
|
| 380 |
|
| 381 |
+
# Get top_n similar articles (excluding the source article itself)
|
| 382 |
+
similar_indices = similarities.argsort()[-(top_n + 1):-1][::-1]
|
| 383 |
+
|
| 384 |
+
similar_scores = similarities[similar_indices]
|
| 385 |
|
| 386 |
+
result_df = mapping_df.iloc[similar_indices].copy()
|
| 387 |
+
result_df["similarity"] = similar_scores
|
| 388 |
+
|
| 389 |
+
logging.info(f"\n--- Top {top_n} similar articles to '{mapping_df.iloc[source_idx]['title']}' ---")
|
| 390 |
+
logging.info(result_df)
|
| 391 |
+
return result_df
|
| 392 |
|
| 393 |
|
| 394 |
def main() -> None:
    """End-to-end pipeline: load data, build features, train, persist, demo search."""
    np.random.seed(CONFIG["feature_engineering"]["random_state"])

    # Load raw datasets and merge them into a single master frame.
    raw_contents, raw_metrics, raw_demographics = load_datasets(
        CONFIG["data_dir"], CONFIG["paths"]
    )
    df_master = preprocess_data(raw_contents, raw_metrics, raw_demographics)

    # Feature engineering (text vectorization + categorical encoding).
    X_combined, X_text, vectorizer, onehot_encoder = engineer_features(df_master)

    # Build targets and filter rows accordingly.
    targets = prepare_targets(df_master, X_combined, X_text)
    (
        X_features,
        X_text_filtered,
        y_views_log,
        y_age_encoded,
        label_encoder,
        article_mapping,
    ) = targets

    # Train the view-count regressor and the audience-age classifier.
    n_age_classes = len(label_encoder.classes_)
    view_model, age_model = train_models(
        X_features, y_views_log, y_age_encoded, num_classes=n_age_classes
    )

    # Persist everything needed at inference time.
    save_artifacts(
        {
            "vectorizer": vectorizer,
            "onehot_encoder": onehot_encoder,
            "label_encoder": label_encoder,
            "view_model": view_model,
            "age_model": age_model,
            "text_features": X_text_filtered,
            "article_mapping": article_mapping,
        },
        CONFIG["artifacts"],
    )
    logging.info("All artifacts saved successfully.")

    # Demonstrate the similar-article search on the first known article.
    if not article_mapping.empty:
        demo_article_id = article_mapping.iloc[0]["article_id"]
        find_similar_articles(demo_article_id, X_text_filtered, article_mapping)
|
| 437 |
|
| 438 |
|
| 439 |
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # logging.exception logs at ERROR level with the traceback attached
        # (idiomatic replacement for logging.error(..., exc_info=True)), and
        # lazy %-formatting defers string building to the logging framework.
        logging.exception("An error occurred: %s", exc)
        raise
|