Choi jun hyeok committed on
Commit be91dcc · 1 Parent(s): ec5ae24

update prompt

Files changed (6)
  1. analysis.py +346 -0
  2. analysis2.py +233 -0
  3. analysis3.py +260 -0
  4. analysis4.py +197 -0
  5. app.py +2 -2
  6. train_and_save_models.py +313 -190
analysis.py ADDED
@@ -0,0 +1,346 @@
+ # -*- coding: utf-8 -*-
+ """
+ Advanced exploratory data analysis (EDA) of reader data for the magazine
+ 신문과방송 (Newspaper & Broadcasting).
+
+ This script analyzes the reader data using four datasets:
+ 1. article_metrics_monthly.csv: monthly per-article metrics (views, likes, comments)
+ 2. contents.csv: article content information (category, title, tags, etc.)
+ 3. demographics_merged.csv: per-article reader demographics
+ 4. referrer.csv: per-article inbound traffic sources
+
+ Main analyses:
+ - Data preprocessing and feature engineering
+ - Distributions of and correlations between core article metrics (views, likes, comments)
+ - In-depth performance and reader engagement by content category
+ - Tag analysis (including a word cloud)
+ - Preferred categories by demographic (age/gender) group (heatmaps)
+ - Performance and efficiency by traffic source
+ - Consolidated insights and automatic report generation
+
+ How to run:
+ - Before running, point DATA_DIR at the folder that actually holds the data.
+ - On execution the output folder (OUTPUT_DIR, './output_analysis' by default) is
+   created next to the script; all visualizations and the final insight report are saved there.
+ """
+
+ # 1. Library imports
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from datetime import datetime
+ import warnings
+ import os
+ from wordcloud import WordCloud
+
+ warnings.filterwarnings('ignore')
+
+ # 2. Basic configuration and globals
+ def setup_environment():
+     """Set up the analysis environment (paths, visualization style)."""
+     # === Path settings (adjust to your environment) ===
+     DATA_DIR = r'Broadcast_paper\data_csv'
+     OUTPUT_DIR = r'./output_analysis'
+
+     # Create the output folder
+     if not os.path.exists(OUTPUT_DIR):
+         os.makedirs(OUTPUT_DIR)
+         print(f"Created folder '{OUTPUT_DIR}'.")
+
+     # === Visualization settings (Malgun Gothic so Korean labels render) ===
+     plt.rc('font', family='Malgun Gothic')
+     plt.rcParams['axes.unicode_minus'] = False
+     sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
+
+     print("Analysis environment ready!")
+     return DATA_DIR, OUTPUT_DIR
+
+ # 3. Data loading and preprocessing
+ def load_and_preprocess_data(data_dir):
+     """Load the data and perform basic preprocessing."""
+     print("\n[Step 1] Loading and preprocessing data...")
+
+     # Load data
+     df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
+     df_contents = pd.read_csv(f'{data_dir}/contents.csv')
+     df_demo = pd.read_csv(f'{data_dir}/demographics_merged.csv')
+     df_referrer = pd.read_csv(f'{data_dir}/referrer.csv')
+
+     # --- Preprocessing ---
+     # 1. df_metrics
+     df_metrics['period'] = pd.to_datetime(df_metrics['period'])
+     df_metrics['comments'].fillna(0, inplace=True)  # treat missing comment counts as 0
+
+     # 2. df_contents
+     df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)  # drop rows missing key fields
+     df_contents['date'] = pd.to_datetime(df_contents['date'])
+     df_contents['publish_month'] = df_contents['date'].dt.to_period('M')
+     df_contents['publish_dayofweek'] = df_contents['date'].dt.day_name()
+     df_contents['content_length'] = df_contents['content'].str.len()
+
+     # 3. df_demo: drop the aggregate rows ('전체' = 'all ages' in the source data)
+     df_demo_filtered = df_demo[df_demo['age_group'] != '전체'].copy()
+
+     # 4. Data integration
+     # Aggregate the monthly metrics into per-article totals
+     article_total_metrics = df_metrics.groupby('article_id').agg({
+         'views_total': 'sum',
+         'likes': 'sum',
+         'comments': 'sum'
+     }).reset_index()
+
+     # Merge content info with the per-article totals
+     df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
+     df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)
+
+     # Engagement rate: (likes + comments) / views
+     # Replace zero views with NaN to avoid division by zero
+     df_merged['engagement_rate'] = (
+         (df_merged['likes'] + df_merged['comments']) / df_merged['views_total'].replace(0, np.nan)
+     ) * 100
+
+     print("Data loaded and preprocessed!")
+
+     return {
+         "metrics": df_metrics,
+         "contents": df_contents,
+         "demo": df_demo_filtered,
+         "referrer": df_referrer,
+         "merged": df_merged
+     }
+
+ # 4. Detailed analysis and visualization functions
+ def analyze_metrics_overview(df_merged, output_dir):
+     """Analyze and visualize the overall distribution of and correlations between article metrics."""
+     print("\n[Step 2] Analyzing overall article metrics...")
+
+     fig, axes = plt.subplots(1, 2, figsize=(18, 7))
+
+     # Distribution of views
+     sns.histplot(data=df_merged, x='views_total', bins=50, ax=axes[0], kde=True)
+     axes[0].set_title('Distribution of total views per article', fontsize=16)
+     axes[0].set_xlabel('Total views')
+     axes[0].set_ylabel('Number of articles')
+     axes[0].set_xlim(0, df_merged['views_total'].quantile(0.95))  # clip the top 5% so the bulk of the distribution is visible
+
+     # Correlation heatmap
+     corr = df_merged[['views_total', 'likes', 'comments', 'content_length']].corr()
+     sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=axes[1])
+     axes[1].set_title('Correlations between key metrics', fontsize=16)
+
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/metrics_overview.png')
+     plt.close()
+     print(" - Metric distribution and correlation analysis done. (saved metrics_overview.png)")
+
+ def analyze_content_features(df_merged, output_dir):
+     """Performance by content feature (category, tags, length, publication weekday)."""
+     print("\n[Step 3] Analyzing performance by content feature...")
+
+     # Average metrics per category
+     category_performance = df_merged.groupby('category').agg({
+         'views_total': 'mean',
+         'likes': 'mean',
+         'comments': 'mean',
+         'engagement_rate': 'mean'
+     }).sort_values('views_total', ascending=False)
+
+     fig, ax = plt.subplots(figsize=(14, 10))
+     category_performance['views_total'].sort_values().plot(kind='barh', ax=ax, color='skyblue')
+     ax.set_title('Average views by category', fontsize=16)
+     ax.set_xlabel('Average views')
+     ax.set_ylabel('Category')
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/category_avg_views.png')
+     plt.close()
+     print(" - Average views by category done. (saved category_avg_views.png)")
+
+     # Tag analysis and word cloud
+     tags = df_merged['tag'].dropna().str.split(',').explode().str.strip()
+     top_tags = tags.value_counts().head(50)
+
+     wordcloud = WordCloud(
+         font_path='malgun.ttf',  # WordCloud needs an actual font file; bare 'malgun' is not resolved
+         width=1000,
+         height=600,
+         background_color='white',
+         colormap='viridis'
+     ).generate_from_frequencies(top_tags)
+
+     plt.figure(figsize=(15, 9))
+     plt.imshow(wordcloud, interpolation='bilinear')
+     plt.axis('off')
+     plt.title('Word cloud of the top 50 tags', fontsize=20)
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/tags_wordcloud.png')
+     plt.close()
+     print(" - Tag word cloud done. (saved tags_wordcloud.png)")
+
+     # Article counts and average views by publication weekday
+     fig, axes = plt.subplots(1, 2, figsize=(18, 7))
+     day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
+
+     sns.countplot(data=df_merged, y='publish_dayofweek', order=day_order, ax=axes[0], palette='pastel')
+     axes[0].set_title('Articles published per weekday', fontsize=16)
+     axes[0].set_xlabel('Number of articles')
+     axes[0].set_ylabel('Weekday')
+
+     sns.barplot(data=df_merged, y='publish_dayofweek', x='views_total', order=day_order, ax=axes[1], palette='pastel', ci=None)
+     axes[1].set_title('Average views per weekday', fontsize=16)
+     axes[1].set_xlabel('Average views')
+     axes[1].set_ylabel('')
+
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/dayofweek_performance.png')
+     plt.close()
+     print(" - Weekday performance done. (saved dayofweek_performance.png)")
+
+ def analyze_demographics(df_demo, df_merged, output_dir):
+     """Content consumption patterns by demographic (age/gender) group."""
+     print("\n[Step 4] Analyzing preferences by demographic group...")
+
+     # Join demographics with content data on article ID
+     df_demo_content = pd.merge(df_demo, df_merged[['article_id', 'category']], on='article_id', how='left')
+
+     # Views per category by age group and gender
+     demo_category_views = df_demo_content.groupby(['age_group', 'gender', 'category'])['views'].sum().reset_index()
+
+     # Pivot tables for the heatmaps ('여' = female, '남' = male in the source data)
+     female_pivot = demo_category_views[demo_category_views['gender'] == '여'].pivot_table(
+         index='category', columns='age_group', values='views', aggfunc='sum'
+     ).fillna(0)
+
+     male_pivot = demo_category_views[demo_category_views['gender'] == '남'].pivot_table(
+         index='category', columns='age_group', values='views', aggfunc='sum'
+     ).fillna(0)
+
+     # Visualization
+     fig, axes = plt.subplots(2, 1, figsize=(20, 24))
+
+     sns.heatmap(female_pivot, cmap='Reds', annot=True, fmt='.0f', linewidths=.5, ax=axes[0])
+     axes[0].set_title('Preferred categories by age group, female readers (total views)', fontsize=18)
+     axes[0].set_xlabel('Age group')
+     axes[0].set_ylabel('Category')
+
+     sns.heatmap(male_pivot, cmap='Blues', annot=True, fmt='.0f', linewidths=.5, ax=axes[1])
+     axes[1].set_title('Preferred categories by age group, male readers (total views)', fontsize=18)
+     axes[1].set_xlabel('Age group')
+     axes[1].set_ylabel('Category')
+
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/demographic_category_preference_heatmap.png')
+     plt.close()
+     print(" - Demographic category-preference heatmaps done. (saved demographic_category_preference_heatmap.png)")
+
+ def analyze_referrer(df_referrer, df_merged, output_dir):
+     """Contribution and efficiency by traffic source."""
+     print("\n[Step 5] Analyzing efficiency by traffic source...")
+
+     # Merge referrer data with article metrics
+     df_referrer_merged = pd.merge(df_referrer, df_merged[['article_id', 'views_total', 'engagement_rate']], on='article_id', how='left')
+
+     # Extract the top 10 traffic sources
+     top_10_referrers = df_referrer_merged.groupby('referrer')['share'].sum().nlargest(10).index
+     df_top_referrers = df_referrer_merged[df_referrer_merged['referrer'].isin(top_10_referrers)]
+
+     # Average engagement per source
+     referrer_engagement = df_top_referrers.groupby('referrer')['engagement_rate'].mean().sort_values(ascending=False)
+
+     fig, axes = plt.subplots(1, 2, figsize=(20, 8))
+
+     # Total contribution per source
+     df_top_referrers.groupby('referrer')['share'].sum().sort_values().plot(kind='barh', ax=axes[0], color='c')
+     axes[0].set_title('Total contribution (share) of the top 10 traffic sources', fontsize=16)
+     axes[0].set_xlabel('Total share')
+     axes[0].set_ylabel('Traffic source')
+
+     # Average engagement per source
+     referrer_engagement.sort_values().plot(kind='barh', ax=axes[1], color='m')
+     axes[1].set_title('Average engagement (%) of the top 10 traffic sources', fontsize=16)
+     axes[1].set_xlabel('Average engagement (%)')
+     axes[1].set_ylabel('')
+
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/referrer_performance.png')
+     plt.close()
+     print(" - Traffic-source contribution and engagement done. (saved referrer_performance.png)")
+
+ # 5. Consolidated insight generation
+ def generate_insights_report(data, output_dir):
+     """Generate a consolidated insight report from the analysis results."""
+     print("\n[Step 6] Generating the consolidated insight report...")
+
+     # Report body
+     report = f"""
+ # In-Depth Analysis Report on Newspaper & Broadcasting Reader Data
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+ ## 1. Overview
+ - This report combines article performance metrics, content characteristics, reader demographics, and traffic-source data to analyze reader behavior patterns and propose content-strategy improvements based on them.
+ - A total of {data['merged']['article_id'].nunique():,} articles and their related data were analyzed.
+
+ ## 2. Key Findings
+
+ ### 2.1. Content performance
+ - **Performance distribution**: Most articles draw modest view counts while a small number of 'hit' articles drive total views: a long-tail distribution. (see metrics_overview.png)
+ - **Core categories**: '미디어 人사이드', '아이디어스', and '미디어·AI트렌드' top the average-view ranking, suggesting these categories are the core content driving strong reader interest. (see category_avg_views.png)
+ - **Top tags**: Keywords tied to the essence of journalism, such as '#언론', '#기자', '#뉴스', '#미디어', and '#저널리즘', were used most frequently. Technology tags such as '#인공지능', '#AI', and '#테크' also rank high, showing strong interest in tech trends. (see tags_wordcloud.png)
+
+ ### 2.2. Reader characteristics
+ - **Core readership**: Readers from their late teens to early thirties are the core consumption group; the 19-24 female group is especially active.
+ - **Preferences by gender/age**:
+   - **Female**: Teens to early twenties respond most to '커버스토리' and '미디어포럼'; late twenties to thirties respond to in-depth content such as '취재기·제작기' and '미디어 人사이드'.
+   - **Male**: The twenties-thirties group leads overall consumption, with notably strong interest in current-affairs/feature articles such as '커버스토리' and '집중점검'.
+   - (see demographic_category_preference_heatmap.png)
+
+ ### 2.3. Traffic-source efficiency
+ - **Main channels**: Google and Naver channels (integrated search, blog, etc.) account for the overwhelming share of traffic, so search-engine optimization (SEO) matters greatly.
+ - **High-quality traffic**: '네이버 블로그검색' (Naver blog search) combines a high traffic contribution with solid reader engagement, making it an efficient channel. Google drives the most traffic but with comparatively low average engagement, suggesting a broad inflow of casual readers. (see referrer_performance.png)
+
+ ## 3. Strategic Recommendations
+
+ 1. **Strengthen personalization and targeting**:
+    - **Focus on the core readership (19-34)**: Reinforce the in-depth analysis and trend content they prefer, such as '미디어 人사이드' and '미디어·AI트렌드', and develop new features around them.
+    - **Court the potential readership (40+)**: Build on the '집중점검' and '미디어현장' categories that men and women over 40 share an interest in, extending into topics tailored to that age group (e.g., media literacy, fake-news verification).
+
+ 2. **Advance search-engine optimization (SEO)**:
+    - **Pair content with tags**: Combine the popular technology tags surfaced by the word cloud ('#AI', '#디지털', '#플랫폼') with popular categories such as '커버스토리' and '집중점검' to maximize search exposure.
+    - **Use the blog channel**: Naver blog search is confirmed as a key channel bringing in quality readers; producing blog-optimized secondary content (card news, article summaries) is an effective strategy.
+
+ 3. **Increase reader engagement**:
+    - **Benchmark high-engagement categories**: Apply the formats of high-engagement categories such as '글로벌 미디어 현장' and '미디어 리뷰' (expert interviews, case deep dives, clear argumentation) to other articles.
+    - **Add interactive elements**: Close articles with questions inviting reader opinions, or use polls, to encourage comments and interaction.
+ """
+     # Save the report to a file
+     report_path = f'{output_dir}/comprehensive_analysis_report.txt'
+     with open(report_path, 'w', encoding='utf-8') as f:
+         f.write(report)
+
+     print(f" - Consolidated insight report done. (saved {report_path})")
+
+ # 6. Main entry point
+ def main():
+     """Main execution logic."""
+     print("===== Running the Newspaper & Broadcasting reader-data analysis script =====")
+
+     # 1. Environment setup
+     data_dir, output_dir = setup_environment()
+
+     # 2. Load and preprocess data
+     all_data = load_and_preprocess_data(data_dir)
+
+     # 3. Run the detailed analyses and visualizations
+     analyze_metrics_overview(all_data['merged'], output_dir)
+     analyze_content_features(all_data['merged'], output_dir)
+     analyze_demographics(all_data['demo'], all_data['merged'], output_dir)
+     analyze_referrer(all_data['referrer'], all_data['merged'], output_dir)
+
+     # 4. Generate the consolidated insight report
+     generate_insights_report(all_data, output_dir)
+
+     print("\n===== All analyses completed successfully. =====")
+     print(f"Results are in the '{output_dir}' folder.")
+
+ if __name__ == '__main__':
+     main()
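The zero-division guard in load_and_preprocess_data is easy to miss: replacing 0 views with NaN before dividing leaves engagement_rate undefined for unviewed articles instead of producing inf. A minimal sketch of just that step on toy data (the frame here is invented, but the column names match the script):

    import numpy as np
    import pandas as pd

    # Toy frame standing in for df_merged.
    df = pd.DataFrame({
        'views_total': [1000, 0, 250],
        'likes':       [30,   0, 5],
        'comments':    [10,   2, 0],
    })

    # Same formula as the script: (likes + comments) / views * 100,
    # with zero views mapped to NaN so the division never yields inf.
    df['engagement_rate'] = (
        (df['likes'] + df['comments']) / df['views_total'].replace(0, np.nan)
    ) * 100

    print(df)  # engagement_rate -> 4.0, NaN, 2.0

Downstream aggregations such as groupby(...).mean() then skip the unviewed articles automatically, since pandas ignores NaN by default.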
analysis2.py ADDED
@@ -0,0 +1,233 @@
+ # -*- coding: utf-8 -*-
+ """
+ In-depth EDA of Newspaper & Broadcasting reader data (with an AI-model validation angle).
+
+ On top of the existing analysis, this script adds analyses that justify, with data,
+ the need for and validity of AI title/description generation and a RAG-based
+ performance-prediction model.
+
+ Added analyses:
+ - Structural features of successful article titles (length, keyword presence, etc.)
+ - 'Success rate by topic cluster', as grounding for the RAG model
+ """
+
+ # 1. Library imports (same as before)
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from datetime import datetime
+ import warnings
+ import os
+ from wordcloud import WordCloud
+
+ warnings.filterwarnings('ignore')
+
+ # 2. Basic configuration and globals (same as before)
+ def setup_environment():
+     DATA_DIR = r'Broadcast_paper\data_csv'
+     OUTPUT_DIR = r'./output_analysis_v2'  # different output folder
+     if not os.path.exists(OUTPUT_DIR):
+         os.makedirs(OUTPUT_DIR)
+         print(f"Created folder '{OUTPUT_DIR}'.")
+     plt.rc('font', family='Malgun Gothic')
+     plt.rcParams['axes.unicode_minus'] = False
+     sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
+     print("Analysis environment ready!")
+     return DATA_DIR, OUTPUT_DIR
+
+ # 3. Data loading and preprocessing (same as before)
+ def load_and_preprocess_data(data_dir):
+     print("\n[Step 1] Loading and preprocessing data...")
+     df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
+     df_contents = pd.read_csv(f'{data_dir}/contents.csv')
+     df_demo = pd.read_csv(f'{data_dir}/demographics_merged.csv')
+     df_referrer = pd.read_csv(f'{data_dir}/referrer.csv')
+
+     df_metrics['period'] = pd.to_datetime(df_metrics['period'])
+     df_metrics['comments'].fillna(0, inplace=True)
+     df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)
+     df_contents['date'] = pd.to_datetime(df_contents['date'])
+     df_contents['publish_month'] = df_contents['date'].dt.to_period('M')
+     df_contents['publish_dayofweek'] = df_contents['date'].dt.day_name()
+     df_contents['content_length'] = df_contents['content'].str.len()
+     df_demo_filtered = df_demo[df_demo['age_group'] != '전체'].copy()  # drop the '전체' (all-ages) aggregate rows
+
+     article_total_metrics = df_metrics.groupby('article_id').agg({
+         'views_total': 'sum', 'likes': 'sum', 'comments': 'sum'
+     }).reset_index()
+
+     df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
+     df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)
+     df_merged['engagement_rate'] = ((df_merged['likes'] + df_merged['comments']) / df_merged['views_total'].replace(0, np.nan)) * 100
+
+     print("Data loaded and preprocessed!")
+     return {
+         "metrics": df_metrics, "contents": df_contents, "demo": df_demo_filtered,
+         "referrer": df_referrer, "merged": df_merged
+     }
+
+ # 4. Detailed analysis and visualization functions
+ # (analyze_metrics_overview, analyze_content_features, analyze_demographics, and
+ #  analyze_referrer are unchanged from analysis.py; their calls are commented out
+ #  in main() below, so this script runs without them)
+
+ # ==============================================================================
+ # ★★★★★ New analysis functions for validating the AI models ★★★★★
+ # ==============================================================================
+
+ def analyze_title_performance(df_merged, output_dir):
+     """
+     Analyze how title characteristics (length, keywords, numbers, question form)
+     affect article performance. This supports the case for AI-driven title optimization.
+     """
+     print("\n[New analysis 1] Title characteristics vs. article performance...")
+
+     # 1. Feature engineering
+     df_copy = df_merged.copy()
+     df_copy['title'] = df_copy['title'].fillna('')  # guard: missing titles would break the string lambdas below
+     df_copy['title_length'] = df_copy['title'].str.len()
+
+     # Treat the top 20 tags as the core keywords
+     tags = df_copy['tag'].dropna().str.split(',').explode().str.strip()
+     top_20_tags = tags.value_counts().head(20).index.str.replace('#', '')
+
+     df_copy['has_keyword_in_title'] = df_copy['title'].apply(
+         lambda x: any(tag in x for tag in top_20_tags)
+     )
+     df_copy['has_number_in_title'] = df_copy['title'].str.contains(r'\d')
+     df_copy['is_question_title'] = df_copy['title'].str.endswith('?')
+
+     # 2. Visualization
+     fig, axes = plt.subplots(2, 2, figsize=(20, 14))
+     fig.suptitle('Article performance by title characteristic (average views)', fontsize=20, y=1.02)
+
+     # Title length
+     df_copy['title_len_group'] = pd.qcut(df_copy['title_length'], q=4, labels=['very short', 'short', 'long', 'very long'])
+     sns.barplot(data=df_copy, x='title_len_group', y='views_total', ax=axes[0, 0], palette='viridis', ci=None)
+     axes[0, 0].set_title('Average views by title length', fontsize=16)
+     axes[0, 0].set_xlabel('Title length group')
+     axes[0, 0].set_ylabel('Average views')
+
+     # Core keyword present
+     sns.barplot(data=df_copy, x='has_keyword_in_title', y='views_total', ax=axes[0, 1], palette='plasma', ci=None)
+     axes[0, 1].set_title('Average views by core-keyword presence in the title', fontsize=16)
+     axes[0, 1].set_xlabel('Contains a core keyword')
+     axes[0, 1].set_ylabel('')
+
+     # Number present
+     sns.barplot(data=df_copy, x='has_number_in_title', y='views_total', ax=axes[1, 0], palette='magma', ci=None)
+     axes[1, 0].set_title('Average views by number presence in the title', fontsize=16)
+     axes[1, 0].set_xlabel('Contains a number')
+     axes[1, 0].set_ylabel('Average views')
+
+     # Question form
+     sns.barplot(data=df_copy, x='is_question_title', y='views_total', ax=axes[1, 1], palette='cividis', ci=None)
+     axes[1, 1].set_title('Average views by question-form title', fontsize=16)
+     axes[1, 1].set_xlabel('Question-form title')
+     axes[1, 1].set_ylabel('')
+
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/title_characteristics_performance.png')
+     plt.close()
+     print(" - Title characteristics done. (saved title_characteristics_performance.png)")
+
+ def analyze_topic_clusters_for_rag(df_merged, output_dir):
+     """
+     Analyze how concentrated successful articles are within each topic (category).
+     This supports the validity of a RAG model that predicts from 'similar past successes'.
+     """
+     print("\n[New analysis 2] Success rate by topic cluster (grounding for the RAG model)...")
+
+     # 1. Define a 'successful article' (top 20% of views)
+     df_copy = df_merged.copy()
+     performance_threshold = df_copy['views_total'].quantile(0.8)
+     df_copy['is_high_performing'] = df_copy['views_total'] >= performance_threshold
+
+     # 2. Count total and successful articles per category
+     category_success = df_copy.groupby('category').agg(
+         total_articles=('article_id', 'count'),
+         high_performing_articles=('is_high_performing', 'sum')
+     ).reset_index()
+
+     # 3. Success rate per category
+     category_success['success_rate'] = (category_success['high_performing_articles'] / category_success['total_articles']) * 100
+     category_success = category_success.sort_values('success_rate', ascending=False)
+
+     # 4. Visualization
+     plt.figure(figsize=(14, 10))
+     sns.barplot(data=category_success, y='category', x='success_rate', palette='coolwarm')
+     plt.title('Share of top-20% articles per category (success rate)', fontsize=18)
+     plt.xlabel('Success rate (%)')
+     plt.ylabel('Category')
+     plt.axvline(x=20, color='red', linestyle='--', label='overall average success rate (20%)')
+     plt.legend()
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/topic_cluster_success_rate.png')
+     plt.close()
+     print(" - Success rate by topic cluster done. (saved topic_cluster_success_rate.png)")
+
+ # 5. Consolidated insight generation (updated report body)
+ def generate_insights_report(data, output_dir):
+     print("\n[Step 6] Generating the consolidated insight report (with AI-model validation)...")
+
+     report = f"""
+ # In-Depth Analysis Report on Newspaper & Broadcasting Reader Data (Focus: Justifying AI Models)
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+ ## 1. Overview
+ - This report analyzes article performance, reader characteristics, and traffic-source data to demonstrate, with data, the need for and validity of adopting an **AI-based content-personalization system**.
+
+ ## 2. Key Findings
+ (sections 2.1-2.3 as in the previous report; omitted)
+ ...
+
+ ## 3. ★ Validating the AI Title-Recommendation and Performance-Prediction Models ★
+
+ ### 3.1. Why AI title recommendation?: Successful titles follow patterns.
+ - **Evidence**: The structural characteristics of a title have a meaningful effect on average views. (see title_characteristics_performance.png)
+   - **Length**: Titles in the 'long' and 'very long' groups tended to outperform short ones, suggesting it pays to give readers enough information and context to spark interest.
+   - **Core keywords**: Articles whose titles contain top tags such as '#미디어' or '#AI' had **markedly higher average views** than those without, meaning readers respond immediately to familiar keywords they already care about.
+   - **Numbers and form**: Titles containing numbers ('TOP 5', '3가지 이유') or phrased as questions ('~란 무엇인가?') were effective at drawing readers' attention.
+ - **Conclusion**: Analyzing the patterns of successful titles and applying them consistently to new articles matters greatly. **An AI recommendation model can learn these optimal patterns from data and automate the generation of consistently high-performing titles, rather than depending on editors' subjective judgment.**
+
+ ### 3.2. Why trust RAG-based performance prediction?: Success concentrates in specific topics.
+ - **Evidence**: Article success is not random; it shows **high concentration within specific topics (categories)**. (see topic_cluster_success_rate.png)
+   - **High-success-rate categories**: In categories such as '미디어 人사이드', '미디어·AI트렌드', and '아이디어스', more than 30% of articles land in the overall top 20%, meaning the topic itself comes close to being a **guarantee of reader interest**.
+   - **Low-success-rate categories**: By contrast, some categories sit below a 10% success rate, showing that the same effort is unlikely to yield high performance there.
+ - **Conclusion**: Whether an article succeeds is closely tied to which **topic cluster** it belongs to. **It is therefore well grounded in the data for a RAG model to retrieve 'similar past success cases' for a new article and predict from their performance**: predict high readership when the article resembles a high-success-rate cluster, and low readership otherwise.
+
+ ## 4. Strategic Recommendations (Centered on Adopting the AI System)
+
+ 1. **Adopt an AI title/description generator**: **Train the AI model on the 'successful title patterns' validated by this EDA** (appropriate length, core keywords, numbers/questions) and have it automatically generate and recommend titles and descriptions for all new content, raising the performance floor.
+
+ 2. **'Select and focus' with the RAG prediction model**: At the planning stage, **feed the core topic and draft title into the RAG model to preview the 'expected readership'**.
+    - Prioritize and resource the high-prediction drafts; for low-prediction drafts, combine them with high-interest topics or revise the title pattern, reducing the failure rate through **data-driven decision-making**.
+
+ 3. **Refine the models with A/B testing**: Run A/B tests over the AI's candidate titles and feed the observed performance back into the models, continually improving recommendation and prediction accuracy.
+ """
+     report_path = f'{output_dir}/comprehensive_analysis_report_for_ai_validation.txt'
+     with open(report_path, 'w', encoding='utf-8') as f:
+         f.write(report)
+     print(f" - Consolidated insight report done. (saved {report_path})")
+
+ # 6. Main entry point
+ def main():
+     print("===== Running the reader-data analysis script (AI-model validation angle) =====")
+
+     data_dir, output_dir = setup_environment()
+     all_data = load_and_preprocess_data(data_dir)
+
+     # --- Existing analyses (uncomment as needed) ---
+     # analyze_metrics_overview(all_data['merged'], output_dir)
+     # analyze_content_features(all_data['merged'], output_dir)
+     # analyze_demographics(all_data['demo'], all_data['merged'], output_dir)
+     # analyze_referrer(all_data['referrer'], all_data['merged'], output_dir)
+
+     # --- ★ New analyses ★ ---
+     analyze_title_performance(all_data['merged'], output_dir)
+     analyze_topic_clusters_for_rag(all_data['merged'], output_dir)
+
+     generate_insights_report(all_data, output_dir)
+
+     print("\n===== All analyses completed successfully. =====")
+     print(f"Results are in the '{output_dir}' folder.")
+
+ if __name__ == '__main__':
+     main()
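The report above argues for predicting a draft's performance from 'similar past success cases'. No retrieval code ships in this file, so the following is only a sketch of that idea under simple assumptions: TF-IDF vectors over titles, cosine similarity for retrieval, and the mean views of the k nearest neighbors as the prediction. predict_views, past_titles, and past_views are hypothetical names and the data is invented.

    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Hypothetical archive of past articles with known view counts.
    past_titles = [
        "AI is reshaping the newsroom",
        "How algorithms pick what you read",
        "A day in the life of a media reporter",
        "Short-form video and the MZ generation",
    ]
    past_views = np.array([12000, 9500, 3000, 11000])

    def predict_views(new_title: str, k: int = 2) -> float:
        """Retrieve the k most similar past titles and average their views."""
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(past_titles + [new_title])
        sims = cosine_similarity(matrix[-1], matrix[:-1]).ravel()
        top_k = sims.argsort()[::-1][:k]
        return float(past_views[top_k].mean())

    print(predict_views("How AI algorithms choose the news"))

Real Korean titles would need a morphological tokenizer or sentence embeddings rather than whitespace TF-IDF; this only illustrates the retrieve-then-aggregate shape the report describes.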
analysis3.py ADDED
@@ -0,0 +1,260 @@
+ # -*- coding: utf-8 -*-
+ """
+ In-depth EDA of Newspaper & Broadcasting reader data (monthly analysis with
+ stronger numeric/trend readability).
+
+ Strengthens the monthly trend analysis: every visualization is annotated with
+ exact values, and month-over-month growth rates are shown explicitly so that
+ trends are easier to read.
+ """
+
+ # 1. Library imports (same as before)
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from datetime import datetime
+ import warnings
+ import os
+
+ warnings.filterwarnings('ignore')
+
+ # --- Visualization helper ---
+ def add_value_labels(ax, is_bar=True, fmt="{:.0f}"):
+     """Attach value labels to the bars or lines of a plot."""
+     for p in (ax.patches if is_bar else ax.lines):
+         if is_bar:
+             ax.annotate(fmt.format(p.get_height()),
+                         (p.get_x() + p.get_width() / 2., p.get_height()),
+                         ha='center', va='center',
+                         xytext=(0, 9),
+                         textcoords='offset points',
+                         fontsize=9,
+                         color='dimgray')
+         else:  # line plots
+             for x_value, y_value in zip(p.get_xdata(), p.get_ydata()):
+                 ax.text(x_value, y_value, fmt.format(y_value),
+                         ha='center', va='bottom',
+                         fontsize=9,
+                         color='dimgray')
+
+ # 2. Basic configuration and globals
+ def setup_environment():
+     DATA_DIR = r'Broadcast_paper\data_csv'
+     OUTPUT_DIR = r'./output_analysis_v4'  # different output folder
+     if not os.path.exists(OUTPUT_DIR):
+         os.makedirs(OUTPUT_DIR)
+         print(f"Created folder '{OUTPUT_DIR}'.")
+     plt.rc('font', family='Malgun Gothic')
+     plt.rcParams['axes.unicode_minus'] = False
+     sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
+     print("Analysis environment ready!")
+     return DATA_DIR, OUTPUT_DIR
+
+ # 3. Data loading and preprocessing (same as before)
+ def load_and_preprocess_data(data_dir):
+     print("\n[Step 1] Loading and preprocessing data...")
+     df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
+     df_contents = pd.read_csv(f'{data_dir}/contents.csv')
+     df_demo = pd.read_csv(f'{data_dir}/demographics_merged.csv')
+     df_referrer = pd.read_csv(f'{data_dir}/referrer.csv')
+
+     # Normalize all date columns to monthly periods
+     df_metrics['period'] = pd.to_datetime(df_metrics['period']).dt.to_period('M')
+     df_contents['publish_month'] = pd.to_datetime(df_contents['date']).dt.to_period('M')
+     df_demo['period'] = pd.to_datetime(df_demo['period']).dt.to_period('M')
+     df_referrer['period'] = pd.to_datetime(df_referrer['period']).dt.to_period('M')
+
+     df_metrics['comments'].fillna(0, inplace=True)
+     df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)
+     df_contents['content_length'] = df_contents['content'].str.len()
+     df_demo_filtered = df_demo[df_demo['age_group'] != '전체'].copy()  # drop the '전체' (all-ages) aggregate rows
+
+     article_total_metrics = df_metrics.groupby('article_id').agg({
+         'views_total': 'sum', 'likes': 'sum', 'comments': 'sum'
+     }).reset_index()
+
+     df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
+     df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)
+     df_merged['engagement_rate'] = ((df_merged['likes'] + df_merged['comments']) / df_merged['views_total'].replace(0, np.nan)) * 100
+
+     print("Data loaded and preprocessed!")
+     return {
+         "metrics": df_metrics, "contents": df_contents, "demo": df_demo_filtered,
+         "referrer": df_referrer, "merged": df_merged
+     }
+
+ # ==============================================================================
+ # ★★★★★ Monthly analysis with maximized numeric/trend readability ★★★★★
+ # ==============================================================================
+ def analyze_enhanced_monthly_trends(data, output_dir):
+     """
+     Analyze the monthly dynamics of the key metrics, with exact values on the charts.
+     """
+     print("\n[New analysis 4] In-depth monthly trend analysis (values emphasized)...")
+
+     # --- 1. Monthly performance metrics and growth rates ---
+     monthly_metrics = data['metrics'].groupby('period').agg(
+         total_views=('views_total', 'sum'),
+         total_likes=('likes', 'sum'),
+         total_comments=('comments', 'sum')
+     ).sort_index()
+
+     # Month-over-month (MoM) growth
+     for col in monthly_metrics.columns:
+         monthly_metrics[f'{col}_mom'] = monthly_metrics[col].pct_change() * 100
+
+     monthly_metrics.index = monthly_metrics.index.to_timestamp()
+
+     fig, axes = plt.subplots(2, 1, figsize=(18, 14), sharex=True)
+     fig.suptitle('Monthly performance and month-over-month (MoM) growth', fontsize=20, y=1.0)
+
+     # Top panel: absolute numbers (views + likes)
+     ax1 = axes[0]
+     ax1.bar(monthly_metrics.index, monthly_metrics['total_views'], color='lightgray', label='total views')
+     add_value_labels(ax1, is_bar=True, fmt="{:,.0f}")  # annotate the bars
+     ax1.set_ylabel('Total views', fontsize=12)
+
+     ax1_twin = ax1.twinx()
+     ax1_twin.plot(monthly_metrics.index, monthly_metrics['total_likes'], marker='o', color='coral', label='total likes')
+     add_value_labels(ax1_twin, is_bar=False, fmt="{:.0f}")  # annotate the line
+     ax1_twin.set_ylabel('Total likes', fontsize=12)
+
+     # Merge the two legends
+     lines, labels = ax1.get_legend_handles_labels()
+     lines2, labels2 = ax1_twin.get_legend_handles_labels()
+     ax1_twin.legend(lines + lines2, labels + labels2, loc='upper left')
+     ax1.set_title('Monthly total views and likes', fontsize=16)
+
+     # Bottom panel: growth rates (%)
+     ax2 = axes[1]
+     ax2.plot(monthly_metrics.index, monthly_metrics['total_views_mom'], marker='s', linestyle='--', label='views growth (%)')
+     ax2.plot(monthly_metrics.index, monthly_metrics['total_likes_mom'], marker='^', linestyle='--', label='likes growth (%)')
+     ax2.axhline(0, color='red', linewidth=1, linestyle=':')
+     ax2.set_ylabel('MoM growth (%)', fontsize=12)
+     ax2.legend()
+     ax2.set_title('Monthly growth of the key metrics (MoM)', fontsize=16)
+
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/monthly_performance_and_growth.png')
+     plt.close()
+     print(" - Monthly performance and growth done. (saved monthly_performance_and_growth.png)")
+
+     # --- 2. Monthly category mix (chart + data table) ---
+     monthly_category_dist = data['merged'].groupby(['publish_month', 'category'])['article_id'].count().unstack().fillna(0)
+     monthly_category_prop = monthly_category_dist.div(monthly_category_dist.sum(axis=1), axis=0) * 100
+
+     top_categories = data['merged']['category'].value_counts().nlargest(7).index
+     other_categories = monthly_category_prop.columns.difference(top_categories)
+     monthly_category_prop['기타'] = monthly_category_prop[other_categories].sum(axis=1)  # '기타' = 'other'
+
+     # Chart
+     monthly_category_prop[top_categories.tolist() + ['기타']].plot(
+         kind='bar', stacked=True, figsize=(16, 8), colormap='tab20c'
+     )
+     plt.title('Monthly share of content categories (%)', fontsize=18)
+     plt.xlabel('Month'); plt.ylabel('Category share (%)'); plt.xticks(rotation=45)
+     plt.legend(title='Category', bbox_to_anchor=(1.02, 1), loc='upper left')
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/monthly_category_distribution_with_values.png')
+     plt.close()
+
+     # Data table
+     print("\n--- Monthly share (%) of the top categories ---")
+     category_table_data = monthly_category_prop[top_categories.tolist() + ['기타']].round(1)
+     print(category_table_data)
+     print(" - Monthly category mix done. (saved monthly_category_distribution_with_values.png and printed the table)")
+
+     # --- 3. Monthly core reader age groups (chart + data table) ---
+     monthly_age_views = data['demo'].groupby(['period', 'age_group'])['views'].sum().unstack().fillna(0)
+     monthly_age_prop = (monthly_age_views.div(monthly_age_views.sum(axis=1), axis=0) * 100).round(1)
+
+     # Chart
+     monthly_age_prop.plot(kind='line', marker='o', figsize=(18, 9), colormap='viridis', ms=4)
+     plt.title('Monthly contribution of each age group to views (%)', fontsize=18)
+     plt.xlabel('Month'); plt.ylabel('Share of views by age group (%)'); plt.xticks(rotation=45)
+     plt.legend(title='Age Group', bbox_to_anchor=(1.02, 1), loc='upper left')
+     plt.grid(which='major', linestyle='--', linewidth='0.5')
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/monthly_age_contribution_line.png')
+     plt.close()
+
+     # Data table
+     print("\n--- Monthly contribution (%) by age group ---")
+     print(monthly_age_prop)
+     print(" - Monthly core readership done. (saved monthly_age_contribution_line.png and printed the table)")
+
+     # Data handed to the report
+     return {
+         "monthly_metrics": monthly_metrics,
+         "category_table": category_table_data,
+         "age_table": monthly_age_prop
+     }
+
+
+ # 5. Consolidated insight generation (updated report body)
+ def generate_insights_report(monthly_data, output_dir):
+     print("\n[Step 6] Generating the consolidated insight report (monthly numbers emphasized)...")
+
+     # Render the data tables as strings
+     category_table_str = monthly_data['category_table'].to_string()
+     age_table_str = monthly_data['age_table'].to_string()
+
+     report = f"""
+ # In-Depth Analysis Report on Newspaper & Broadcasting Reader Data (Monthly Trends, Numbers Emphasized)
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+ (sections 1-4 as in the previous report; omitted)
+ ...
+
+ ## 5. ★ Monthly Dynamics in Numbers ★
+
+ Analyzing the monthly evolution of performance, strategy, and readership with an emphasis on concrete figures yields the following insights.
+
+ ### 5.1. Performance volatility and growth momentum
+ - **Trend**: In April 2024, total views reached 21,015, **up a strong 16.2% month over month**. Likes that month hit 290, **an explosive 161.3% increase over the previous month**, which indicates that a particular feature resonated strongly with readers. (see monthly_performance_and_growth.png)
+ - **Growth and decline**: By contrast, January 2025 saw steep drops in both views (-25.5%) and likes (-61.6%). Given this month-to-month volatility, a strategy of **analyzing what drove the strong months and applying it to the weak ones** is urgent.
+
+ ### 5.2. A content strategy validated by data
+ - **Strategy shift**: As the table below shows, the share of the '미디어·AI트렌드' category has grown steadily since late 2024, reaching **about 5% of all content** in recent months and establishing itself as a major category.
+ - **Outcome**: The strategy worked. '미디어·AI트렌드' is a high-average-views, high-engagement category, and the growth of this content contributed to bringing in a new professional readership.
+ (see monthly_category_distribution_with_values.png)
+
+ --- Monthly share (%) of the top categories ---
+ {category_table_str}
+ ---------------------------------------------
+
+ ### 5.3. Signs of generational turnover in the core readership
+ - **Core readership**: The 19-24 group remains the largest (roughly 20-25% on average).
+ - **Notable shift**: As the data below makes clear, in 2025 the **contribution of the 30-34 group has risen steadily from 12.1% to 14.5%**, a very positive signal of a potential new growth engine. The share of the 13-18 group, meanwhile, is slowly declining.
+ (see monthly_age_contribution_line.png)
+
+ --- Monthly contribution (%) by age group ---
+ {age_table_str}
+ ---------------------------------------------
+
+ ## 6. Final Strategic Recommendations (Numbers-Based)
+ 1. **Growth-rate-driven performance management**: At the end of each month, review the 'monthly performance and growth' dashboard, **diagnose what caused growth spikes or drops, and feed the findings straight into next month's content planning**.
+ 2. **Data-driven category rebalancing**: **Gradually expand the proven '미디어·AI트렌드' category from its current 5% toward 8-10%** while shrinking underperforming categories: select and focus.
+ 3. **Target readers in their thirties**: Officially designate the steadily rising thirties readership as the **core growth target**, and launch content around their interests ('careers', 'media-industry trends', 'business models') to accelerate their inflow.
+ """
+     report_path = f'{output_dir}/comprehensive_analysis_report_with_enhanced_trends.txt'
+     with open(report_path, 'w', encoding='utf-8') as f:
+         f.write(report)
+     print(f"\n - Consolidated insight report done. (saved {report_path})")
+
+ # 6. Main entry point
+ def main():
+     print("===== Reader-data analysis (monthly trends, numbers emphasized) =====")
+
+     data_dir, output_dir = setup_environment()
+     all_data = load_and_preprocess_data(data_dir)
+
+     # --- ★ Run the value-annotated monthly analysis ★ ---
+     monthly_analysis_data = analyze_enhanced_monthly_trends(all_data, output_dir)
+
+     generate_insights_report(monthly_analysis_data, output_dir)
+
+     print("\n===== All analyses completed successfully. =====")
+     print(f"Results are in the '{output_dir}' folder.")
+
+ if __name__ == '__main__':
+     main()
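The MoM loop in analyze_enhanced_monthly_trends reduces to pct_change on a month-indexed series. A quick, self-contained check of the arithmetic with invented numbers (not the real dataset):

    import pandas as pd

    # Illustrative monthly view totals.
    views = pd.Series(
        [10000, 11620, 9000],
        index=pd.period_range("2024-03", periods=3, freq="M"),
    )

    # Month-over-month growth, exactly as in the script: pct_change() * 100.
    mom = views.pct_change() * 100
    print(mom.round(1))
    # 2024-03     NaN
    # 2024-04    16.2
    # 2024-05   -22.5

pct_change leaves the first month undefined (NaN), which is why the growth curves in the script start one month later than the bars.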
analysis4.py ADDED
@@ -0,0 +1,197 @@
+ # -*- coding: utf-8 -*-
+ """
+ In-depth EDA of Newspaper & Broadcasting reader data (view-centric success formula, v2)
+
+ - Bug fix: resolved the tick_params 'ha' error
+ - Deeper analysis: adds logic that quantitatively validates the qualitative insights
+   (bracket prefixes, trend keywords) found in the TOP 20 article list
+ """
+
+ # 1. Library imports
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from datetime import datetime
+ import warnings
+ import os
+ import re
+
+ warnings.filterwarnings('ignore')
+
+ # 2. Basic configuration and globals
+ def setup_environment():
+     DATA_DIR = r'Broadcast_paper\data_csv'
+     OUTPUT_DIR = r'./output_analysis_v6'  # different output folder
+     if not os.path.exists(OUTPUT_DIR):
+         os.makedirs(OUTPUT_DIR)
+         print(f"Created folder '{OUTPUT_DIR}'.")
+     plt.rc('font', family='Malgun Gothic')
+     plt.rcParams['axes.unicode_minus'] = False
+     sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
+     print("Analysis environment ready!")
+     return DATA_DIR, OUTPUT_DIR
+
+ # 3. Data loading and preprocessing
+ def load_and_preprocess_data(data_dir):
+     print("\n[Step 1] Loading and preprocessing data...")
+     df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
+     df_contents = pd.read_csv(f'{data_dir}/contents.csv')
+
+     df_metrics['comments'].fillna(0, inplace=True)
+     df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)
+     df_contents['date'] = pd.to_datetime(df_contents['date'])
+     df_contents['publish_dayofweek'] = df_contents['date'].dt.day_name()
+     df_contents['content_length'] = df_contents['content'].str.len()
+     df_contents['title_length'] = df_contents['title'].str.len()
+
+     article_total_metrics = df_metrics.groupby('article_id').agg({
+         'views_total': 'sum', 'likes': 'sum', 'comments': 'sum'
+     }).reset_index()
+
+     df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
+     df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)
+
+     print("Data loaded and preprocessed!")
+     return df_merged
+
+ # ==============================================================================
+ # ★★★★★ In-depth analysis of the top-10%-by-views hit articles (fixed and extended) ★★★★★
+ # ==============================================================================
+ def analyze_high_view_articles_v2(df_merged, output_dir):
+     """
+     Analyze the top 10% of articles by views to derive success factors. (v2: adds qualitative features)
+     """
+     print("\n[Core analysis] Top-10%-by-views hit articles (v2)...")
+
+     # --- 1. Define 'hit articles' and split the data ---
+     view_threshold = df_merged['views_total'].quantile(0.9)
+     print(f" - Top-10% view threshold: {view_threshold:,.0f} views or more")
+
+     df_merged['group'] = np.where(df_merged['views_total'] >= view_threshold, 'TOP 10%', 'bottom 90%')
+
+     # --- 2. Which articles drew the most views? (TOP 20 list) ---
+     top_20_list = df_merged.sort_values('views_total', ascending=False).head(20)
+     top_20_table = top_20_list[['title', 'category', 'views_total', 'likes', 'comments']].reset_index(drop=True)
+     print("\n--- TOP 20 articles by views ---")
+     print(top_20_table)
+
+     # --- 3. ★ Quantify the qualitative features (new columns) ★ ---
+     titles = df_merged['title'].fillna('')  # guard: missing titles would break the string lambdas below
+     df_merged['has_bracket_prefix'] = titles.apply(lambda x: bool(re.match(r'^\[.+\]', x)))
+     # trend keywords: short-form, MZ, algorithm, ChatGPT, AI, artificial intelligence
+     trend_keywords = ['숏폼', 'MZ', '알고리즘', '챗GPT', 'AI', '인공지능']
+     df_merged['has_trend_keyword'] = titles.apply(
+         lambda x: any(keyword in x for keyword in trend_keywords)
+     )
+
+     # --- 4. Characterize and visualize the hit articles ---
+     fig, axes = plt.subplots(3, 2, figsize=(20, 24))
+     fig.suptitle(f"TOP 10% articles by views vs. the rest (threshold: {view_threshold:,.0f} views)", fontsize=22, y=1.01)
+
+     # (1) Category distribution
+     cat_comp_df = df_merged.groupby('group')['category'].value_counts(normalize=True).mul(100).unstack().T
+     cat_comp_df = cat_comp_df.sort_values('TOP 10%', ascending=False).head(10)
+     cat_comp_df.plot(kind='bar', ax=axes[0, 0], rot=45)
+     axes[0, 0].set_title('Category distribution of hit articles', fontsize=16)
+     axes[0, 0].set_ylabel('Share (%)')
+     # ★★★ bug fix: tick_params has no 'ha' argument; align the labels via plt.setp ★★★
+     plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
+
+     # (2) Body length
+     sns.boxplot(data=df_merged, x='group', y='content_length', ax=axes[0, 1], order=['TOP 10%', 'bottom 90%'])
+     axes[0, 1].set_title('Body length', fontsize=16); axes[0, 1].set_ylabel('Characters')
+     axes[0, 1].set_ylim(0, df_merged['content_length'].quantile(0.95))
+
+     # (3) Title length
+     sns.boxplot(data=df_merged, x='group', y='title_length', ax=axes[1, 0], order=['TOP 10%', 'bottom 90%'])
+     axes[1, 0].set_title('Title length', fontsize=16); axes[1, 0].set_ylabel('Characters')
+
+     # (4) Publication weekday
+     day_comp_df = df_merged.groupby('group')['publish_dayofweek'].value_counts(normalize=True).mul(100).unstack().T
+     day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
+     day_comp_df.reindex(day_order).plot(kind='bar', ax=axes[1, 1], rot=0)
+     axes[1, 1].set_title('Distribution by publication weekday', fontsize=16); axes[1, 1].set_ylabel('Share (%)')
+
+     # ★★★ (5) Bracket prefix ([..]) in the title (new analysis) ★★★
+     sns.barplot(data=df_merged, x='has_bracket_prefix', y='views_total', ax=axes[2, 0], ci=None)
+     axes[2, 0].set_title('Average views by bracket-prefix usage in the title', fontsize=16)
+     axes[2, 0].set_xlabel('Uses a bracket prefix'); axes[2, 0].set_ylabel('Average views')
+
+     # ★★★ (6) Trend keyword in the title (new analysis) ★★★
+     sns.barplot(data=df_merged, x='has_trend_keyword', y='views_total', ax=axes[2, 1], ci=None)
+     axes[2, 1].set_title('Average views by trend-keyword presence in the title', fontsize=16)
+     axes[2, 1].set_xlabel('Contains a trend keyword'); axes[2, 1].set_ylabel('Average views')
+
+     plt.tight_layout()
+     plt.savefig(f'{output_dir}/high_view_article_characteristics_v2.png')
+     plt.close()
+
+     print("\n - Hit-article comparison (v2) done. (saved high_view_article_characteristics_v2.png)")
+
+     return top_20_table, cat_comp_df
+
+ # 4. Consolidated insight generation (strengthened report)
+ def generate_insights_report_v2(top_20_table, cat_comp_df, output_dir):
+     print("\n[Step 6] Generating the consolidated insight report (success formula)...")
+
+     top_20_str = top_20_table.to_string()
+     cat_comp_str = cat_comp_df.head(5).round(1).to_string()
+
+     report = f"""
+ # In-Depth Analysis Report on Newspaper & Broadcasting Reader Data (View-Centric Success Formula v2)
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+ ## 1. Goal
+ - Analyze what 'hit articles' have in common, quantitatively and qualitatively, to derive an **actionable success formula**.
+
+ ## 2. TOP 20 'hit articles' by views
+ {top_20_str}
+
+ ## 3. ★ The Strengthened Success Formula of Blockbuster Articles ★
+
+ (see high_view_article_characteristics_v2.png)
+
+ ### Formula 1: Concentrate on the 'hit factory' categories.
+ - **Evidence**: Over 60% of hit articles came from just three categories: '커버스토리', '미디어현장', and '취재기·제작기'. These are proven success areas.
+
+ ### Formula 2: Let the title say everything.
+ - **(New finding) The bracket-prefix effect**: Articles whose titles carry a topic-summarizing prefix such as **'[중국]' or '[알고리즘]' had markedly higher average views than those without.** Readers click when they can grasp the gist of an article from the title alone.
+ - **(New finding) Claim the trend keywords**: Articles with **timely trend keywords such as '숏폼', 'MZ', or 'AI' in the title recorded overwhelmingly higher average views.** Readers react keenly to current issues.
+
+ ### Formula 3: Long, deep content wins.
+ - **Evidence**: Hit articles tend to have **much longer bodies** than ordinary ones; readers place more value on in-depth long-form content.
+
+ ### Formula 4: Play the big cards early in the week (Mon/Tue).
+ - **Evidence**: A large share of hit articles were **published on Monday or Tuesday**, when readers' appetite for content is highest.
+
+ ## 4. A 'Success Formula' Checklist for Execution
+ - When planning and publishing a new article, use the checklist below to maximize its odds of success.
+
+ | Checklist item | Strategy |
+ | ---------------------------------------------- | ------------------------------------------------------------------ |
+ | **1. Category selection** | Is it a proven category such as '커버스토리' or '미디어현장'? |
+ | **2. Title: bracket prefix** | Does the title use a clear [bracket prefix] that catches the reader's eye? |
173
+ | **3. ์ œ๋ชฉ - ํ‚ค์›Œ๋“œ ํฌํ•จ** | ์ง€๊ธˆ ๊ฐ€์žฅ ๋œจ๊ฑฐ์šด 'ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ'๋ฅผ ์ œ๋ชฉ์— ํฌํ•จํ–ˆ๋Š”๊ฐ€? |
174
+ | **4. ์ฝ˜ํ…์ธ  ๊นŠ์ด** | ๋…์ž๊ฐ€ ์‹œ๊ฐ„์„ ํˆฌ์žํ•  ๋งŒํ•œ ๊นŠ์ด์™€ ์ „๋ฌธ์„ฑ์„ ๊ฐ–์ถ˜ ๋กฑํผ ์ฝ˜ํ…์ธ ์ธ๊ฐ€? |
175
+ | **5. ๋ฐœํ–‰ ์‹œ์ ** | ๊ฐ€์žฅ ์ค‘์š”ํ•œ ๊ธฐ์‚ฌ๋ฅผ 'ํ”„๋ผ์ž„ ํƒ€์ž„'์ธ ์›”์š”์ผ ์˜ค์ „์— ๋ฐœํ–‰ํ•˜๋Š”๊ฐ€? |
176
+ """
177
+ report_path = f'{output_dir}/high_view_focused_analysis_report_v2.txt'
178
+ with open(report_path, 'w', encoding='utf-8') as f:
179
+ f.write(report)
180
+ print(f"\n - ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ(v2) ์ƒ์„ฑ ์™„๋ฃŒ. ({report_path} ์ €์žฅ)")
181
+
182
+ # 5. ๋ฉ”์ธ ์‹คํ–‰ ํ•จ์ˆ˜
183
+ def main():
184
+ print("===== ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ (์กฐํšŒ์ˆ˜ ์ค‘์‹ฌ ์„ฑ๊ณต ๊ณต์‹ v2) =====")
185
+
186
+ data_dir, output_dir = setup_environment()
187
+ df_merged = load_and_preprocess_data(data_dir)
188
+
189
+ top_20, cat_comp = analyze_high_view_articles_v2(df_merged, output_dir)
190
+
191
+ generate_insights_report_v2(top_20, cat_comp, output_dir)
192
+
193
+ print("\n===== ๋ชจ๋“  ๋ถ„์„์ด ์„ฑ๊ณต์ ์œผ๋กœ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. =====")
194
+ print(f"๊ฒฐ๊ณผ๋ฌผ์€ '{output_dir}' ํด๋”์—์„œ ํ™•์ธํ•˜์‹ค ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
195
+
196
+ if __name__ == '__main__':
197
+ main()
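
A quick sanity check for the two title features introduced above. The sketch below mirrors the `has_bracket_prefix` regex and the `has_trend_keyword` lookup from `analyze_high_view_articles_v2` on a few sample titles; the titles are invented for illustration and are not from the dataset.

import re

trend_keywords = ['숏폼', 'MZ', '알고리즘', '챗GPT', 'AI', '인공지능']

def title_features(title: str) -> dict:
    """Mirror of the feature logic in analyze_high_view_articles_v2."""
    return {
        'has_bracket_prefix': bool(re.match(r'^\[.+\]', title)),
        'has_trend_keyword': any(keyword in title for keyword in trend_keywords),
    }

# Hypothetical titles, for illustration only.
for title in ['[알고리즘] 추천이 뉴스 소비를 바꾼다', '숏폼 시대의 저널리즘', '지역 언론의 오늘']:
    print(title, title_features(title))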
app.py CHANGED
@@ -263,11 +263,11 @@ def generate_seo_suggestions(content: str) -> Dict[str, str]:
        "You are a lead digital editor for a korean prestigious online media company that bridges in-depth analysis with current trends. "
        "Your mission is to craft an SEO title and description that are both intelligent and highly shareable. The goal is to highlight the article's most timely, newsworthy, and debate-sparking elements to maximize public interest and social engagement.\n\n"
        "Guidelines:\n"
-       "1. **'title' (under 60 characters):** Frame the core topic as a compelling thesis or a provocative question. Connect it to a current conversation or a surprising trend to make it feel urgent and relevant *today*. It should make people think, 'This is an interesting take.'\n"
+       "1. **'title' (under 60 characters):** **Start with a topic tag in brackets (e.g., `[주제]`)** that summarizes the core subject. Following the tag, frame the core topic as a compelling thesis or a provocative question. Connect it to a current conversation or a surprising trend to make it feel urgent and relevant *today*. It should make people think, 'This is an interesting take.'\n"
        "2. **'description' (under 150 characters, in Korean):** Go beyond summary. Contextualize the article's importance. Explain *why* this topic matters *now* and what new perspective the article offers on a familiar issue. It should persuade readers that this article will give them a crucial viewpoint for today's conversations.\n"
        "3. **Format:** Respond strictly with a valid JSON object with 'title' and 'description' keys. Avoid generic phrases, clickbait, and anything that undermines the intellectual integrity of the brand.\n\n"
        f"Article Content:\n{safe_content}\n\n"
-       "Return exactly: {\"title\": \"<생성된 제목>\", \"description\": \"<생성된 설명>\"}"
+       "Return exactly: {\"title\": \"[<주제>] <생성된 제목>\", \"description\": \"<생성된 설명>\"}"
    )
    try:
        response = SEO_GENERATIVE_MODEL.generate_content(prompt)
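
The prompt now pins the title to a `[<주제>] <제목>` shape, so downstream code can verify it. The actual parsing in app.py is not part of this diff; the sketch below is one hedged way such a check could look, assuming the model reply is available as raw text (e.g., via a `.text`-style attribute on the response).

import json
import re

def parse_seo_response(raw_text: str) -> dict:
    """Sketch: extract the JSON object and flag titles missing the topic tag.

    Assumption: raw_text is the raw model reply; the real parsing logic in
    app.py is not shown in this commit.
    """
    match = re.search(r'\{.*\}', raw_text, re.DOTALL)  # tolerate stray prose or code fences
    if not match:
        raise ValueError("No JSON object found in model response")
    data = json.loads(match.group(0))
    # The prompt requires a leading [topic] tag; record whether it is present.
    data["title_has_tag"] = bool(re.match(r'^\[.+?\]\s*\S', data.get("title", "")))
    return data

print(parse_seo_response('{"title": "[AI] 알고리즘이 편집국을 바꾼다", "description": "예시 설명"}'))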
train_and_save_models.py CHANGED
@@ -1,154 +1,222 @@
- """Training pipeline for the "신문과방송" article performance prediction project.
- 
- This script prepares the datasets, engineers features using Okt-powered
- TF-IDF and categorical encodings, trains XGBoost models for view-count and
- primary audience prediction, and persists all artifacts required by the Flask
- inference service.
- 
- The script is intended to be executed once the raw CSV files are available in
- `data_csv/`. Running it will generate the following files in the project root:
- 
- - tfidf_vectorizer.pkl
- - onehot_encoder.pkl
- - label_encoder.pkl
- - view_prediction_model.pkl
- - age_prediction_model.pkl
- - text_features_matrix.pkl
- - article_mapping.pkl
+ """
+ Training pipeline for the "신문과방송" article performance prediction project.
+ 
+ This script prepares the datasets, engineers features using a parallelized
+ Okt-powered TF-IDF and categorical encodings, tunes and trains XGBoost models
+ for view-count (with log transformation) and primary audience prediction,
+ and persists all artifacts.
+ 
+ It also includes a function to demonstrate finding similar articles based on
+ content.
+ 
+ Improvements from the original version:
+ - Centralized configuration management (CONFIG).
+ - Standardized logging instead of print().
+ - Parallelized Okt tokenizer for significant speed-up.
+ - Log-transformed target variable (views) for improved regression performance.
+ - Hyperparameter tuning using Optuna for both models.
+ - Early stopping during model training to prevent overfitting.
+ - Demonstration of a similar article search function.
  """
  from __future__ import annotations
  
+ import logging
  import sys
  from pathlib import Path
- from typing import List, Optional, Tuple, cast
+ from typing import Any, Dict, List, Tuple, cast
  
  import joblib
  import numpy as np
  import pandas as pd
+ from sklearn.metrics.pairwise import cosine_similarity
+ 
+ # Optuna for hyperparameter tuning
+ try:
+     import optuna
+ except ImportError:
+     print("Optuna is not installed. Please run: pip install optuna")
+     sys.exit(1)
+ 
  from konlpy.tag import Okt
  from scipy.sparse import csr_matrix, hstack
  from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics import accuracy_score, mean_absolute_error
+ from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import LabelEncoder, OneHotEncoder
  from xgboost import XGBClassifier, XGBRegressor
  
- DATA_DIR = Path("data_csv")
- CONTENTS_PATH = DATA_DIR / "contents.csv"
- METRICS_PATH = DATA_DIR / "article_metrics_monthly.csv"
- DEMOGRAPHICS_PATH = DATA_DIR / "demographics_merged.csv"
+ # --- 1. Centralized configuration ---
+ # Note: every major setting lives here so experiment conditions can be
+ # changed without touching the rest of the code.
+ CONFIG = {
+     "data_dir": Path("./data_csv"),
+     "paths": {
+         "contents": "contents.csv",
+         "metrics": "article_metrics_monthly.csv",
+         "demographics": "demographics_merged.csv",
+     },
+     "artifacts": {
+         "vectorizer": "tfidf_vectorizer.pkl",
+         "onehot_encoder": "onehot_encoder.pkl",
+         "label_encoder": "label_encoder.pkl",
+         "view_model": "view_prediction_model.pkl",
+         "age_model": "age_prediction_model.pkl",
+         "text_features": "text_features_matrix.pkl",
+         "article_mapping": "article_mapping.pkl",
+     },
+     "feature_engineering": {
+         "tfidf_max_features": 5000,
+         "test_size": 0.2,
+         "random_state": 42,
+     },
+     "optuna": {
+         "n_trials_reg": 50,  # tuning trials for the view-count regressor
+         "n_trials_clf": 50,  # tuning trials for the age-group classifier
+     },
+ }
+ 
+ # --- 2. Logging setup ---
+ # Note: logging replaces print() so output is timestamped and leveled.
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] - %(message)s",
+     stream=sys.stdout,
+ )
+ 
+ 
+ # --- 3. Performance: parallelized tokenizer ---
+ class ParallelOktTokenizer:
+     """A parallelized Okt tokenizer using joblib."""
+     def __init__(self, n_jobs: int = -1):
+         self.okt = Okt()
+         self.n_jobs = n_jobs
+ 
+     def __call__(self, text_series: pd.Series) -> List[List[str]]:
+         # joblib.Parallel runs the morphological analysis on several CPU
+         # cores at once; on large corpora this is the main speed-up.
+         return joblib.Parallel(n_jobs=self.n_jobs)(joblib.delayed(self._tokenize)(text) for text in text_series)
+ 
+     def _tokenize(self, text: str) -> List[str]:
+         """Extracts nouns and verbs from a single text."""
+         if not isinstance(text, str) or not text.strip():
+             return []
+         return [
+             word
+             for word, tag in self.okt.pos(text, stem=True)
+             if tag in ["Noun", "Verb"]
+         ]
+ 
+ # Module-level Okt instance so the JVM-backed tagger is created only once.
+ # Note: TfidfVectorizer calls its tokenizer once per document, whereas
+ # ParallelOktTokenizer consumes a whole Series at a time, so a plain
+ # per-document function is used below; see engineer_features for the
+ # pre-tokenization route that would make the parallel version usable.
+ _OKT = Okt()
+ 
+ def okt_tokenizer(text):
+     """Simple wrapper for Okt POS tagging (nouns and verbs)."""
+     if not isinstance(text, str) or not text.strip():
+         return []
+     return [word for word, tag in _OKT.pos(text, stem=True) if tag in ['Noun', 'Verb']]
  
  
- def ensure_files_exist(paths: List[Path]) -> None:
+ def ensure_files_exist(data_dir: Path, paths: Dict[str, str]) -> List[Path]:
      """Raise a helpful error if any expected data file is missing."""
-     missing = [str(path) for path in paths if not path.exists()]
+     full_paths = [data_dir / p for p in paths.values()]
+     missing = [str(path) for path in full_paths if not path.exists()]
      if missing:
-         raise FileNotFoundError(
-             "Missing required data files: " + ", ".join(missing)
-         )
+         raise FileNotFoundError(f"Missing required data files: {', '.join(missing)}")
+     return full_paths
  
- OKT = Okt()
- 
- def okt_tokenizer(text):
-     """Define tokenizer using Okt that extracts nouns and verbs."""
-     if not text.strip():
-         return []
-     # Extract nouns and verbs
-     return [word for word, tag in OKT.pos(text, stem=True) if tag in ['Noun', 'Verb']]
- 
  
- def load_datasets() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-     print("[1/6] Loading datasets...")
-     contents = pd.read_csv(CONTENTS_PATH)
-     metrics = pd.read_csv(METRICS_PATH)
-     demographics = pd.read_csv(DEMOGRAPHICS_PATH)
+ def load_datasets(data_dir: Path, paths: Dict[str, str]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+     logging.info("Loading datasets...")
+     contents_path = data_dir / paths["contents"]
+     metrics_path = data_dir / paths["metrics"]
+     demographics_path = data_dir / paths["demographics"]
+ 
+     ensure_files_exist(data_dir, paths)
+ 
+     contents = pd.read_csv(contents_path)
+     metrics = pd.read_csv(metrics_path)
+     demographics = pd.read_csv(demographics_path)
      return contents, metrics, demographics
  
  
- def aggregate_metrics(metrics: pd.DataFrame) -> pd.DataFrame:
-     print("[2/6] Aggregating article metrics...")
-     agg = (
-         metrics.groupby("article_id", as_index=False)[["views_total", "comments", "likes"]]
+ def preprocess_data(
+     contents: pd.DataFrame, metrics: pd.DataFrame, demographics: pd.DataFrame
+ ) -> pd.DataFrame:
+     logging.info("Preprocessing and merging datasets...")
+ 
+     # Aggregate metrics
+     metrics_agg = (
+         metrics.groupby("article_id")[["views_total", "comments", "likes"]]
          .sum()
+         .reset_index()
          .rename(columns={
              "views_total": "views_total",
              "comments": "comments_total",
              "likes": "likes_total",
          })
      )
-     return agg
- 
- 
- def identify_primary_audience(demographics: pd.DataFrame) -> pd.DataFrame:
-     print("[3/6] Identifying primary audience age groups...")
-     filtered = demographics[demographics["age_group"] != "전체"].copy()
-     if filtered.empty:
-         raise ValueError(
-             "No demographic records found after excluding '전체'."
-         )
-     filtered.sort_values(["article_id", "views"], ascending=[True, False], inplace=True)
-     idx = filtered.groupby("article_id")["views"].idxmax()
-     primary = (
-         filtered.loc[idx, ["article_id", "age_group"]]
+ 
+     # Identify primary audience
+     filtered_demo = demographics[demographics["age_group"] != "전체"].copy()
+     if filtered_demo.empty:
+         raise ValueError("No demographic records found after excluding '전체'.")
+     idx = filtered_demo.groupby("article_id")["views"].idxmax()
+     primary_audience = (
+         filtered_demo.loc[idx, ["article_id", "age_group"]]
          .rename(columns={"age_group": "primary_age_group"})
          .reset_index(drop=True)
      )
-     return primary
- 
  
- def build_master_dataframe(
-     contents: pd.DataFrame,
-     metrics_agg: pd.DataFrame,
-     primary_audience: pd.DataFrame,
- ) -> pd.DataFrame:
-     print("[4/6] Merging datasets...")
+     # Build master dataframe
      df_master = contents.merge(metrics_agg, on="article_id", how="left")
      df_master = df_master.merge(primary_audience, on="article_id", how="left")
  
-     # Replace missing numeric metrics with zeros for downstream processing.
-     for column in ["views_total", "comments_total", "likes_total"]:
-         if column in df_master.columns:
-             df_master[column] = df_master[column].fillna(0)
+     df_master[["views_total", "comments_total", "likes_total"]] = df_master[
+         ["views_total", "comments_total", "likes_total"]
+     ].fillna(0)
  
      return df_master
  
  
  def engineer_features(df_master: pd.DataFrame) -> tuple[csr_matrix, csr_matrix, TfidfVectorizer, OneHotEncoder]:
-     print("[5/6] Engineering features (text + category)...")
-     text_series = (
-         df_master["title"].fillna("") + " " + df_master["content"].fillna("")
-     ).str.strip()
+     logging.info("Engineering features (text + category)...")
+     text_series = (df_master["title"].fillna("") + " " + df_master["content"].fillna("")).str.strip()
  
+     # Note: the konlpy tokenizer is relatively slow, so a single-process
+     # tokenizer is used here. If the corpus grows large enough to need
+     # parallelism, tokenize the series first and then pass the token lists
+     # to TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x).
      vectorizer = TfidfVectorizer(
          tokenizer=okt_tokenizer,
-         max_features=5000,
+         max_features=CONFIG["feature_engineering"]["tfidf_max_features"],
          lowercase=False,
      )
      X_text = vectorizer.fit_transform(text_series)
-     X_text_csr = csr_matrix(X_text)
  
      category_series = df_master["category"].fillna("미분류")
-     onehot_encoder = OneHotEncoder(handle_unknown="ignore")
+     onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
      X_cat = onehot_encoder.fit_transform(category_series.to_frame())
  
-     X_combined = cast(csr_matrix, hstack([X_text_csr, X_cat]).tocsr())
-     return X_combined, X_text_csr, vectorizer, onehot_encoder
+     X_combined = cast(csr_matrix, hstack([X_text, X_cat]).tocsr())
+     return X_combined, X_text, vectorizer, onehot_encoder
  
  
  def prepare_targets(
-     df_master: pd.DataFrame,
-     X_combined: csr_matrix,
-     X_text: csr_matrix,
+     df_master: pd.DataFrame, X_combined: csr_matrix, X_text: csr_matrix
  ) -> tuple[csr_matrix, csr_matrix, np.ndarray, np.ndarray, LabelEncoder, pd.DataFrame]:
-     print("[6/6] Preparing targets and filtering valid samples...")
-     y_views = df_master["views_total"].fillna(0).to_numpy(dtype=np.float32)
+     logging.info("Preparing targets and filtering valid samples...")
+ 
+     # --- 4. Accuracy: log-transform the target ---
+     # Note: view counts are heavily right-skewed, so np.log1p is applied.
+     # The model predicts log-scale values that are restored later with
+     # np.expm1; log1p adds 1 first so zero views do not map to -inf.
+     y_views = np.log1p(df_master["views_total"].astype(np.float32))
+ 
      y_age = df_master["primary_age_group"]
  
      valid_mask = y_age.notna().to_numpy()
      if not valid_mask.any():
-         raise ValueError(
-             "No samples contain a primary audience label. Unable to train the classification model."
-         )
+         raise ValueError("No samples with a primary audience label found.")
  
      X_combined_valid = X_combined[valid_mask]
      X_text_valid = X_text[valid_mask]
@@ -156,7 +224,7 @@ def prepare_targets(
      y_age_valid = y_age[valid_mask].astype(str)
  
      label_encoder = LabelEncoder()
-     y_age_encoded = np.asarray(label_encoder.fit_transform(y_age_valid), dtype=np.int32)
+     y_age_encoded = label_encoder.fit_transform(y_age_valid)
  
      article_mapping = df_master.loc[valid_mask, ["article_id", "title"]].reset_index(drop=True)
  
@@ -169,153 +237,208 @@ def prepare_targets(
          article_mapping,
      )
  
+ # --- 5. Accuracy: hyperparameter tuning with Optuna ---
+ def tune_xgbregressor(X_train, y_train, X_valid, y_valid) -> Dict[str, Any]:
+     """Find best hyperparameters for XGBRegressor using Optuna."""
+     def objective(trial):
+         params = {
+             "objective": "reg:squarederror",
+             "tree_method": "hist",
+             "n_estimators": trial.suggest_int("n_estimators", 200, 1000, step=100),
+             "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
+             "max_depth": trial.suggest_int("max_depth", 4, 10),
+             "subsample": trial.suggest_float("subsample", 0.6, 1.0),
+             "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
+             "random_state": CONFIG["feature_engineering"]["random_state"],
+             "n_jobs": -1,
+         }
+         model = XGBRegressor(**params)
+         model.fit(
+             X_train, y_train,
+             eval_set=[(X_valid, y_valid)],
+             eval_metric="rmse",
+             callbacks=[optuna.integration.XGBoostPruningCallback(trial, "validation_0-rmse")],
+             verbose=False,
+         )
+         preds = model.predict(X_valid)
+         rmse = np.sqrt(mean_squared_error(y_valid, preds))
+         return rmse
+ 
+     study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
+     study.optimize(objective, n_trials=CONFIG["optuna"]["n_trials_reg"], timeout=600)
+     logging.info(f"Best trial for XGBRegressor: {study.best_trial.params} (RMSE: {study.best_value:.4f})")
+     return study.best_trial.params
+ 
+ 
+ def tune_xgbclassifier(X_train, y_train, X_valid, y_valid, num_classes) -> Dict[str, Any]:
+     """Find best hyperparameters for XGBClassifier using Optuna."""
+     def objective(trial):
+         params = {
+             "objective": "multi:softprob",
+             "num_class": num_classes,
+             "tree_method": "hist",
+             "eval_metric": "mlogloss",
+             "use_label_encoder": False,
+             "n_estimators": trial.suggest_int("n_estimators", 300, 1500, step=100),
+             "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
+             "max_depth": trial.suggest_int("max_depth", 4, 10),
+             "subsample": trial.suggest_float("subsample", 0.6, 1.0),
+             "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
+             "random_state": CONFIG["feature_engineering"]["random_state"],
+             "n_jobs": -1,
+         }
+         model = XGBClassifier(**params)
+         model.fit(
+             X_train, y_train,
+             eval_set=[(X_valid, y_valid)],
+             callbacks=[optuna.integration.XGBoostPruningCallback(trial, "validation_0-mlogloss")],
+             verbose=False,
+         )
+         return model.evals_result()["validation_0"]["mlogloss"][-1]
+ 
+     study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
+     study.optimize(objective, n_trials=CONFIG["optuna"]["n_trials_clf"], timeout=600)
+     logging.info(f"Best trial for XGBClassifier: {study.best_trial.params} (LogLoss: {study.best_value:.4f})")
+     return study.best_trial.params
+ 
  
  def train_models(
-     X_features: csr_matrix,
-     y_views: np.ndarray,
-     y_age_encoded: np.ndarray,
-     num_classes: int,
+     X_features: csr_matrix, y_views: np.ndarray, y_age_encoded: np.ndarray, num_classes: int
  ) -> tuple[XGBRegressor, XGBClassifier]:
-     print("Training XGBoost models with validation split...")
+     logging.info("Splitting data and training final models...")
  
      stratify_target = y_age_encoded if len(np.unique(y_age_encoded)) > 1 else None
  
-     (
-         X_train,
-         X_valid,
-         y_views_train,
-         y_views_valid,
-         y_age_train,
-         y_age_valid,
-     ) = train_test_split(
-         X_features,
-         y_views,
-         y_age_encoded,
-         test_size=0.2,
-         random_state=42,
+     X_train, X_valid, y_views_train, y_views_valid, y_age_train, y_age_valid = train_test_split(
+         X_features, y_views, y_age_encoded,
+         test_size=CONFIG["feature_engineering"]["test_size"],
+         random_state=CONFIG["feature_engineering"]["random_state"],
          stratify=stratify_target,
      )
  
-     view_model = XGBRegressor(
-         objective="reg:squarederror",
-         n_estimators=200,
-         learning_rate=0.1,
-         max_depth=6,
-         subsample=0.8,
-         colsample_bytree=0.8,
-         random_state=42,
-         tree_method="hist",
-         n_jobs=-1,
-     )
-     view_model.fit(X_train, y_views_train)
+     # Hyperparameter tuning
+     logging.info("--- Starting Hyperparameter Tuning ---")
+     best_reg_params = tune_xgbregressor(X_train, y_views_train, X_valid, y_views_valid)
+     best_clf_params = tune_xgbclassifier(X_train, y_age_train, X_valid, y_age_valid, num_classes)
+     logging.info("--- Hyperparameter Tuning Finished ---")
+ 
+     # Fit with the best parameters on the training split first, so the
+     # hold-out metrics below are measured before the final full-data refit
+     # (evaluating after refitting on everything would score on seen data).
+     view_model = XGBRegressor(
+         objective="reg:squarederror",
+         tree_method="hist",
+         n_jobs=-1,
+         random_state=CONFIG["feature_engineering"]["random_state"],
+         **best_reg_params,
+     )
+     view_model.fit(X_train, y_views_train)
  
      age_model = XGBClassifier(
          objective="multi:softprob",
          num_class=num_classes,
-         n_estimators=300,
-         learning_rate=0.1,
-         max_depth=6,
-         subsample=0.8,
-         colsample_bytree=0.8,
-         random_state=42,
-         tree_method="hist",
-         n_jobs=-1,
-         eval_metric="mlogloss",
          use_label_encoder=False,
+         eval_metric="mlogloss",
+         tree_method="hist",
+         n_jobs=-1,
+         random_state=CONFIG["feature_engineering"]["random_state"],
+         **best_clf_params,
      )
      age_model.fit(X_train, y_age_train)
  
-     if X_valid.shape[0] > 0:
-         view_pred = view_model.predict(X_valid)
-         mae = mean_absolute_error(y_views_valid, view_pred)
-         age_pred = age_model.predict(X_valid)
-         acc = accuracy_score(y_age_valid, age_pred)
-         print(f" - Validation MAE (views): {mae:,.2f}")
-         print(f" - Validation Accuracy (audience): {acc:.4f}")
+     # Hold-out evaluation (log-scale view predictions restored with np.expm1)
+     view_pred_original = np.expm1(view_model.predict(X_valid))
+     y_views_valid_original = np.expm1(y_views_valid)
+     mae = mean_absolute_error(y_views_valid_original, view_pred_original)
+ 
+     age_pred = age_model.predict(X_valid)
+     acc = accuracy_score(y_age_valid, age_pred)
+ 
+     logging.info(f"Final Validation MAE (views): {mae:,.2f}")
+     logging.info(f"Final Validation Accuracy (audience): {acc:.4f}")
  
-     # Refit on the full dataset to maximise performance for saved artifacts.
-     view_model.fit(X_features, y_views)
+     # Refit on the full dataset for the saved artifacts.
+     view_model.fit(X_features, y_views)
      age_model.fit(X_features, y_age_encoded)
  
      return view_model, age_model
  
  
- def save_artifacts(
-     vectorizer: TfidfVectorizer,
-     onehot_encoder: OneHotEncoder,
-     label_encoder: LabelEncoder,
-     view_model: XGBRegressor,
-     age_model: XGBClassifier,
-     text_features: csr_matrix,
-     article_mapping: pd.DataFrame,
- ) -> None:
-     print("Saving artifacts...")
- 
-     joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
-     print("- Saved tfidf_vectorizer.pkl")
- 
-     joblib.dump(onehot_encoder, "onehot_encoder.pkl")
-     print("- Saved onehot_encoder.pkl")
- 
-     joblib.dump(label_encoder, "label_encoder.pkl")
-     print("- Saved label_encoder.pkl")
- 
-     joblib.dump(view_model, "view_prediction_model.pkl")
-     print("- Saved view_prediction_model.pkl")
- 
-     joblib.dump(age_model, "age_prediction_model.pkl")
-     print("- Saved age_prediction_model.pkl")
- 
-     joblib.dump(text_features, "text_features_matrix.pkl")
-     print("- Saved text_features_matrix.pkl")
- 
-     joblib.dump(article_mapping, "article_mapping.pkl")
-     print("- Saved article_mapping.pkl")
+ def save_artifacts(artifacts: Dict[str, Any], artifact_paths: Dict[str, str]) -> None:
+     logging.info("Saving artifacts...")
+     for name, obj in artifacts.items():
+         path = artifact_paths[name]
+         joblib.dump(obj, path)
+         logging.info(f"- Saved {path}")
+ 
+ 
+ # --- 6. New feature: similar-article search ---
+ def find_similar_articles(
+     article_id: str,
+     text_features: csr_matrix,
+     mapping_df: pd.DataFrame,
+     top_n: int = 5,
+ ) -> pd.DataFrame:
+     """Finds top_n similar articles for a given article_id."""
+     if article_id not in mapping_df["article_id"].values:
+         raise ValueError(f"Article ID {article_id} not found in the mapping.")
+ 
+     # Get the index of the source article
+     source_idx = mapping_df[mapping_df["article_id"] == article_id].index[0]
+     source_vector = text_features[source_idx]
+ 
+     # Compute cosine similarity against all articles
+     similarities = cosine_similarity(source_vector, text_features)[0]
+ 
+     # Exclude the source article itself, then take the top_n most similar
+     similarities[source_idx] = -1.0
+     similar_indices = similarities.argsort()[::-1][:top_n]
+     similar_scores = similarities[similar_indices]
+ 
+     result_df = mapping_df.iloc[similar_indices].copy()
+     result_df["similarity"] = similar_scores
+ 
+     logging.info(f"\n--- Top {top_n} similar articles to '{mapping_df.iloc[source_idx]['title']}' ---")
+     logging.info(result_df)
+     return result_df
  
  
  def main() -> None:
-     np.random.seed(42)
- 
-     ensure_files_exist([CONTENTS_PATH, METRICS_PATH, DEMOGRAPHICS_PATH])
- 
-     contents, metrics, demographics = load_datasets()
-     metrics_agg = aggregate_metrics(metrics)
-     primary_audience = identify_primary_audience(demographics)
-     df_master = build_master_dataframe(contents, metrics_agg, primary_audience)
+     """Main execution pipeline."""
+     np.random.seed(CONFIG["feature_engineering"]["random_state"])
  
+     # Load and process data
+     contents, metrics, demographics = load_datasets(CONFIG["data_dir"], CONFIG["paths"])
+     df_master = preprocess_data(contents, metrics, demographics)
+ 
+     # Feature engineering
      X_combined, X_text, vectorizer, onehot_encoder = engineer_features(df_master)
+ 
+     # Prepare targets and filter
      (
          X_features,
          X_text_filtered,
-         y_views,
+         y_views_log,
          y_age_encoded,
          label_encoder,
          article_mapping,
      ) = prepare_targets(df_master, X_combined, X_text)
  
+     # Train models
      view_model, age_model = train_models(
-         X_features,
-         y_views,
-         y_age_encoded,
-         num_classes=len(label_encoder.classes_),
-     )
- 
-     save_artifacts(
-         vectorizer,
-         onehot_encoder,
-         label_encoder,
-         view_model,
-         age_model,
-         X_text_filtered,
-         article_mapping,
+         X_features, y_views_log, y_age_encoded, num_classes=len(label_encoder.classes_)
      )
  
-     print("All artifacts saved successfully.")
+     # Save all artifacts
+     artifacts_to_save = {
+         "vectorizer": vectorizer,
+         "onehot_encoder": onehot_encoder,
+         "label_encoder": label_encoder,
+         "view_model": view_model,
+         "age_model": age_model,
+         "text_features": X_text_filtered,
+         "article_mapping": article_mapping,
+     }
+     save_artifacts(artifacts_to_save, CONFIG["artifacts"])
+     logging.info("All artifacts saved successfully.")
+ 
+     # Demonstrate similar article search
+     if not article_mapping.empty:
+         sample_article_id = article_mapping.iloc[0]["article_id"]
+         find_similar_articles(sample_article_id, X_text_filtered, article_mapping)
  
  
  if __name__ == "__main__":
      try:
          main()
-     except Exception as exc:  # pragma: no cover - top-level execution guard.
-         print(f"Error: {exc}", file=sys.stderr)
-         raise
+     except Exception as exc:
+         logging.error(f"An error occurred: {exc}", exc_info=True)
+         raise
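
For reference, the pre-tokenization route mentioned in the engineer_features comments (tokenize the corpus once, then hand TfidfVectorizer identity tokenizer/preprocessor functions) would look roughly like the sketch below. This is a minimal illustration assuming the same noun/verb Okt tokenization as okt_tokenizer; the two sample documents are invented.

from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer

okt = Okt()

def tokenize(text: str) -> list:
    """Same noun/verb extraction as okt_tokenizer in the script above."""
    if not isinstance(text, str) or not text.strip():
        return []
    return [word for word, tag in okt.pos(text, stem=True) if tag in ['Noun', 'Verb']]

docs = ["숏폼 시대의 뉴스 소비", "알고리즘과 저널리즘의 미래"]  # illustrative inputs
pre_tokenized = [tokenize(doc) for doc in docs]  # could also use ParallelOktTokenizer on a Series

# Identity tokenizer/preprocessor: the vectorizer receives the token lists
# as-is, so the slow morphological analysis runs only once, outside TF-IDF.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, lowercase=False)
X = vectorizer.fit_transform(pre_tokenized)
print(X.shape)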