Broadcast_paper / data_structure_analysis.py
Choi jun hyeok
Deploy Flask app to HF Space
d4a3b8b
"""
์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ๊ตฌ์กฐ ๋ถ„์„ ์Šคํฌ๋ฆฝํŠธ
"""
import pandas as pd
import os
# ๋ฐ์ดํ„ฐ ํด๋” ๊ฒฝ๋กœ
data_dir = r'c:\Users\korea\Desktop\dacon_broadcast_paper\data'
# ๊ฐ ํŒŒ์ผ ๋ถ„์„
files = [
'article_metrics_monthly.xlsx',
'contents.xlsx',
'demographics_part001.xlsx',
'demographics_part002.xlsx',
'referrer.xlsx'
]
print("=" * 80)
print("์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ๊ตฌ์กฐ ๋ถ„์„")
print("=" * 80)
for file in files:
file_path = os.path.join(data_dir, file)
print(f"\n{'='*80}")
print(f"ํŒŒ์ผ๋ช…: {file}")
print(f"{'='*80}")
try:
# Excel ํŒŒ์ผ ์ฝ๊ธฐ
df = pd.read_excel(file_path)
# ๊ธฐ๋ณธ ์ •๋ณด
print(f"\n[๊ธฐ๋ณธ ์ •๋ณด]")
print(f"ํ–‰ ๊ฐœ์ˆ˜: {len(df):,}")
print(f"์—ด ๊ฐœ์ˆ˜: {len(df.columns)}")
print(f"์ „์ฒด ํฌ๊ธฐ: {df.shape}")
# ์ปฌ๋Ÿผ ์ •๋ณด
print(f"\n[์ปฌ๋Ÿผ ๋ชฉ๋ก ๋ฐ ๋ฐ์ดํ„ฐ ํƒ€์ž…]")
for idx, (col, dtype) in enumerate(zip(df.columns, df.dtypes), 1):
non_null = df[col].notna().sum()
null_count = df[col].isna().sum()
null_pct = (null_count / len(df)) * 100
print(f"{idx:2d}. {col:40s} | Type: {str(dtype):15s} | Non-Null: {non_null:,} | Null: {null_count:,} ({null_pct:.1f}%)")
# ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ (์ฒ˜์Œ 3ํ–‰)
print(f"\n[์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ (์ฒ˜์Œ 3ํ–‰)]")
print(df.head(3).to_string())
# ๋ฉ”๋ชจ๋ฆฌ ์‚ฌ์šฉ๋Ÿ‰
memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
print(f"\n[๋ฉ”๋ชจ๋ฆฌ ์‚ฌ์šฉ๋Ÿ‰]: {memory_mb:.2f} MB")
except Exception as e:
print(f"์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
print("\n" + "=" * 80)
print("๋ถ„์„ ์™„๋ฃŒ!")
print("=" * 80)