Spaces:
Sleeping
Sleeping
| """ | |
| Script that analyzes the structure of the Newspaper & Broadcasting reader data files | |
| """ | |
| import pandas as pd | |
| import os | |
# Directory that holds the Excel workbooks to inspect.
# NOTE(review): hard-coded absolute Windows path; the script only finds the
# files on a machine where this folder exists.
data_dir = r'c:\Users\korea\Desktop\dacon_broadcast_paper\data'

# Workbooks to analyze, given relative to data_dir.
files = [
    'article_metrics_monthly.xlsx',
    'contents.xlsx',
    'demographics_part001.xlsx',
    'demographics_part002.xlsx',
    'referrer.xlsx',
]

# NOTE(review): the non-ASCII text in the printed strings below looks
# mis-encoded (mojibake). It is reproduced exactly as found so the script's
# output is unchanged; restore it from the original source if available.


def summarize_columns(df):
    """Return one formatted report line per column of *df*.

    Each line holds the 1-based column position, the column label, its dtype,
    and the non-null / null counts with the null percentage.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        A list of strings, one per column, in column order.
    """
    total_rows = len(df)
    # One pass over the whole frame; iterating the result positionally also
    # keeps duplicate column labels from being looked up by name.
    null_counts = [int(n) for n in df.isna().sum()]

    lines = []
    for idx, (col, dtype, null_count) in enumerate(
        zip(df.columns, df.dtypes, null_counts), 1
    ):
        non_null = total_rows - null_count
        # An empty sheet has no rows; report 0.0% instead of dividing by zero.
        null_pct = (null_count / total_rows) * 100 if total_rows else 0.0
        # str(col): column labels read from Excel may be ints or other
        # non-string objects, which reject the "40s" format spec.
        lines.append(
            f"{idx:2d}. {str(col):40s} | Type: {str(dtype):15s} | "
            f"Non-Null: {non_null:,} | Null: {null_count:,} ({null_pct:.1f}%)"
        )
    return lines


def analyze_file(file_path):
    """Read one Excel workbook and print its structural report.

    Prints the row/column counts and shape, the per-column summary, the first
    three rows, and the deep memory usage in MB.

    Args:
        file_path: Path of the workbook passed to ``pandas.read_excel``.

    Raises:
        Exception: Whatever ``pandas.read_excel`` raises (missing file,
            unreadable workbook, ...) propagates to the caller.
    """
    # Read the Excel file (first sheet, pandas defaults).
    df = pd.read_excel(file_path)

    # Basic information.
    print("\n[๊ธฐ๋ณธ ์ ๋ณด]")
    print(f"ํ ๊ฐ์: {len(df):,}")
    print(f"์ด ๊ฐ์: {len(df.columns)}")
    print(f"์ ์ฒด ํฌ๊ธฐ: {df.shape}")

    # Column list and data types.
    print("\n[์ปฌ๋ผ ๋ชฉ๋ก ๋ฐ ๋ฐ์ดํฐ ํ์ ]")
    for line in summarize_columns(df):
        print(line)

    # Sample data (first 3 rows).
    print("\n[์ํ ๋ฐ์ดํฐ (์ฒ์ 3ํ)]")
    print(df.head(3).to_string())

    # Memory usage; deep=True also counts the contents of object columns.
    memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    print(f"\n[๋ฉ๋ชจ๋ฆฌ ์ฌ์ฉ๋]: {memory_mb:.2f} MB")


print("=" * 80)
print("์ ๋ฌธ๊ณผ๋ฐฉ์ก ๋ ์ ๋ฐ์ดํฐ ๊ตฌ์กฐ ๋ถ์")
print("=" * 80)

for file in files:
    file_path = os.path.join(data_dir, file)
    print(f"\n{'='*80}")
    print(f"ํ์ผ๋ช : {file}")
    print(f"{'='*80}")
    try:
        analyze_file(file_path)
    except Exception as e:
        # Best-effort report: a failure on one workbook is printed and the
        # loop moves on to the next file instead of aborting the run.
        print(f"์ค๋ฅ ๋ฐ์: {str(e)}")

print("\n" + "=" * 80)
print("๋ถ์ ์๋ฃ!")
print("=" * 80)