Choi jun hyeok committed · Commit d4a3b8b
1 Parent(s): 1003766
Deploy Flask app to HF Space
Files changed:
- .env +1 -0
- .gitattributes +2 -0
- Dockerfile +33 -0
- EDA_analysis.html +0 -0
- README.md +134 -10
- age_prediction_model.pkl +3 -0
- app.py +432 -0
- article_mapping.pkl +3 -0
- convert_to_csv.py +83 -0
- data/article_metrics_monthly.xlsx +3 -0
- data/contents.xlsx +3 -0
- data/demographics_part001.xlsx +3 -0
- data/demographics_part002.xlsx +3 -0
- data/referrer.xlsx +3 -0
- data_csv/article_metrics_monthly.csv +3 -0
- data_csv/contents.csv +3 -0
- data_csv/demographics_merged.csv +3 -0
- data_csv/demographics_part001.csv +3 -0
- data_csv/demographics_part002.csv +3 -0
- data_csv/referrer.csv +3 -0
- data_structure_analysis.py +60 -0
- index.html +581 -0
- label_encoder.pkl +3 -0
- onehot_encoder.pkl +3 -0
- requirements.txt +11 -0
- text_features_matrix.pkl +3 -0
- tfidf_vectorizer.pkl +3 -0
- train_and_save_models.py +321 -0
- view_prediction_model.pkl +3 -0
- wsgi.py +9 -0
.env
ADDED
@@ -0,0 +1 @@
+GEMINI_API_KEY = AIzaSyDUK2u7PljRBqzj9lhM6Ydxm4-SdxCU-vY
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/*.xlsx filter=lfs diff=lfs merge=lfs -text
+data_csv/*.csv filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,33 @@
+FROM python:3.10-slim
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# System packages required for KoNLPy (Java) and MeCab tokenizer
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        default-jdk \
+        mecab \
+        libmecab-dev \
+        mecab-ipadic-utf8 \
+        python3-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python dependencies
+COPY requirements.txt ./
+RUN pip install --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt
+
+# Copy application source and artifacts
+COPY . .
+
+ENV PYTHONPATH=/app
+ENV PORT=7860
+
+EXPOSE 7860
+
+# Gunicorn respects the PORT provided by Hugging Face Spaces.
+CMD ["sh", "-c", "gunicorn --bind 0.0.0.0:$PORT --workers ${GUNICORN_WORKERS:-2} wsgi:application"]
EDA_analysis.html
ADDED
The diff for this file is too large to render. See raw diff.
README.md
CHANGED
@@ -1,10 +1,134 @@
-
-
-
-
-
-
-
-
-
-
+# Dacon Broadcast Article Performance Predictor
+
+This project hosts a Flask web application that predicts article performance and provides AI-powered SEO recommendations.
+
+## Local development
+
+1. Create a virtual environment and install dependencies.
+   ```powershell
+   python -m venv .venv
+   .\.venv\Scripts\Activate.ps1
+   pip install -r requirements.txt
+   ```
+2. Ensure the model artifacts are generated:
+   ```powershell
+   .\.venv\Scripts\python.exe train_and_save_models.py
+   ```
+3. Add your Google Generative AI key to a `.env` file:
+   ```ini
+   GEMINI_API_KEY=your-api-key
+   ```
+4. Run the development server:
+   ```powershell
+   .\.venv\Scripts\python.exe app.py
+   ```
+
+## Production deployment (Gunicorn + Nginx)
+
+1. **Copy the project to the server** (e.g., `/srv/dacon_broadcast_paper`).
+2. **Create a virtual environment** and install the requirements as above.
+3. **Generate the artifacts** on the server or copy them from the local build.
+4. **Configure environment variables**:
+   ```bash
+   echo "GEMINI_API_KEY=your-api-key" | sudo tee /etc/dacon_app.env
+   ```
+5. **Test Gunicorn manually**:
+   ```bash
+   cd /srv/dacon_broadcast_paper
+   source .venv/bin/activate
+   gunicorn --bind 127.0.0.1:8000 --workers 3 --timeout 120 wsgi:application
+   ```
+
+### systemd service
+
+Use `deploy/dacon_app.service` as a template:
+```bash
+sudo cp deploy/dacon_app.service /etc/systemd/system/dacon_app.service
+sudo systemctl daemon-reload
+sudo systemctl enable dacon_app
+sudo systemctl start dacon_app
+sudo systemctl status dacon_app
+```
+Adjust the `WorkingDirectory`, `ExecStart`, and `Environment` entries to match your server paths, or reference `/etc/dacon_app.env` with `EnvironmentFile=` if preferred.
+
+### Nginx reverse proxy
+
+1. Install Nginx (`sudo apt install nginx`).
+2. Copy the provided config:
+   ```bash
+   sudo cp deploy/dacon_app.nginx.conf /etc/nginx/sites-available/dacon_app
+   sudo ln -s /etc/nginx/sites-available/dacon_app /etc/nginx/sites-enabled/
+   sudo nginx -t
+   sudo systemctl reload nginx
+   ```
+3. Update `server_name` and any path aliases before reloading.
+4. (Optional) Enable HTTPS via Certbot:
+   ```bash
+   sudo apt install certbot python3-certbot-nginx
+   sudo certbot --nginx -d your-domain.com
+   ```
+
+### Firewall and health checks
+
+- Open ports 80/443 via `ufw` or your cloud provider's security group.
+- Use the `/healthz` endpoint for health monitoring.
+- Logs:
+  - Application: `journalctl -u dacon_app`
+  - Nginx: `/var/log/nginx/access.log`, `/var/log/nginx/error.log`
+
+## File overview
+
+- `app.py` – Flask application with prediction and SEO endpoints.
+- `wsgi.py` – WSGI entrypoint for production servers.
+- `deploy/dacon_app.service` – sample systemd unit for Gunicorn.
+- `deploy/dacon_app.nginx.conf` – sample Nginx reverse proxy configuration.
+- `train_and_save_models.py` – pipeline that creates the required artifacts.
+- `data_csv/` – CSV inputs used by the app.
+
+## Troubleshooting
+
+- If Gunicorn crashes, check for missing artifacts under `artifacts/`.
+- Ensure the `.env` file or environment variables include `GEMINI_API_KEY`.
+- Increase `client_max_body_size` in Nginx if large payloads are expected.
+- For Windows hosting, consider running Gunicorn/Nginx via WSL2 or using IIS + FastCGI with `wsgi.py`.
+
+## Hugging Face Spaces deployment (Docker Space)
+
+Hugging Face Spaces supports custom web apps through Docker. Use the provided `Dockerfile` to containerize the app and expose it via Gunicorn.
+
+1. **Prepare the repository**
+   - Ensure all required artifacts (`*.pkl`) and the `data_csv/` folder are committed (Spaces pulls the repo directly).
+   - Keep individual files under 1 GB (the Spaces limit); use Git LFS for large artifacts if needed.
+
+2. **Create a new Space**
+   - On Hugging Face, click **Create Space** → type `Docker` → name it (e.g., `username/dacon-predictor`).
+   - Leave the hardware at the default unless more RAM is required (~16 GB recommended because of the NLP dependencies).
+
+3. **Push the code**
+   - Initialize the Space as a Git remote locally:
+     ```bash
+     huggingface-cli repo create username/dacon-predictor --type=space --space-sdk=docker
+     git remote add space https://huggingface.co/spaces/username/dacon-predictor
+     git push space main
+     ```
+   - Alternatively, clone the empty Space repo and copy the project files into it before pushing.
+
+4. **Secrets & configuration**
+   - In the Space settings, add a secret named `GEMINI_API_KEY` with your Google Generative AI key.
+   - Optional: set `GUNICORN_WORKERS` to tune concurrency.
+
+5. **Container build**
+   - Spaces builds the `Dockerfile`. It installs the system deps (OpenJDK, MeCab) and Python requirements, then launches Gunicorn bound to `$PORT` (HF uses port 7860 by default).
+   - The app serves `index.html` via Flask, so no additional frontend wiring is required.
+
+6. **Testing & monitoring**
+   - Once the build finishes, open the Space URL to verify predictions and SEO generation.
+   - Check the Space logs (Settings → Logs) for build/runtime issues, especially MeCab/Java errors.
+
+### Space-specific tips
+
+- **Cold start latency**: Spaces sleep when idle; the first request may take longer while the model artifacts load.
+- **Resource usage**: If memory spikes occur (pandas + scikit-learn + MeCab), upgrade to a larger hardware tier.
+- **Background tasks**: This setup serves HTTP requests only; long-running offline jobs should be run outside Spaces.
+- **Security**: Secrets set in the HF UI aren't exposed in the repo. Avoid committing `.env` with real keys.
+- **Custom domains**: Hugging Face supports domain mapping on paid tiers if you need branding.
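For a quick end-to-end check of the deployment described in this README, a minimal smoke-test sketch in Python (the base URL and article text are placeholders; the endpoint paths and payload fields come from `app.py` in this commit):

```python
"""Minimal smoke-test client sketch for the deployed app.

Assumptions: the server is reachable at BASE_URL (your Space URL, or
http://127.0.0.1:5000 for a local `python app.py` run); endpoints and
response fields are taken from app.py in this commit.
"""
import requests  # third-party: pip install requests

BASE_URL = "http://127.0.0.1:5000"  # replace with your Space URL

# Health check -- expected response: {"status": "ok"}
print(requests.get(f"{BASE_URL}/healthz", timeout=30).json())

# Use a real category from the fitted encoder instead of guessing one.
categories = requests.get(f"{BASE_URL}/categories", timeout=30).json()["categories"]

payload = {
    "title": "AI 기술, 언론 제작 방식 대전환",  # sample title from index.html
    "content": "생성형 AI가 뉴스룸의 제작 방식을 바꾸고 있다.",  # placeholder body
    "category": categories[0],
}
resp = requests.post(f"{BASE_URL}/predict", json=payload, timeout=120)
resp.raise_for_status()
result = resp.json()
print(result["ai_prediction"]["predicted_views"])
print(result["ai_prediction"]["predicted_age_group"])
print(result["seo_simulation"]["recommended_state"]["title"])
```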
age_prediction_model.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b88f49ca67c9b539dd935b49ae451a4a1524765ebfec865d7ec594e8ebcb6da6
+size 4373490
app.py
ADDED
@@ -0,0 +1,432 @@
+"""Flask backend for the "신문과방송" article performance prediction web app.
+
+This server exposes prediction and metadata endpoints that rely on the
+pre-trained artifacts produced during the offline training pipeline.
+"""
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union, cast
+
+from flask import Flask, jsonify, request, send_from_directory  # type: ignore[import]
+from konlpy.tag import Okt
+
+import joblib  # type: ignore[import]
+import numpy as np
+from scipy.sparse import csr_matrix, hstack
+from sklearn.metrics.pairwise import cosine_similarity  # type: ignore[import]
+
+from dotenv import load_dotenv
+import google.generativeai as genai
+
+# Optional dependency: pandas is only required for category input handling.
+try:
+    import pandas as pd
+except ImportError:  # pragma: no cover - pandas should be available, but we guard just in case.
+    pd = None  # type: ignore
+
+
+if pd is None:
+    raise RuntimeError(
+        "pandas is required for this application. Please install pandas in the runtime environment."
+    )
+
+
+load_dotenv()
+
+API_KEY = os.getenv("GEMINI_API_KEY")
+if not API_KEY:
+    raise RuntimeError("GEMINI_API_KEY is not set. Please define it in your .env file.")
+
+genai.configure(api_key=API_KEY)  # type: ignore[attr-defined]
+SEO_MODEL_NAME = "gemma-3-27b-it"
+SEO_GENERATIVE_MODEL = genai.GenerativeModel(SEO_MODEL_NAME)  # type: ignore[attr-defined]
+
+BASE_DIR = Path(__file__).resolve().parent
+ARTIFACT_DIR = BASE_DIR / "artifacts"
+DATA_DIR = BASE_DIR / "data_csv"
+CONTENTS_CSV = DATA_DIR / "contents.csv"
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+# Initialise the Okt tagger once, globally.
+OKT = Okt()
+
+def okt_tokenizer(text: str) -> list[str]:
+    """
+    Tokenize text using Okt to extract nouns and verbs.
+    This function must be defined in the same way as in the training script
+    for the TfidfVectorizer to be loaded correctly.
+    """
+    if not isinstance(text, str) or not text.strip():
+        return []
+    # `stem=True` restores words to their base form (e.g., '달렸다' -> '달리다').
+    # This must match the training script exactly.
+    return [
+        word
+        for word, tag in OKT.pos(text, stem=True)
+        if tag in ["Noun", "Verb"]
+    ]
+
+def _resolve_artifact_path(filename: str) -> Path:
+    """Return the most likely path for a persisted artifact.
+
+    The training pipeline may save artifacts either in the project root or in an
+    `artifacts/` sub-directory. We attempt both locations for convenience and to
+    provide a clear error if the file cannot be found.
+    """
+    direct_path = BASE_DIR / filename
+    if direct_path.exists():
+        return direct_path
+
+    artifacts_path = ARTIFACT_DIR / filename
+    if artifacts_path.exists():
+        return artifacts_path
+
+    search_locations = [str(direct_path), str(artifacts_path)]
+    raise FileNotFoundError(
+        f"Artifact '{filename}' could not be located. Looked in: {search_locations}"
+    )
+
+
+def _load_artifact(filename: str) -> Any:
+    """Load a pickled artifact using joblib with helpful error messaging."""
+    path = _resolve_artifact_path(filename)
+    return joblib.load(path)
+
+
+app = Flask(__name__, static_folder=".", template_folder=".")
+
+# --- Artifact Loading -------------------------------------------------------------------------
+try:
+    tfidf_vectorizer = _load_artifact("tfidf_vectorizer.pkl")
+    onehot_encoder = _load_artifact("onehot_encoder.pkl")
+    label_encoder = _load_artifact("label_encoder.pkl")
+    view_prediction_model = _load_artifact("view_prediction_model.pkl")
+    age_prediction_model = _load_artifact("age_prediction_model.pkl")
+    text_features_matrix = _load_artifact("text_features_matrix.pkl")
+    article_mapping = _load_artifact("article_mapping.pkl")
+except FileNotFoundError as exc:  # pragma: no cover - occurs only if artifacts missing.
+    # Fail fast during startup so the issue can be resolved immediately.
+    raise RuntimeError(
+        "Required model artifacts are missing. Ensure Phase 1-2 outputs are saved before "
+        "starting the server."
+    ) from exc
+
+
+if not isinstance(text_features_matrix, csr_matrix):
+    # Convert any compatible sparse matrix to CSR format for efficient row slicing.
+    text_features_matrix = csr_matrix(text_features_matrix)
+
+try:
+    contents_dataframe = pd.read_csv(CONTENTS_CSV)
+except FileNotFoundError as exc:
+    raise RuntimeError(
+        f"Required contents dataset not found at {CONTENTS_CSV}."
+    ) from exc
+
+article_content_lookup: Dict[Any, str] = {}
+if "article_id" in contents_dataframe.columns and "content" in contents_dataframe.columns:
+    article_content_lookup = {
+        str(row.article_id): (row.content if isinstance(row.content, str) else "")
+        for row in contents_dataframe.itertuples()
+    }
+else:  # pragma: no cover - dataset schema mismatch
+    raise RuntimeError(
+        "contents.csv must contain 'article_id' and 'content' columns."
+    )
+
+
+_encoded_categories: List[str]
+try:
+    categories_arr = getattr(onehot_encoder, "categories_", None)
+    if categories_arr:
+        _encoded_categories = sorted(str(cat) for cat in categories_arr[0])
+    else:
+        _encoded_categories = []
+except AttributeError:  # pragma: no cover
+    _encoded_categories = []
+
+
+def _ensure_dataframe(category: str) -> Any:
+    """Create a minimal single-row DataFrame for category encoding.
+
+    Uses pandas when it is available, otherwise raises a clear ImportError
+    with remediation guidance.
+    """
+    if pd is None:
+        raise ImportError(
+            "pandas is required to prepare categorical inputs. Please install pandas or "
+            "ensure the training environment's dependencies are mirrored here."
+        )
+    return pd.DataFrame({"category": [category]})
+
+
+def _lookup_article_metadata(index: int) -> Dict[str, Any]:
+    """Return metadata for a given row index in the text feature matrix.
+
+    During Phase 2 the pipeline should persist either a pandas DataFrame or a
+    dictionary-like mapping that provides `article_id` and `title` fields. This
+    helper normalises the structure so the API can rely on consistent keys.
+    """
+    if isinstance(article_mapping, dict):
+        entry = article_mapping.get(index)
+        if entry is None:
+            entry = article_mapping.get(str(index))
+        if isinstance(entry, dict):
+            return {
+                "id": entry.get("article_id") or entry.get("id"),
+                "title": entry.get("title") or entry.get("article_title"),
+            }
+        if isinstance(entry, (list, tuple)) and entry:
+            entry_seq = cast(Sequence[Any], entry)
+            article_id = entry_seq[0]
+            article_title = entry_seq[1] if len(entry_seq) > 1 else entry_seq[0]
+            return {"id": article_id, "title": article_title}
+
+    mapping_obj = cast(Any, article_mapping)
+    if hasattr(mapping_obj, "iloc"):
+        # Supports pandas DataFrame or Series-like objects with iloc.
+        try:
+            row = mapping_obj.iloc[index]
+        except Exception:  # pragma: no cover - defensive guard.
+            row = None
+        if row is not None:
+            if hasattr(row, "to_dict"):
+                row_dict = row.to_dict()
+                return {
+                    "id": row_dict.get("article_id") or row_dict.get("id"),
+                    "title": row_dict.get("title") or row_dict.get("article_title"),
+                }
+
+    # Fallback: surface the index so downstream consumers can still show something.
+    return {"id": int(index), "title": f"Article #{index}"}
+
+
+def _find_similar_articles(query_vector: csr_matrix, top_k: int = 5) -> List[Dict[str, Any]]:
+    """Return the top-k most similar articles for the provided query vector."""
+    similarities = cosine_similarity(query_vector, text_features_matrix).ravel()
+
+    # Sort indices by descending similarity.
+    ranked_indices = np.argsort(similarities)[::-1]
+
+    similar_articles: List[Dict[str, Any]] = []
+    for idx in ranked_indices:
+        score = float(similarities[idx])
+
+        # Skip near-identical matches (useful if querying an existing article).
+        if score >= 0.9999 and not similar_articles:
+            continue
+
+        metadata = _lookup_article_metadata(int(idx))
+        metadata.update({"similarity": round(score, 4)})
+        similar_articles.append(metadata)
+        if len(similar_articles) >= top_k:
+            break
+
+    return similar_articles
+
+
+def _extract_first_sentence(text: str) -> str:
+    """Return the first sentence-like fragment from the provided text."""
+    if not text:
+        return ""
+    cleaned = re.sub(r"\s+", " ", text).strip()
+    if not cleaned:
+        return ""
+    sentence_endings = re.split(r"(?<=[.!?？！。])\s+", cleaned)
+    for fragment in sentence_endings:
+        fragment = fragment.strip()
+        if fragment:
+            return fragment
+    return cleaned[:80]
+
+
+def generate_seo_suggestions(content: str) -> Dict[str, str]:
+    """Generate SEO title and description using Google Gemini."""
+    safe_content = content or ""
+    safe_content = re.sub(r"\s+", " ", safe_content).strip()
+
+    prompt = (
+        "You are a lead digital editor for a prestigious Korean online media company that bridges in-depth analysis with current trends. "
+        "Your mission is to craft an SEO title and description that are both intelligent and highly shareable. The goal is to highlight the article's most timely, newsworthy, and debate-sparking elements to maximize public interest and social engagement.\n\n"
+        "Guidelines:\n"
+        "1. **'title' (under 60 characters):** Frame the core topic as a compelling thesis or a provocative question. Connect it to a current conversation or a surprising trend to make it feel urgent and relevant *today*. It should make people think, 'This is an interesting take.'\n"
+        "2. **'description' (under 150 characters, in Korean):** Go beyond summary. Contextualize the article's importance. Explain *why* this topic matters *now* and what new perspective the article offers on a familiar issue. It should persuade readers that this article will give them a crucial viewpoint for today's conversations.\n"
+        "3. **Format:** Respond strictly with a valid JSON object with 'title' and 'description' keys. Avoid generic phrases, clickbait, and anything that undermines the intellectual integrity of the brand.\n\n"
+        f"Article Content:\n{safe_content}\n\n"
+        "Return exactly: {\"title\": \"<생성된 제목>\", \"description\": \"<생성된 설명>\"}"
+    )
+    try:
+        response = SEO_GENERATIVE_MODEL.generate_content(prompt)
+        raw_text = getattr(response, "text", "") or ""
+
+        if not raw_text and getattr(response, "candidates", None):
+            collected_parts: List[str] = []
+            for candidate in response.candidates:  # type: ignore[attr-defined]
+                candidate_content: Any = getattr(candidate, "content", None)
+                parts = getattr(candidate_content, "parts", None) if candidate_content else None
+                if parts:
+                    for part in parts:
+                        text_part = getattr(part, "text", None)
+                        if text_part:
+                            collected_parts.append(str(text_part))
+            raw_text = " ".join(collected_parts)
+
+        cleaned_text = raw_text.strip()
+        if not cleaned_text:
+            raise ValueError("SEO model returned an empty response")
+
+        if cleaned_text.startswith("```"):
+            cleaned_text = re.sub(r"^```(?:json)?", "", cleaned_text, flags=re.IGNORECASE).strip()
+            cleaned_text = re.sub(r"```$", "", cleaned_text).strip()
+
+        match = re.search(r"{.*}", cleaned_text, re.DOTALL)
+        json_payload = match.group(0) if match else cleaned_text
+
+        seo_json = json.loads(json_payload)
+        suggested_title = str(seo_json.get("title", "")).strip()
+        suggested_description = str(seo_json.get("description", "")).strip()
+
+        if not suggested_title or not suggested_description:
+            raise ValueError("SEO model response missing required fields")
+
+        return {
+            "suggested_title": suggested_title[:60],
+            "suggested_description": suggested_description[:150],
+        }
+    except Exception as exc:  # pragma: no cover - external API failures
+        logger.error("SEO model (%s) generation failed: %s", SEO_MODEL_NAME, exc)
+        fallback_title = _extract_first_sentence(safe_content) or safe_content[:60]
+        fallback_description = safe_content[:150]
+        return {
+            "suggested_title": fallback_title,
+            "suggested_description": fallback_description,
+        }
+
+
+@app.route("/", methods=["GET"])
+def serve_index() -> Any:
+    """Serve the single-page frontend."""
+    template_dir = app.template_folder or "."
+    return send_from_directory(template_dir, "index.html")
+
+
+@app.route("/healthz", methods=["GET"])
+def healthcheck() -> Any:
+    """Simple health check endpoint."""
+    return jsonify({"status": "ok"})
+
+
+@app.route("/categories", methods=["GET"])
+def list_categories() -> Any:
+    """Expose category options inferred from the fitted OneHotEncoder."""
+    return jsonify({"categories": _encoded_categories})
+
+
+@app.route("/predict", methods=["POST"])
+def predict() -> Any:
+    payload = request.get_json(silent=True) or {}
+
+    required_fields = {"title", "content", "category"}
+    missing = [field for field in required_fields if not payload.get(field)]
+    if missing:
+        return (
+            jsonify({"error": f"Missing required fields: {', '.join(missing)}"}),
+            400,
+        )
+
+    title: str = str(payload.get("title", "")).strip()
+    content: str = str(payload.get("content", "")).strip()
+    category: str = str(payload.get("category", "")).strip()
+
+    combined_text = f"{title} {content}".strip()
+    if not combined_text:
+        return jsonify({"error": "Title and content cannot both be empty."}), 400
+
+    text_vector = tfidf_vectorizer.transform([combined_text])
+
+    try:
+        category_frame = _ensure_dataframe(category)
+        category_vector = onehot_encoder.transform(category_frame[["category"]])
+    except Exception as exc:
+        return jsonify({"error": f"Failed to encode category: {exc}"}), 400
+
+    feature_vector = hstack([text_vector, category_vector])
+
+    view_prediction = view_prediction_model.predict(feature_vector)[0]
+    predicted_views = int(round(float(view_prediction)))
+
+    age_prediction = age_prediction_model.predict(feature_vector)[0]
+    predicted_age_index = int(age_prediction)
+
+    similar_articles_raw = _find_similar_articles(text_vector, top_k=5)
+
+    similar_articles: List[Dict[str, Any]] = []
+    for article in similar_articles_raw:
+        article_id = article.get("id")
+        article_title = article.get("title")
+        lookup_key = str(article_id) if article_id is not None else ""
+        content_text = article_content_lookup.get(lookup_key, "")
+        summary = content_text.strip()[:100]
+        similar_articles.append(
+            {
+                "id": article_id,
+                "title": article_title,
+                "summary": summary,
+            }
+        )
+
+    try:
+        decoded_age_group = label_encoder.inverse_transform([predicted_age_index])[0]
+    except Exception:
+        decoded_age_group = str(predicted_age_index)
+
+    seo_recommendation = generate_seo_suggestions(content)
+    seo_simulation = {
+        "current_state": {
+            "issue": "메타 정보에 핵심 키워드가 부족하고 설명이 너무 길어 SERP에서 잘립니다.",
+            "title": title[:70] or "제목이 입력되지 않았습니다.",
+            "description": (content[:150] + ("..." if len(content) > 150 else "")) if content else "본문이 입력되지 않았습니다.",
+        },
+        "recommended_state": {
+            "title": seo_recommendation["suggested_title"],
+            "description": seo_recommendation["suggested_description"],
+        },
+    }
+
+    response_payload = {
+        "ai_prediction": {
+            "predicted_views": predicted_views,
+            "predicted_age_group": decoded_age_group,
+            "similar_articles": similar_articles,
+        },
+        "seo_simulation": seo_simulation,
+    }
+
+    return jsonify(response_payload)
+
+
+@app.route("/generate-description", methods=["POST"])
+def generate_description() -> Any:
+    payload = request.get_json(silent=True) or {}
+    content_text = str(payload.get("content", ""))
+    if not content_text.strip():
+        return jsonify({"error": "Content is required to generate a description."}), 400
+
+    suggestions = generate_seo_suggestions(content_text)
+    return jsonify(
+        {
+            "title": suggestions.get("suggested_title", ""),
+            "description": suggestions.get("suggested_description", ""),
+        }
+    )
+
+
+if __name__ == "__main__":  # pragma: no cover - manual execution only.
+    app.run(host="0.0.0.0", port=5000, debug=False)
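One subtlety behind the artifact loading above: `tfidf_vectorizer.pkl` was fitted with the custom `okt_tokenizer`, and joblib pickles only a *reference* to that function, which is why `app.py` must define it identically to the training script. A minimal sketch of the constraint, using a hypothetical toy tokenizer and file name rather than the project's real artifacts:

```python
# Sketch: a TfidfVectorizer pickled with a custom tokenizer can only be
# unpickled in a process where that same function is importable under the
# same qualified name. `toy_tokenizer` and the file name are hypothetical.
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

def toy_tokenizer(text: str) -> list[str]:
    # Stands in for okt_tokenizer; must exist at load time too.
    return text.split()

vec = TfidfVectorizer(tokenizer=toy_tokenizer, token_pattern=None)
vec.fit(["hello world", "hello there"])
joblib.dump(vec, "toy_vectorizer.pkl")

# In a fresh process, importing toy_tokenizer must succeed first;
# otherwise joblib.load raises an AttributeError while unpickling.
restored = joblib.load("toy_vectorizer.pkl")
print(restored.transform(["hello world"]).shape)
```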
article_mapping.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5edb3af7db4f3664dd57d9a0f38460154d421b27c2e8f034533f04c90008330c
+size 212049
convert_to_csv.py
ADDED
@@ -0,0 +1,83 @@
+"""
+Script that converts the Excel data files to CSV format.
+"""
+import pandas as pd
+import os
+from datetime import datetime
+
+# Directory settings
+data_dir = r'c:\Users\korea\Desktop\dacon_broadcast_paper\data'
+output_dir = r'c:\Users\korea\Desktop\dacon_broadcast_paper\data_csv'
+
+# Create the output directory
+if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
+    print(f"CSV 출력 디렉토리 생성: {output_dir}")
+
+# Files to convert
+files = [
+    'article_metrics_monthly.xlsx',
+    'contents.xlsx',
+    'demographics_part001.xlsx',
+    'demographics_part002.xlsx',
+    'referrer.xlsx'
+]
+
+print("=" * 80)
+print("Excel → CSV 변환 시작")
+print("=" * 80)
+
+for file in files:
+    start_time = datetime.now()
+    file_path = os.path.join(data_dir, file)
+    csv_filename = file.replace('.xlsx', '.csv')
+    csv_path = os.path.join(output_dir, csv_filename)
+
+    print(f"\n[처리 중] {file}")
+
+    try:
+        # Read the Excel file
+        df = pd.read_excel(file_path)
+
+        # Save as CSV (UTF-8 with BOM for Excel compatibility)
+        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
+
+        # Compute the processing time
+        elapsed = (datetime.now() - start_time).total_seconds()
+
+        # Report results
+        print(f"  ✓ 완료: {csv_filename}")
+        print(f"    - 행 개수: {len(df):,}")
+        print(f"    - 열 개수: {len(df.columns)}")
+        print(f"    - 처리 시간: {elapsed:.2f}초")
+        print(f"    - 저장 경로: {csv_path}")
+
+    except Exception as e:
+        print(f"  ✗ 오류 발생: {str(e)}")
+
+# Merge the demographics files (optional)
+print("\n" + "=" * 80)
+print("[추가 작업] demographics 파일 병합")
+print("=" * 80)
+
+try:
+    demo_part1 = pd.read_csv(os.path.join(output_dir, 'demographics_part001.csv'))
+    demo_part2 = pd.read_csv(os.path.join(output_dir, 'demographics_part002.csv'))
+
+    # Concatenate the two parts
+    demographics_merged = pd.concat([demo_part1, demo_part2], ignore_index=True)
+
+    # Save the merged file
+    merged_path = os.path.join(output_dir, 'demographics_merged.csv')
+    demographics_merged.to_csv(merged_path, index=False, encoding='utf-8-sig')
+
+    print(f"✓ demographics 병합 완료")
+    print(f"  - 총 행 개수: {len(demographics_merged):,}")
+    print(f"  - 저장 경로: {merged_path}")
+
+except Exception as e:
+    print(f"✗ 병합 중 오류 발생: {str(e)}")
+
+print("\n" + "=" * 80)
+print("모든 변환 작업 완료!")
+print("=" * 80)
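As a quick sanity check after running the converter, the merged demographics file should contain exactly the rows of the two parts combined. A small verification sketch (paths assume the `output_dir` used by `convert_to_csv.py` above):

```python
# Verify the demographics merge performed by convert_to_csv.py:
# merged row count == part001 + part002, with the same column layout.
import os
import pandas as pd

output_dir = r'c:\Users\korea\Desktop\dacon_broadcast_paper\data_csv'

part1 = pd.read_csv(os.path.join(output_dir, 'demographics_part001.csv'))
part2 = pd.read_csv(os.path.join(output_dir, 'demographics_part002.csv'))
merged = pd.read_csv(os.path.join(output_dir, 'demographics_merged.csv'))

assert len(merged) == len(part1) + len(part2), "row counts do not add up"
assert list(merged.columns) == list(part1.columns), "column layout changed"
print(f"OK: {len(merged):,} rows, {len(merged.columns)} columns")
```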
data/article_metrics_monthly.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc5d86bdddcb0c22eb82b52d167739a2f548d9d47aad6809154bfda1776962a0
+size 857389
data/contents.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:023d9619f1232a138572f19c4411a1e1a226435544710f8b9a68ef2e8f708238
+size 8973538
data/demographics_part001.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebd47a1657bddb8b43c6f8015f39ee546bf2bd4adceb3287443bee7a54c99bf1
+size 22162808
data/demographics_part002.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d005ab673df2c4e5a5fbea078609276ddf092e520e207aeb5636c3a8857a1ae2
+size 1973477
data/referrer.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f349b7c362f1a5d9b598e2d7bde46b4b0417866e84423f9dcf9d736b3b1e0c32
+size 9025763
data_csv/article_metrics_monthly.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f526da60ce292fc4fde55d66c5b2314363b6cdb050390902f6ac21d02c288c5
+size 1370521
data_csv/contents.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00278be4f929cc7f2d4e1afc9f8ee0222f16c5edee8eec908361dccd54c5019d
+size 21806940
data_csv/demographics_merged.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59d65448bbc7953f3d06f6b0d2a081b442fcc6b6f5955c3f72fbbad85a8d41aa
+size 41352606
data_csv/demographics_part001.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c94a682cee7a5d81ce4be323a385331180da40552595d6cedfb8008f6f81026d
+size 37956547
data_csv/demographics_part002.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aeefbdf084d02890ee6f948b916d0c158f93d2c0c57c43c56528435343f654f1
+size 3396110
data_csv/referrer.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:751a2be72ae6f95cd344b8b63ebcf8a521fff3a14352d05b9c3e8e643c946c6c
+size 43647022
data_structure_analysis.py
ADDED
@@ -0,0 +1,60 @@
+"""
+Script that analyses the structure of the 신문과방송 reader data.
+"""
+import pandas as pd
+import os
+
+# Data folder path
+data_dir = r'c:\Users\korea\Desktop\dacon_broadcast_paper\data'
+
+# Files to analyse
+files = [
+    'article_metrics_monthly.xlsx',
+    'contents.xlsx',
+    'demographics_part001.xlsx',
+    'demographics_part002.xlsx',
+    'referrer.xlsx'
+]
+
+print("=" * 80)
+print("신문과방송 독자 데이터 구조 분석")
+print("=" * 80)
+
+for file in files:
+    file_path = os.path.join(data_dir, file)
+    print(f"\n{'='*80}")
+    print(f"파일명: {file}")
+    print(f"{'='*80}")
+
+    try:
+        # Read the Excel file
+        df = pd.read_excel(file_path)
+
+        # Basic information
+        print(f"\n[기본 정보]")
+        print(f"행 개수: {len(df):,}")
+        print(f"열 개수: {len(df.columns)}")
+        print(f"전체 크기: {df.shape}")
+
+        # Column information
+        print(f"\n[컬럼 목록 및 데이터 타입]")
+        for idx, (col, dtype) in enumerate(zip(df.columns, df.dtypes), 1):
+            non_null = df[col].notna().sum()
+            null_count = df[col].isna().sum()
+            null_pct = (null_count / len(df)) * 100
+            print(f"{idx:2d}. {col:40s} | Type: {str(dtype):15s} | Non-Null: {non_null:,} | Null: {null_count:,} ({null_pct:.1f}%)")
+
+        # Sample data (first 3 rows)
+        print(f"\n[샘플 데이터 (처음 3행)]")
+        print(df.head(3).to_string())
+
+        # Memory usage
+        memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
+        print(f"\n[메모리 사용량]: {memory_mb:.2f} MB")
+
+    except Exception as e:
+        print(f"오류 발생: {str(e)}")
+
+print("\n" + "=" * 80)
+print("분석 완료!")
+print("=" * 80)
index.html
ADDED
|
@@ -0,0 +1,581 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="ko">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
+
<title>기사 성과 예측</title>
|
| 7 |
+
<style>
|
| 8 |
+
*, *::before, *::after {box-sizing: border-box;}
|
| 9 |
+
:root {
|
| 10 |
+
/* Color Palette */
|
| 11 |
+
--font-sans: "Pretendard", "Segoe UI", system-ui, -apple-system, sans-serif;
|
| 12 |
+
|
| 13 |
+
/* Light Mode */
|
| 14 |
+
--bg-light: #f4f6f8; /* 약간 더 부드러운 회색 배경 */
|
| 15 |
+
--card-light: #ffffff;
|
| 16 |
+
--text-primary-light: #111827; /* 진한 차콜 */
|
| 17 |
+
--text-secondary-light: #6b7280; /* 중간 회색 */
|
| 18 |
+
--border-light: #e5e7eb; /* 옅은 회색 테두리 */
|
| 19 |
+
--accent-primary: #3b82f6; /* 차분한 블루 */
|
| 20 |
+
--accent-secondary: #1d4ed8;
|
| 21 |
+
--accent-glow: rgba(59, 130, 246, 0.2);
|
| 22 |
+
|
| 23 |
+
/* Dark Mode */
|
| 24 |
+
--bg-dark: #111827; /* 딥 네이비/차콜 */
|
| 25 |
+
--card-dark: #1f2937; /* 약간 밝은 네이비/차콜 */
|
| 26 |
+
--text-primary-dark: #f9fafb; /* 거의 흰색 */
|
| 27 |
+
--text-secondary-dark: #9ca3af; /* 밝은 회색 */
|
| 28 |
+
--border-dark: #374151;
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
/* Basic Setup */
|
| 32 |
+
body {
|
| 33 |
+
font-family: var(--font-sans);
|
| 34 |
+
margin: 0;
|
| 35 |
+
padding: 0;
|
| 36 |
+
background-color: var(--bg-light);
|
| 37 |
+
color: var(--text-primary-light);
|
| 38 |
+
display: flex;
|
| 39 |
+
justify-content: center;
|
| 40 |
+
min-height: 100vh;
|
| 41 |
+
-webkit-font-smoothing: antialiased;
|
| 42 |
+
-moz-osx-font-smoothing: grayscale;
|
| 43 |
+
transition: background-color 0.3s ease, color 0.3s ease;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
@media (prefers-color-scheme: dark) {
|
| 47 |
+
body {
|
| 48 |
+
background-color: var(--bg-dark);
|
| 49 |
+
color: var(--text-primary-dark);
|
| 50 |
+
}
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
.container {
|
| 54 |
+
width: min(960px, 100% - 40px);
|
| 55 |
+
padding: 48px 20px 64px;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
/* Typography */
|
| 59 |
+
h1 {
|
| 60 |
+
font-size: 2.25rem;
|
| 61 |
+
font-weight: 800;
|
| 62 |
+
letter-spacing: -0.04em;
|
| 63 |
+
color: var(--text-primary-light);
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
p.subtitle {
|
| 67 |
+
font-size: 1.125rem;
|
| 68 |
+
margin: 8px 0 40px;
|
| 69 |
+
color: var(--text-secondary-light);
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
h2 {
|
| 73 |
+
font-size: 1.5rem;
|
| 74 |
+
font-weight: 700;
|
| 75 |
+
border-bottom: 1px solid var(--border-light);
|
| 76 |
+
padding-bottom: 12px;
|
| 77 |
+
margin-bottom: 24px;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
h3 {
|
| 81 |
+
font-size: 1.125rem;
|
| 82 |
+
font-weight: 600;
|
| 83 |
+
color: var(--text-primary-light);
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
@media (prefers-color-scheme: dark) {
|
| 87 |
+
h1, h3 { color: var(--text-primary-dark); }
|
| 88 |
+
p.subtitle { color: var(--text-secondary-dark); }
|
| 89 |
+
h2 { border-color: var(--border-dark); }
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
/* Main Card */
|
| 93 |
+
.card {
|
| 94 |
+
background-color: var(--card-light);
|
| 95 |
+
border-radius: 24px;
|
| 96 |
+
padding: 40px;
|
| 97 |
+
box-shadow: 0 8px 16px rgba(0, 0, 0, 0.02), 0 20px 40px rgba(23, 33, 73, 0.08);
|
| 98 |
+
transition: background-color 0.3s ease, box-shadow 0.3s ease;
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
@media (prefers-color-scheme: dark) {
|
| 102 |
+
.card {
|
| 103 |
+
background-color: var(--card-dark);
|
| 104 |
+
box-shadow: 0 20px 40px rgba(0, 0, 0, 0.25);
|
| 105 |
+
}
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
/* Form Elements */
|
| 109 |
+
form {
|
| 110 |
+
display: grid;
|
| 111 |
+
gap: 24px;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
label {
|
| 115 |
+
display: block;
|
| 116 |
+
font-weight: 600;
|
| 117 |
+
font-size: 0.9rem;
|
| 118 |
+
margin-bottom: 8px;
|
| 119 |
+
color: var(--text-primary-light);
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
@media (prefers-color-scheme: dark) {
|
| 123 |
+
label { color: var(--text-primary-dark); }
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
input[type="text"],
|
| 127 |
+
select,
|
| 128 |
+
textarea {
|
| 129 |
+
width: 100%;
|
| 130 |
+
padding: 12px 16px;
|
| 131 |
+
border-radius: 10px;
|
| 132 |
+
border: 1px solid var(--border-light);
|
| 133 |
+
font-size: 1rem;
|
| 134 |
+
background-color: var(--bg-light);
|
| 135 |
+
color: var(--text-primary-light);
|
| 136 |
+
transition: border-color 0.2s ease, box-shadow 0.2s ease;
|
| 137 |
+
resize: vertical;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
textarea { min-height: 180px; }
|
| 141 |
+
|
| 142 |
+
input:focus,
|
| 143 |
+
textarea:focus,
|
| 144 |
+
select:focus {
|
| 145 |
+
outline: none;
|
| 146 |
+
border-color: var(--accent-primary);
|
| 147 |
+
box-shadow: 0 0 0 3px var(--accent-glow);
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
@media (prefers-color-scheme: dark) {
|
| 151 |
+
input[type="text"], select, textarea {
|
| 152 |
+
background-color: #2a3647;
|
| 153 |
+
border-color: var(--border-dark);
|
| 154 |
+
color: var(--text-primary-dark);
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
button {
|
| 159 |
+
background: linear-gradient(135deg, var(--accent-primary), var(--accent-secondary));
|
| 160 |
+
color: #fff;
|
| 161 |
+
border: none;
|
| 162 |
+
padding: 14px 24px;
|
| 163 |
+
border-radius: 10px;
|
| 164 |
+
font-size: 1rem;
|
| 165 |
+
font-weight: 600;
|
| 166 |
+
cursor: pointer;
|
| 167 |
+
transition: all 0.2s ease;
|
| 168 |
+
box-shadow: 0 4px 12px rgba(59, 130, 246, 0.2);
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
button:hover {
|
| 172 |
+
transform: translateY(-2px);
|
| 173 |
+
box-shadow: 0 8px 20px rgba(59, 130, 246, 0.3);
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
button:disabled {
|
| 177 |
+
opacity: 0.5;
|
| 178 |
+
cursor: not-allowed;
|
| 179 |
+
transform: none;
|
| 180 |
+
box-shadow: none;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
#summarize-btn {
|
| 184 |
+
margin-top: 8px;
|
| 185 |
+
width: 100%;
|
| 186 |
+
background: transparent;
|
| 187 |
+
color: var(--accent-secondary);
|
| 188 |
+
border: 1px solid var(--accent-secondary);
|
| 189 |
+
box-shadow: none;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
#summarize-btn:hover {
|
| 193 |
+
background: rgba(59, 130, 246, 0.1);
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
#summarize-btn:disabled {
|
| 197 |
+
opacity: 0.6;
|
| 198 |
+
background: rgba(59, 130, 246, 0.05);
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
/* Results Section */
|
| 202 |
+
#results-container {
|
| 203 |
+
margin-top: 40px;
|
| 204 |
+
padding-top: 40px;
|
| 205 |
+
border-top: 1px solid var(--border-light);
|
| 206 |
+
line-height: 1.65;
|
| 207 |
+
}
|
| 208 |
+
@media (prefers-color-scheme: dark) {
|
| 209 |
+
#results-container { border-color: var(--border-dark); }
|
| 210 |
+
}
|
| 211 |
+
#results-container.hidden { display: none; }
|
| 212 |
+
|
| 213 |
+
.spinner {
|
| 214 |
+
width: 36px;
|
| 215 |
+
height: 36px;
|
| 216 |
+
border: 4px solid var(--accent-glow);
|
| 217 |
+
border-top-color: var(--accent-primary);
|
| 218 |
+
border-radius: 50%;
|
| 219 |
+
animation: spin 1s linear infinite;
|
| 220 |
+
margin: 32px auto;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
@keyframes spin { to { transform: rotate(360deg); } }
|
| 224 |
+
|
| 225 |
+
.error-message { color: #f43f5e; font-weight: 600; }
|
| 226 |
+
.results-section { margin-bottom: 40px; }
|
| 227 |
+
|
| 228 |
+
.ai-card {
|
| 229 |
+
background-color: #eff6ff;
|
| 230 |
+
border-radius: 16px;
|
| 231 |
+
padding: 24px;
|
| 232 |
+
border: 1px solid #dbeafe;
|
| 233 |
+
}
|
| 234 |
+
.ai-card ul { padding-left: 20px; color: var(--text-secondary-light); }
|
| 235 |
+
.ai-card li strong { color: var(--text-primary-light); }
|
| 236 |
+
.ai-card li .summary {
|
| 237 |
+
font-size: 0.9rem;
|
| 238 |
+
color: var(--text-secondary-light);
|
| 239 |
+
margin: 6px 0 0;
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
@media (prefers-color-scheme: dark) {
|
| 243 |
+
.ai-card {
|
| 244 |
+
background-color: #1e293b;
|
| 245 |
+
border-color: #334155;
|
| 246 |
+
}
|
| 247 |
+
.ai-card ul { color: var(--text-secondary-dark); }
|
| 248 |
+
.ai-card li strong { color: var(--text-primary-dark); }
|
| 249 |
+
.ai-card li .summary { color: var(--text-secondary-dark); }
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
/* SEO Simulation */
|
| 253 |
+
.seo-grid {
|
| 254 |
+
display: grid;
|
| 255 |
+
grid-template-columns: 1fr 1fr;
|
| 256 |
+
gap: 24px;
|
| 257 |
+
}
|
| 258 |
+
@media (max-width: 768px) {
|
| 259 |
+
.seo-grid { grid-template-columns: 1fr; }
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
.seo-column {
|
| 263 |
+
border: 1px solid var(--border-light);
|
| 264 |
+
border-radius: 16px;
|
| 265 |
+
padding: 24px;
|
| 266 |
+
}
|
| 267 |
+
@media (prefers-color-scheme: dark) {
|
| 268 |
+
.seo-column { border-color: var(--border-dark); }
|
| 269 |
+
}
|
| 270 |
+
.seo-column h3 { margin-top: 0; }
|
| 271 |
+
|
| 272 |
+
/* SERP Preview */
|
| 273 |
+
.serp-preview {
|
| 274 |
+
background-color: var(--card-light);
|
| 275 |
+
border-radius: 8px;
|
| 276 |
+
padding: 16px;
|
| 277 |
+
border: 1px solid var(--border-light);
|
| 278 |
+
margin-bottom: 24px;
|
| 279 |
+
font-family: Arial, sans-serif;
|
| 280 |
+
}
|
| 281 |
+
.serp-preview .serp-url { color: #1e8e3e; font-size: 0.875rem; }
|
| 282 |
+
.serp-preview .serp-title { color: #1a0dab; font-size: 1.25rem; font-weight: 400; margin: 4px 0; }
|
| 283 |
+
.serp-preview .serp-description { color: #4d5156; font-size: 0.875rem; line-height: 1.5; }
|
| 284 |
+
|
| 285 |
+
@media (prefers-color-scheme: dark) {
|
| 286 |
+
.serp-preview {
|
| 287 |
+
background-color: #2d3748;
|
| 288 |
+
border-color: var(--border-dark);
|
| 289 |
+
}
|
| 290 |
+
.serp-preview .serp-url { color: #98c379; }
|
| 291 |
+
.serp-preview .serp-title { color: #8ab4f8; }
|
| 292 |
+
.serp-preview .serp-description { color: #bdc1c6; }
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
.seo-input-group { margin-top: 16px; }
|
| 296 |
+
|
| 297 |
+
.counter {
|
| 298 |
+
font-size: 0.8rem;
|
| 299 |
+
color: var(--text-secondary-light);
|
| 300 |
+
text-align: right;
|
| 301 |
+
margin-top: 4px;
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
@media (prefers-color-scheme: dark) {
|
| 305 |
+
.counter { color: var(--text-secondary-dark); }
|
| 306 |
+
}
|
| 307 |
+
</style>
|
| 308 |
+
</head>
|
| 309 |
+
<body>
  <main class="container">
    <section class="card">
      <h1>기사 성과 예측</h1>
      <p class="subtitle">기사 내용을 입력하면 예상 조회수, 핵심 독자층, 유사 기사를 추천해 드립니다.</p>

      <form id="prediction-form" autocomplete="off">
        <div>
          <label for="title">기사 제목</label>
          <input type="text" id="title" name="title" placeholder="예: AI 기술, 언론 제작 방식 대전환" required />
        </div>

        <div>
          <label for="category">카테고리</label>
          <select id="category" name="category" required>
            <option value="" disabled selected>카테고리를 선택하세요</option>
          </select>
        </div>

        <div>
          <label for="content">기사 본문</label>
          <textarea id="content" name="content" placeholder="핵심 내용과 주요 포인트를 입력하세요." required></textarea>
        </div>

        <button type="submit">분석하기</button>
      </form>

      <div id="results-container" class="hidden"></div>
    </section>
  </main>

  <script>
    const form = document.getElementById("prediction-form");
    const resultsContainer = document.getElementById("results-container");
    const categorySelect = document.getElementById("category");
    const articleContentInput = document.getElementById("content");

    const MAX_TITLE_LENGTH = 60;
    const MAX_DESCRIPTION_LENGTH = 150;

    const renderLoading = () => {
      resultsContainer.classList.remove("hidden");
      resultsContainer.innerHTML = `
        <div class="spinner"></div>
        <p style="text-align:center;">AI가 분석 중입니다...</p>
      `;
    };

    const renderError = (message) => {
      resultsContainer.classList.remove("hidden");
      resultsContainer.innerHTML = `<p class="error-message">⚠️ ${message}</p>`;
    };

    const createSimilarArticlesList = (similarArticles = []) => {
      if (!similarArticles.length) {
        return "<li>유사 기사를 찾지 못했습니다.</li>";
      }
      return similarArticles
        .map((article) => {
          const title = article.title ?? "제목 미상";
          const summary = article.summary ? `${article.summary}...` : "요약을 제공할 수 없습니다.";
          return `
            <li>
              <strong>${title}</strong>
              <p class="summary">${summary}</p>
            </li>
          `;
        })
        .join("");
    };

    const renderResults = (data) => {
      const { ai_prediction, seo_simulation } = data;
      const { predicted_views, predicted_age_group, similar_articles } = ai_prediction || {};
      const { current_state, recommended_state } = seo_simulation || {};

      resultsContainer.classList.remove("hidden");
      resultsContainer.innerHTML = `
        <section class="results-section ai-card">
          <h2>AI 예측 결과</h2>
          <h3>예상 조회수: <strong>${(predicted_views ?? 0).toLocaleString()}회</strong></h3>
          <h3>핵심 독자층: <strong>${predicted_age_group ?? "N/A"}</strong></h3>
          <h3>유사 기사 추천</h3>
          <ul>
            ${createSimilarArticlesList(similar_articles)}
          </ul>
        </section>

        <section class="results-section">
          <h2>SEO 시뮬레이션</h2>
          <div class="seo-grid">
            <div class="seo-column">
              <h3>현재 노출 상태 (Before)</h3>
              <div class="serp-preview">
                <p class="serp-url">press.example.com/article</p>
                <p class="serp-title">${current_state?.title ?? "한국언론진흥재단 전체기사"}</p>
                <p class="serp-description">${current_state?.description ?? "설명이 제공되지 않았습니다."}</p>
              </div>
              <p>${current_state?.issue ?? "현재 문제를 파악하지 못했습니다."}</p>
            </div>

            <div class="seo-column">
              <h3>추천 개선안 (After)</h3>
              <div class="serp-preview" id="after-preview">
                <p class="serp-url">press.example.com/article</p>
                <p class="serp-title" id="after-title-preview"></p>
                <p class="serp-description" id="after-description-preview"></p>
              </div>

              <div class="seo-input-group">
                <div>
                  <label for="seo-title-input">추천 제목</label>
                  <input type="text" id="seo-title-input" maxlength="80" />
                  <div class="counter" id="title-counter"></div>
                </div>

                <div>
                  <label for="seo-description-input">추천 설명</label>
                  <textarea id="seo-description-input" rows="4" maxlength="200"></textarea>
                  <div class="counter" id="description-counter"></div>
                  <button type="button" id="summarize-btn">본문 내용으로 설명 다시 생성하기 ↺</button>
                </div>
              </div>

              <p>추천 제목과 설명을 조정하여 SERP에서의 가독성과 클릭 유도 문구를 최적화할 수 있습니다.</p>
            </div>
          </div>
        </section>
      `;

      const seoTitleInput = document.getElementById("seo-title-input");
      const seoDescriptionInput = document.getElementById("seo-description-input");
      const afterTitlePreview = document.getElementById("after-title-preview");
      const afterDescriptionPreview = document.getElementById("after-description-preview");
      const titleCounter = document.getElementById("title-counter");
      const descriptionCounter = document.getElementById("description-counter");
      const summarizeBtn = document.getElementById("summarize-btn");

      const initialTitle = recommended_state?.title ?? "검색 엔진 친화적인 기사 제목";
      const initialDescription = recommended_state?.description ?? "핵심 키워드와 요약 내용을 포함하는 설명을 작성해 보세요.";

      seoTitleInput.maxLength = MAX_TITLE_LENGTH;
      seoDescriptionInput.maxLength = MAX_DESCRIPTION_LENGTH;
      seoTitleInput.value = initialTitle;
      seoDescriptionInput.value = initialDescription;

      const updateTitlePreview = () => {
        const value = seoTitleInput.value;
        afterTitlePreview.textContent = value;
        titleCounter.textContent = `${value.length} / ${MAX_TITLE_LENGTH}`;
      };

      const updateDescriptionPreview = () => {
        const value = seoDescriptionInput.value;
        afterDescriptionPreview.textContent = value;
        descriptionCounter.textContent = `${value.length} / ${MAX_DESCRIPTION_LENGTH}`;
      };

      seoTitleInput.addEventListener("input", updateTitlePreview);
      seoDescriptionInput.addEventListener("input", updateDescriptionPreview);

      if (summarizeBtn) {
        summarizeBtn.addEventListener("click", async () => {
          const contentValue = articleContentInput.value.trim();
          if (!contentValue) {
            alert("본문 내용을 입력한 뒤 다시 시도해주세요.");
            return;
          }

          const originalText = summarizeBtn.textContent;
          summarizeBtn.disabled = true;
          summarizeBtn.textContent = "생성 중...";

          try {
            const response = await fetch("/generate-description", {
              method: "POST",
              headers: {
                "Content-Type": "application/json",
              },
              body: JSON.stringify({ content: contentValue }),
            });

            if (!response.ok) {
              const { error } = await response.json();
              throw new Error(error || "설명을 생성하지 못했습니다.");
            }

            const seoData = await response.json();
            seoTitleInput.value = seoData.title ?? "";
            seoDescriptionInput.value = seoData.description ?? "";
            seoTitleInput.dispatchEvent(new Event("input"));
            seoDescriptionInput.dispatchEvent(new Event("input"));
          } catch (error) {
            console.error(error);
            alert("AI 설명 생성 중 문제가 발생했습니다. 잠시 후 다시 시도해주세요.");
          } finally {
            summarizeBtn.disabled = false;
            summarizeBtn.textContent = originalText;
          }
        });
      }

      updateTitlePreview();
      updateDescriptionPreview();
    };

    const fetchCategories = async () => {
      try {
        const response = await fetch("/categories");
        if (!response.ok) throw new Error("카테고리 정보를 불러오지 못했습니다.");
        const { categories } = await response.json();

        if (Array.isArray(categories)) {
          categories.forEach((category) => {
            const option = document.createElement("option");
            option.value = category;
            option.textContent = category;
            categorySelect.appendChild(option);
          });
        }
      } catch (error) {
        console.error(error);
        renderError("카테고리 로딩 중 문제가 발생했습니다. 새로고침 후 다시 시도해주세요.");
      }
    };

    form.addEventListener("submit", async (event) => {
      event.preventDefault();

      const title = form.title.value.trim();
      const content = form.content.value.trim();
      const category = form.category.value;

      if (!title || !content || !category) {
        renderError("제목, 카테고리, 본문을 모두 입력해주세요.");
        return;
      }

      renderLoading();
      const submitButton = form.querySelector("button[type='submit']");
      submitButton.disabled = true;
      const originalButtonText = submitButton.textContent;
      submitButton.textContent = "AI로 분석 중...";

      try {
        const response = await fetch("/predict", {
          method: "POST",
          headers: {
            "Content-Type": "application/json",
          },
          body: JSON.stringify({ title, content, category }),
        });

        if (!response.ok) {
          const { error } = await response.json();
          throw new Error(error || "예측 요청에 실패했습니다.");
        }

        const payload = await response.json();
        renderResults(payload);
      } catch (error) {
        console.error(error);
        renderError(error.message || "서버 요청 중 오류가 발생했습니다.");
      } finally {
        submitButton.disabled = false;
        submitButton.textContent = originalButtonText;
      }
    });

    fetchCategories();
  </script>
</body>
</html>
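For reference, the script above consumes a fixed JSON shape from the /predict endpoint. The sketch below reconstructs that contract from the destructuring in renderResults(); the field names come straight from the front-end code, while the sample values, and the optional current_state.title / current_state.description keys, are illustrative assumptions about what app.py returns.

# Illustrative /predict response, inferred from renderResults() above.
# Values are placeholders; the real app.py payload may carry extra keys.
example_predict_response = {
    "ai_prediction": {
        "predicted_views": 12840,           # rendered via toLocaleString()
        "predicted_age_group": "30대",       # label produced by the classifier
        "similar_articles": [                # fed to createSimilarArticlesList()
            {"title": "유사 기사 제목", "summary": "요약문 일부"},
        ],
    },
    "seo_simulation": {
        "current_state": {
            # title/description are optional; the preview falls back to defaults
            "issue": "메타 설명이 누락되어 있습니다.",
        },
        "recommended_state": {
            "title": "추천 제목",
            "description": "추천 메타 설명",
        },
    },
}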
label_encoder.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:180c6e96a643e0da6875e8327c9da6e651ba88b216a373f6e9d85679b51e2bb8
size 558
onehot_encoder.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7eb648fceb784e272c35c0eb24810303b39143c1a5f130e2c50ae7bb23405c25
size 1452
requirements.txt
ADDED
@@ -0,0 +1,11 @@
Flask>=3.0.0
pandas>=2.1.0
numpy>=1.25.0
scikit-learn>=1.3.0
scipy>=1.11.0
joblib>=1.3.0
xgboost>=1.7.6
konlpy>=0.6.0
mecab-python3>=1.0.6
python-dotenv>=1.0.0
google-generativeai>=0.3.1
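The google-generativeai pin above presumably backs the /generate-description endpoint that index.html calls. As a purely hypothetical sketch of that request/response contract (the real logic lives in app.py and may differ; the GEMINI_API_KEY variable name and the gemini-pro model choice are assumptions):

# Hypothetical sketch of the /generate-description endpoint; only the contract
# ({"content": ...} in, {"title": ..., "description": ...} out) is taken from
# index.html. app.py's actual implementation may differ.
import os

import google.generativeai as genai
from dotenv import load_dotenv
from flask import Flask, jsonify, request

load_dotenv()  # assumption: the key is supplied via a GEMINI_API_KEY env var
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

app = Flask(__name__)


@app.post("/generate-description")
def generate_description():
    content = (request.get_json(silent=True) or {}).get("content", "").strip()
    if not content:
        return jsonify({"error": "본문 내용이 비어 있습니다."}), 400

    # The 60/150-character budgets mirror MAX_TITLE_LENGTH and
    # MAX_DESCRIPTION_LENGTH in the front-end script.
    prompt = (
        "다음 기사 본문을 바탕으로 60자 이내의 SEO 제목 한 줄과 "
        "150자 이내의 메타 설명 한 줄을 생성해줘.\n\n" + content
    )
    response = genai.GenerativeModel("gemini-pro").generate_content(prompt)
    title, _, description = response.text.strip().partition("\n")
    return jsonify({"title": title[:60], "description": description.strip()[:150]})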
text_features_matrix.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e7a71eb8a520c1b4dda4b05ba436c653225aa689e9d13b68d7a572013f6d62ba
size 8837035
tfidf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:74b3b6ff74dd41b1357013d6326534ee0a3fd0df6c0b9ff57e9bb5563869fc5f
size 185133
train_and_save_models.py
ADDED
@@ -0,0 +1,321 @@
"""Training pipeline for the "신문과방송" article performance prediction project.

This script prepares the datasets, engineers features using Okt-powered
TF-IDF and categorical encodings, trains XGBoost models for view-count and
primary-audience prediction, and persists all artifacts required by the Flask
inference service.

The script is intended to be executed once the raw CSV files are available in
`data_csv/`. Running it will generate the following files in the project root:

- tfidf_vectorizer.pkl
- onehot_encoder.pkl
- label_encoder.pkl
- view_prediction_model.pkl
- age_prediction_model.pkl
- text_features_matrix.pkl
- article_mapping.pkl
"""
from __future__ import annotations

import sys
from pathlib import Path
from typing import List, cast

import joblib
import numpy as np
import pandas as pd
from konlpy.tag import Okt
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier, XGBRegressor

DATA_DIR = Path("data_csv")
CONTENTS_PATH = DATA_DIR / "contents.csv"
METRICS_PATH = DATA_DIR / "article_metrics_monthly.csv"
DEMOGRAPHICS_PATH = DATA_DIR / "demographics_merged.csv"


def ensure_files_exist(paths: List[Path]) -> None:
    """Raise a helpful error if any expected data file is missing."""
    missing = [str(path) for path in paths if not path.exists()]
    if missing:
        raise FileNotFoundError(
            "Missing required data files: " + ", ".join(missing)
        )


OKT = Okt()


def okt_tokenizer(text: str) -> List[str]:
    """Tokenize text with Okt, keeping stemmed nouns and verbs."""
    if not text.strip():
        return []
    return [word for word, tag in OKT.pos(text, stem=True) if tag in ("Noun", "Verb")]


def load_datasets() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    print("[1/6] Loading datasets...")
    contents = pd.read_csv(CONTENTS_PATH)
    metrics = pd.read_csv(METRICS_PATH)
    demographics = pd.read_csv(DEMOGRAPHICS_PATH)
    return contents, metrics, demographics


def aggregate_metrics(metrics: pd.DataFrame) -> pd.DataFrame:
    print("[2/6] Aggregating article metrics...")
    agg = (
        metrics.groupby("article_id", as_index=False)[["views_total", "comments", "likes"]]
        .sum()
        .rename(columns={
            "comments": "comments_total",
            "likes": "likes_total",
        })
    )
    return agg


def identify_primary_audience(demographics: pd.DataFrame) -> pd.DataFrame:
    print("[3/6] Identifying primary audience age groups...")
    filtered = demographics[demographics["age_group"] != "전체"].copy()
    if filtered.empty:
        raise ValueError(
            "No demographic records found after excluding '전체'."
        )
    # Sort so that ties on views resolve deterministically before idxmax.
    filtered.sort_values(["article_id", "views"], ascending=[True, False], inplace=True)
    idx = filtered.groupby("article_id")["views"].idxmax()
    primary = (
        filtered.loc[idx, ["article_id", "age_group"]]
        .rename(columns={"age_group": "primary_age_group"})
        .reset_index(drop=True)
    )
    return primary


def build_master_dataframe(
    contents: pd.DataFrame,
    metrics_agg: pd.DataFrame,
    primary_audience: pd.DataFrame,
) -> pd.DataFrame:
    print("[4/6] Merging datasets...")
    df_master = contents.merge(metrics_agg, on="article_id", how="left")
    df_master = df_master.merge(primary_audience, on="article_id", how="left")

    # Replace missing numeric metrics with zeros for downstream processing.
    for column in ["views_total", "comments_total", "likes_total"]:
        if column in df_master.columns:
            df_master[column] = df_master[column].fillna(0)

    return df_master


def engineer_features(df_master: pd.DataFrame) -> tuple[csr_matrix, csr_matrix, TfidfVectorizer, OneHotEncoder]:
    print("[5/6] Engineering features (text + category)...")
    text_series = (
        df_master["title"].fillna("") + " " + df_master["content"].fillna("")
    ).str.strip()

    vectorizer = TfidfVectorizer(
        tokenizer=okt_tokenizer,
        max_features=5000,
        lowercase=False,
    )
    X_text = vectorizer.fit_transform(text_series)
    X_text_csr = csr_matrix(X_text)

    category_series = df_master["category"].fillna("미분류")
    onehot_encoder = OneHotEncoder(handle_unknown="ignore")
    X_cat = onehot_encoder.fit_transform(category_series.to_frame())

    X_combined = cast(csr_matrix, hstack([X_text_csr, X_cat]).tocsr())
    return X_combined, X_text_csr, vectorizer, onehot_encoder


def prepare_targets(
    df_master: pd.DataFrame,
    X_combined: csr_matrix,
    X_text: csr_matrix,
) -> tuple[csr_matrix, csr_matrix, np.ndarray, np.ndarray, LabelEncoder, pd.DataFrame]:
    print("[6/6] Preparing targets and filtering valid samples...")
    y_views = df_master["views_total"].fillna(0).to_numpy(dtype=np.float32)
    y_age = df_master["primary_age_group"]

    valid_mask = y_age.notna().to_numpy()
    if not valid_mask.any():
        raise ValueError(
            "No samples contain a primary audience label. Unable to train the classification model."
        )

    X_combined_valid = X_combined[valid_mask]
    X_text_valid = X_text[valid_mask]
    y_views_valid = y_views[valid_mask]
    y_age_valid = y_age[valid_mask].astype(str)

    label_encoder = LabelEncoder()
    y_age_encoded = np.asarray(label_encoder.fit_transform(y_age_valid), dtype=np.int32)

    article_mapping = df_master.loc[valid_mask, ["article_id", "title"]].reset_index(drop=True)

    return (
        X_combined_valid,
        X_text_valid,
        y_views_valid,
        y_age_encoded,
        label_encoder,
        article_mapping,
    )


def train_models(
    X_features: csr_matrix,
    y_views: np.ndarray,
    y_age_encoded: np.ndarray,
    num_classes: int,
) -> tuple[XGBRegressor, XGBClassifier]:
    print("Training XGBoost models with validation split...")

    stratify_target = y_age_encoded if len(np.unique(y_age_encoded)) > 1 else None

    (
        X_train,
        X_valid,
        y_views_train,
        y_views_valid,
        y_age_train,
        y_age_valid,
    ) = train_test_split(
        X_features,
        y_views,
        y_age_encoded,
        test_size=0.2,
        random_state=42,
        stratify=stratify_target,
    )

    view_model = XGBRegressor(
        objective="reg:squarederror",
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        tree_method="hist",
        n_jobs=-1,
    )
    view_model.fit(X_train, y_views_train)

    age_model = XGBClassifier(
        objective="multi:softprob",
        num_class=num_classes,
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        tree_method="hist",
        n_jobs=-1,
        eval_metric="mlogloss",
    )
    age_model.fit(X_train, y_age_train)

    if X_valid.shape[0] > 0:
        view_pred = view_model.predict(X_valid)
        mae = mean_absolute_error(y_views_valid, view_pred)
        age_pred = age_model.predict(X_valid)
        acc = accuracy_score(y_age_valid, age_pred)
        print(f" - Validation MAE (views): {mae:,.2f}")
        print(f" - Validation Accuracy (audience): {acc:.4f}")

    # Refit on the full dataset to maximise performance for saved artifacts.
    view_model.fit(X_features, y_views)
    age_model.fit(X_features, y_age_encoded)

    return view_model, age_model


def save_artifacts(
    vectorizer: TfidfVectorizer,
    onehot_encoder: OneHotEncoder,
    label_encoder: LabelEncoder,
    view_model: XGBRegressor,
    age_model: XGBClassifier,
    text_features: csr_matrix,
    article_mapping: pd.DataFrame,
) -> None:
    print("Saving artifacts...")

    joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
    print("- Saved tfidf_vectorizer.pkl")

    joblib.dump(onehot_encoder, "onehot_encoder.pkl")
    print("- Saved onehot_encoder.pkl")

    joblib.dump(label_encoder, "label_encoder.pkl")
    print("- Saved label_encoder.pkl")

    joblib.dump(view_model, "view_prediction_model.pkl")
    print("- Saved view_prediction_model.pkl")

    joblib.dump(age_model, "age_prediction_model.pkl")
    print("- Saved age_prediction_model.pkl")

    joblib.dump(text_features, "text_features_matrix.pkl")
    print("- Saved text_features_matrix.pkl")

    joblib.dump(article_mapping, "article_mapping.pkl")
    print("- Saved article_mapping.pkl")


def main() -> None:
    np.random.seed(42)

    ensure_files_exist([CONTENTS_PATH, METRICS_PATH, DEMOGRAPHICS_PATH])

    contents, metrics, demographics = load_datasets()
    metrics_agg = aggregate_metrics(metrics)
    primary_audience = identify_primary_audience(demographics)
    df_master = build_master_dataframe(contents, metrics_agg, primary_audience)

    X_combined, X_text, vectorizer, onehot_encoder = engineer_features(df_master)
    (
        X_features,
        X_text_filtered,
        y_views,
        y_age_encoded,
        label_encoder,
        article_mapping,
    ) = prepare_targets(df_master, X_combined, X_text)

    view_model, age_model = train_models(
        X_features,
        y_views,
        y_age_encoded,
        num_classes=len(label_encoder.classes_),
    )

    save_artifacts(
        vectorizer,
        onehot_encoder,
        label_encoder,
        view_model,
        age_model,
        X_text_filtered,
        article_mapping,
    )

    print("All artifacts saved successfully.")


if __name__ == "__main__":
    try:
        main()
    except Exception as exc:  # pragma: no cover - top-level execution guard.
        print(f"Error: {exc}", file=sys.stderr)
        raise
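A minimal sketch of how the saved artifacts plug together at inference time (the serving logic actually lives in app.py; the helper names below are illustrative). One practical caveat: joblib pickles only a reference to the custom tokenizer, so okt_tokenizer must be importable when tfidf_vectorizer.pkl is loaded.

# Inference-time sketch (illustrative; app.py is the real serving code).
import joblib
from scipy.sparse import hstack
from sklearn.metrics.pairwise import cosine_similarity

from train_and_save_models import okt_tokenizer  # noqa: F401  needed to unpickle

vectorizer = joblib.load("tfidf_vectorizer.pkl")
onehot_encoder = joblib.load("onehot_encoder.pkl")
label_encoder = joblib.load("label_encoder.pkl")
view_model = joblib.load("view_prediction_model.pkl")
age_model = joblib.load("age_prediction_model.pkl")
text_matrix = joblib.load("text_features_matrix.pkl")
article_mapping = joblib.load("article_mapping.pkl")


def predict_article(title: str, content: str, category: str) -> tuple[float, str]:
    """Score one article: (predicted total views, predicted primary age group)."""
    X_text = vectorizer.transform([f"{title} {content}".strip()])
    X_cat = onehot_encoder.transform([[category]])
    X = hstack([X_text, X_cat]).tocsr()
    views = max(float(view_model.predict(X)[0]), 0.0)
    age_group = label_encoder.inverse_transform(age_model.predict(X))[0]
    return views, age_group


def similar_articles(title: str, content: str, top_k: int = 3) -> list[str]:
    """Nearest articles by TF-IDF cosine similarity against the saved matrix."""
    X_text = vectorizer.transform([f"{title} {content}".strip()])
    scores = cosine_similarity(X_text, text_matrix).ravel()
    return article_mapping.iloc[scores.argsort()[::-1][:top_k]]["title"].tolist()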
view_prediction_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e9b2aa61b6a5c09b71cd709a7e1c550511078c1c887ea9d6ac4e7afc11fa9b37
size 502931
wsgi.py
ADDED
@@ -0,0 +1,9 @@
"""WSGI entrypoint for production servers.

Expose the Flask application as ``application`` so Gunicorn or other WSGI-compatible
servers can import it via ``wsgi:application``.
"""
from app import app as application

if __name__ == "__main__":  # pragma: no cover
    application.run()
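For a local smoke test of this entrypoint, something like `gunicorn --bind 0.0.0.0:7860 wsgi:application` should serve the app the same way production does (7860 being the port Hugging Face Spaces expects by default), while `python wsgi.py` falls back to Flask's development server via `application.run()`.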