Spaces:
Runtime error
Runtime error
updates
Browse files- dvc.yaml +19 -11
- src/data/process_data.py +9 -2
dvc.yaml
CHANGED
|
@@ -1,4 +1,22 @@
|
|
| 1 |
stages:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
train:
|
| 3 |
cmd: python src/models/train_model.py
|
| 4 |
deps:
|
|
@@ -20,14 +38,4 @@ stages:
|
|
| 20 |
metrics:
|
| 21 |
- reports/metrics.txt:
|
| 22 |
cache: false
|
| 23 |
-
|
| 24 |
-
cmd: python src/data/make_dataset.py
|
| 25 |
-
deps:
|
| 26 |
-
- src/data/make_dataset.py
|
| 27 |
-
outs:
|
| 28 |
-
- data/processed/test.csv:
|
| 29 |
-
persist: true
|
| 30 |
-
- data/processed/train.csv:
|
| 31 |
-
persist: true
|
| 32 |
-
- data/processed/validation.csv:
|
| 33 |
-
persist: true
|
|
|
|
| 1 |
stages:
|
| 2 |
+
create_data:
  # Builds the raw dataset. The script is invoked through the Python
  # interpreter, like every other stage in this pipeline, so it does not
  # depend on the file being executable or carrying a shebang line.
  cmd: python src/data/make_dataset.py
  deps:
    - src/data/make_dataset.py
  outs:
    # persist: true keeps the existing output in place when the stage is re-run.
    - data/raw:
        persist: true
|
| 9 |
+
process_data:
  # Derives the processed train/test/validation CSV files from the raw data.
  cmd: python src/data/process_data.py
  deps:
    - src/data/process_data.py
    # Output of the create_data stage. Declaring it lets DVC run create_data
    # first and re-run this stage whenever the raw data changes.
    # NOTE(review): the script reads 'summarization/data/raw/<split>.csv';
    # confirm that resolves to this path from the stage's working directory.
    - data/raw
  outs:
    # persist: true keeps the existing outputs in place when the stage is re-run.
    - data/processed/test.csv:
        persist: true
    - data/processed/train.csv:
        persist: true
    - data/processed/validation.csv:
        persist: true
|
| 20 |
train:
|
| 21 |
cmd: python src/models/train_model.py
|
| 22 |
deps:
|
|
|
|
| 38 |
metrics:
|
| 39 |
- reports/metrics.txt:
|
| 40 |
cache: false
|
| 41 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/data/process_data.py
CHANGED
|
@@ -1,8 +1,15 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
|
|
|
|
| 3 |
def process_data(split='train'):
    """Extract the ``article`` and ``highlights`` columns of one raw split.

    Reads ``summarization/data/raw/<split>.csv`` and writes the two columns
    to ``summarization/data/processed/<split>.csv``. Both paths are relative
    to the current working directory, and the destination directory must
    already exist.

    Args:
        split: Base name (without ``.csv``) of the file to process,
            e.g. ``'train'``.

    Raises:
        FileNotFoundError: If the raw CSV file does not exist.
        KeyError: If the raw file lacks an ``article`` or ``highlights``
            column.
    """
    # pandas has no ``load_csv``; ``read_csv`` is the CSV reader.
    dataset = pd.read_csv('summarization/data/raw/{}.csv'.format(split))
    df = pd.DataFrame()
    df['article'] = dataset['article']
    df['highlights'] = dataset['highlights']
    # The row index is written as the first (unnamed) column, which is the
    # ``to_csv`` default.
    df.to_csv('summarization/data/processed/{}.csv'.format(split))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
|
| 3 |
+
|
| 4 |
def process_data(split='train'):
    """Copy the ``article`` and ``highlights`` columns of a raw split.

    The input is ``summarization/data/raw/<split>.csv`` and the output is
    ``summarization/data/processed/<split>.csv``, both resolved against the
    current working directory. The output directory is not created here and
    must already exist.

    Args:
        split: Name of the split to process (the CSV file name without its
            extension), e.g. ``'train'``.

    Raises:
        FileNotFoundError: If the raw CSV file is missing.
        KeyError: If the raw CSV has no ``article`` or ``highlights`` column.
    """
    # ``pd.load_csv`` does not exist and raised AttributeError; the pandas
    # CSV reader is ``read_csv``.
    dataset = pd.read_csv('summarization/data/raw/{}.csv'.format(split))
    df = pd.DataFrame()
    df['article'] = dataset['article']
    df['highlights'] = dataset['highlights']
    # ``to_csv`` keeps its default of also writing the row index as the
    # first column.
    df.to_csv('summarization/data/processed/{}.csv'.format(split))
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Script entry point: process every dataset split when the file is executed
# directly. The guard compares against '__main__'; the previous comparison
# with '__name__' was never true, so running the script did nothing.
if __name__ == '__main__':
    for split in ('train', 'test', 'validation'):
        process_data(split=split)
|