[#1] fetch_alpha implemented
Browse files
- explore/explore_fetch_alpha.py +10 -0
- explore/explore_fetch_alpha_predict.py +19 -0
- explore/explore_fetch_epie.py +0 -27
- idiomify/fetchers.py +17 -2
- idiomify/paths.py +0 -4
explore/explore_fetch_alpha.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from idiomify.fetchers import fetch_alpha
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def main():
|
| 5 |
+
model = fetch_alpha("overfit")
|
| 6 |
+
print(model.bart.config)
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
if __name__ == '__main__':
|
| 10 |
+
main()
|
explore/explore_fetch_alpha_predict.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import BartTokenizer
|
| 2 |
+
from builders import SourcesBuilder
|
| 3 |
+
from fetchers import fetch_alpha
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def main():
|
| 7 |
+
model = fetch_alpha("overfit")
|
| 8 |
+
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
|
| 9 |
+
lit2idi = [
|
| 10 |
+
("my man", ""),
|
| 11 |
+
("hello", "")
|
| 12 |
+
] # just some dummy stuff
|
| 13 |
+
srcs = SourcesBuilder(tokenizer)(lit2idi)
|
| 14 |
+
out = model.predict(srcs=srcs)
|
| 15 |
+
print(out)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
if __name__ == '__main__':
|
| 19 |
+
main()
|
explore/explore_fetch_epie.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
|
| 2 |
-
from idiomify.fetchers import fetch_epie
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
def main():
|
| 6 |
-
epie = fetch_epie()
|
| 7 |
-
idioms = set([
|
| 8 |
-
idiom
|
| 9 |
-
for idiom, _, _ in epie
|
| 10 |
-
])
|
| 11 |
-
|
| 12 |
-
# so, what do you want? you want to build an idiom-masked language modeling?
|
| 13 |
-
for idiom, context, tag in epie:
|
| 14 |
-
print(idiom, context)
|
| 15 |
-
|
| 16 |
-
for idx, idiom in enumerate(idioms):
|
| 17 |
-
print(idx, idiom)
|
| 18 |
-
|
| 19 |
-
# isn't it better to just leave the idiom there, and have it guess what meaning it has?
|
| 20 |
-
# in that case, It may be better to use a generative model?
|
| 21 |
-
# but what would happen if you let it... just guess it?
|
| 22 |
-
# the problem with non-masking is that ... you give the model the answer.
|
| 23 |
-
# what you should rather do is... do something like... find similar words.
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
if __name__ == '__main__':
|
| 27 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
idiomify/fetchers.py
CHANGED
|
@@ -5,7 +5,7 @@ import wandb
|
|
| 5 |
import requests
|
| 6 |
from typing import Tuple, List
|
| 7 |
from wandb.sdk.wandb_run import Run
|
| 8 |
-
from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic
|
| 9 |
from idiomify.urls import (
|
| 10 |
EPIE_IMMUTABLE_IDIOMS_URL,
|
| 11 |
EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL,
|
|
@@ -15,9 +15,10 @@ from idiomify.urls import (
|
|
| 15 |
EPIE_MUTABLE_IDIOMS_TAGS_URL,
|
| 16 |
PIE_URL
|
| 17 |
)
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
-
# sources for dataset
|
| 21 |
def fetch_epie(ver: str) -> List[Tuple[str, str, str]]:
|
| 22 |
"""
|
| 23 |
It fetches the EPIE idioms, contexts, and tags from the web
|
|
@@ -85,6 +86,20 @@ def fetch_literal2idiomatic(ver: str, run: Run = None) -> List[Tuple[str, str]]:
|
|
| 85 |
return [(row[0], row[1]) for row in reader]
|
| 86 |
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
def fetch_config() -> dict:
|
| 89 |
with open(str(CONFIG_YAML), 'r', encoding="utf-8") as fh:
|
| 90 |
return yaml.safe_load(fh)
|
|
|
|
| 5 |
import requests
|
| 6 |
from typing import Tuple, List
|
| 7 |
from wandb.sdk.wandb_run import Run
|
| 8 |
+
from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic, alpha_dir
|
| 9 |
from idiomify.urls import (
|
| 10 |
EPIE_IMMUTABLE_IDIOMS_URL,
|
| 11 |
EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL,
|
|
|
|
| 15 |
EPIE_MUTABLE_IDIOMS_TAGS_URL,
|
| 16 |
PIE_URL
|
| 17 |
)
|
| 18 |
+
from transformers import AutoModelForSeq2SeqLM, AutoConfig
|
| 19 |
+
from models import Alpha
|
| 20 |
|
| 21 |
|
|
|
|
| 22 |
def fetch_epie(ver: str) -> List[Tuple[str, str, str]]:
|
| 23 |
"""
|
| 24 |
It fetches the EPIE idioms, contexts, and tags from the web
|
|
|
|
| 86 |
return [(row[0], row[1]) for row in reader]
|
| 87 |
|
| 88 |
|
| 89 |
+
def fetch_alpha(ver: str, run: Run = None) -> Alpha:
|
| 90 |
+
if run:
|
| 91 |
+
artifact = run.use_artifact(f"alpha:{ver}", type="model")
|
| 92 |
+
else:
|
| 93 |
+
artifact = wandb.Api().artifact(f"eubinecto/idiomify/alpha:{ver}", type="model")
|
| 94 |
+
config = artifact.metadata
|
| 95 |
+
artifact_dir = artifact.download(root=alpha_dir(ver))
|
| 96 |
+
ckpt_path = path.join(artifact_dir, "model.ckpt")
|
| 97 |
+
bart = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained(config['bart']))
|
| 98 |
+
with open(ckpt_path, 'r') as fh:
|
| 99 |
+
alpha = Alpha.load_from_checkpoint(ckpt_path, bart=bart)
|
| 100 |
+
return alpha
|
| 101 |
+
|
| 102 |
+
|
| 103 |
def fetch_config() -> dict:
|
| 104 |
with open(str(CONFIG_YAML), 'r', encoding="utf-8") as fh:
|
| 105 |
return yaml.safe_load(fh)
|
idiomify/paths.py
CHANGED
|
@@ -15,7 +15,3 @@ def literal2idiomatic(ver: str) -> Path:
|
|
| 15 |
|
| 16 |
def alpha_dir(ver: str) -> Path:
|
| 17 |
return ARTIFACTS_DIR / f"alpha_{ver}"
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def gamma_dir(ver: str) -> Path:
|
| 21 |
-
return ARTIFACTS_DIR / f"beta_{ver}"
|
|
|
|
| 15 |
|
| 16 |
def alpha_dir(ver: str) -> Path:
|
| 17 |
return ARTIFACTS_DIR / f"alpha_{ver}"
|
|
|
|
|
|
|
|
|
|
|
|