Spaces:
Sleeping
Sleeping
| import json | |
| import string | |
| import numpy as np | |
| import pandas as pd | |
| from tqdm import tqdm | |
| import torch | |
| import spacy | |
| from transformers import AutoTokenizer, AutoModel | |
| from adapters import AutoAdapterModel | |
def restore_inverted_abstract(inverted_abstr):
    """Rebuild plain abstract text from a word -> positions inverted index.

    Args:
        inverted_abstr: Mapping of each word to the list of integer positions
            at which that word occurs in the abstract.

    Returns:
        The words joined by single spaces in position order, or ``None`` when
        the index holds no positions at all. Positions that no word claims
        stay empty strings, so they appear as consecutive spaces.
    """
    positions = [pos for occurrences in inverted_abstr.values() for pos in occurrences]
    if not positions:
        return None

    # The highest position determines how many word slots are needed.
    slots = [""] * (max(positions) + 1)
    for token, occurrences in inverted_abstr.items():
        for pos in occurrences:
            slots[pos] = token
    return " ".join(slots)
def extract_title_abstract(oa_object):
    """Reduce a work record to its title and plain-text abstract.

    Args:
        oa_object: Dict with a ``"title"`` key and an
            ``"abstract_inverted_index"`` key (the latter may be ``None``).

    Returns:
        ``{"title": ..., "abstract": ...}`` where the abstract is rebuilt with
        ``restore_inverted_abstract`` or is ``None`` when no index is present.
    """
    inverted_index = oa_object["abstract_inverted_index"]
    if inverted_index is None:
        abstract_text = None
    else:
        abstract_text = restore_inverted_abstract(inverted_index)
    return {"title": oa_object["title"], "abstract": abstract_text}
def preprocess_batch(batch, tokenizer, input_is_context=False):
    """Tokenize a batch of papers, or of raw context strings, for the encoder.

    Args:
        batch: When ``input_is_context`` is false, a list of paper dicts, e.g.
            ``[{'title': 'BERT', 'abstract': 'We introduce a new language
            representation model called BERT'}, {'title': 'Attention is all
            you need', 'abstract': 'The dominant sequence transduction models
            are based on complex recurrent or convolutional neural
            networks'}]``. The abstract may be missing or ``None``; the title
            may be ``None``. When ``input_is_context`` is true, the batch is
            handed to the tokenizer unchanged.
        tokenizer: Callable tokenizer that also exposes ``sep_token``.
        input_is_context: True when ``batch`` already holds plain text.

    Returns:
        The tokenizer's output for the batch, requested with padding,
        truncation to 512 tokens, PyTorch tensors and no token type ids.
    """
    texts = batch
    if not input_is_context:
        # Concatenate title and abstract around the separator token; empty or
        # None fields contribute an empty string.
        texts = [
            (paper['title'] or '') + tokenizer.sep_token + (paper.get('abstract') or '')
            for paper in batch
        ]
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_token_type_ids=False,
        max_length=512,
    )
def sent_is_mostly_known_tokens(tokens, tokenizer, threshold=0.7):
    """Tell whether at least ``threshold`` of ``tokens`` are in the tokenizer vocabulary.

    Args:
        tokens: Tokens forwarded to ``get_fraction_of_known_tokens``.
        tokenizer: Tokenizer forwarded to ``get_fraction_of_known_tokens``.
        threshold: Minimum known-token fraction (inclusive).

    Returns:
        ``True`` when the known-token fraction is ``>= threshold``.
    """
    known_share = get_fraction_of_known_tokens(tokens, tokenizer)
    return known_share >= threshold
def get_fraction_of_known_tokens(tokens, tokenizer):
    """Return the share of ``tokens`` whose cleaned text is in the tokenizer vocabulary.

    A token counts as known when its text, lower-cased and stripped of
    leading/trailing ASCII punctuation, is a key of ``tokenizer.vocab``.

    Args:
        tokens: Sized iterable of objects with a ``text`` attribute
            (``prune_contexts`` passes a spaCy sentence span).
        tokenizer: Object exposing a ``vocab`` container of token strings.

    Returns:
        A float between 0 and 1. Empty input yields ``0.0`` rather than the
        bool ``False``, so callers that store the result alongside real
        fractions always get one numeric type.
    """
    total_tokens = len(tokens)
    if total_tokens == 0:
        return 0.0  # Avoid division by zero

    # Read the vocabulary once: on some tokenizers ``vocab`` is a computed
    # property, and re-reading it for every token is needlessly expensive.
    vocab = tokenizer.vocab
    known_tokens = sum(
        1
        for token in tokens
        if token.text.lower().strip(string.punctuation) in vocab
    )
    return known_tokens / total_tokens
def prune_contexts(contexts, spacy_model, tokenizer):
    """Cut every citation context down to the sentence containing the citation.

    For each row the left context, mention and right context are joined
    (newlines replaced by spaces, so character offsets are preserved) and
    split into sentences by ``spacy_model``. The sentence covering the
    character one past the end of the left context is kept, prefixed with the
    sentence immediately before it.

    Args:
        contexts: DataFrame with ``left_context``, ``mention`` and
            ``right_context`` string columns.
        spacy_model: Callable returning a document with ``sents`` and ``text``.
        tokenizer: Tokenizer forwarded to ``get_fraction_of_known_tokens``.

    Returns:
        ``(chosen_sents, fractions)``, two lists with one entry per row. A
        chosen sentence is ``None`` when no sentence covers the citation or
        the result has fewer than five whitespace-separated words. The
        fraction is computed on the citing sentence only and is kept even when
        the text is rejected as too short; it is ``None`` only when no
        sentence matched.
    """
    pruned = []
    known_fractions = []
    for _, row in tqdm(contexts.iterrows(), total=len(contexts)):
        left = row["left_context"]
        full_text = (left + row["mention"] + row["right_context"]).replace("\n", " ")
        # One character past the left context, i.e. just inside the mention.
        anchor = len(left) + 1
        doc = spacy_model(full_text)

        selected = None
        fraction = None
        preceding = ""
        for sentence in doc.sents:
            if sentence.start_char <= anchor < sentence.end_char:
                # NOTE(review): the previous and citing sentences are joined
                # with no separator in between - confirm this is intended.
                selected = preceding + sentence.text
                fraction = get_fraction_of_known_tokens(sentence, tokenizer)
                break
            preceding = sentence.text

        if selected is None or len(selected.split()) < 5:
            print(f" - no context found: {doc.text}")
            selected = None

        pruned.append(selected)
        known_fractions.append(fraction)
    return pruned, known_fractions
def embed_contexts(contexts, model, tokenizer, batch_size=16):
    """Embed plain-text citation contexts in batches.

    Args:
        contexts: List of context strings.
        model: Encoder passed to ``embed_batch``.
        tokenizer: Tokenizer passed to ``preprocess_batch``.
        batch_size: Number of contexts tokenized and encoded per step.

    Returns:
        One tensor holding the per-batch embeddings concatenated along the
        first axis, in input order.

    Raises:
        Exception: Whatever ``preprocess_batch`` raises is re-raised after the
            failing slice has been reported. The previous handler dropped into
            a debugger and then carried on with ``inputs`` unbound or left
            over from the preceding batch.
        RuntimeError: Presumably from ``torch.cat`` when ``contexts`` is empty
            and there is nothing to concatenate.
    """
    embeddings = []
    # Process in batches
    with torch.no_grad():  # Disable gradient tracking to save memory
        for i in tqdm(range(0, len(contexts), batch_size)):
            batch = contexts[i:i + batch_size]
            try:
                inputs = preprocess_batch(batch, tokenizer, input_is_context=True)
            except Exception as e:
                # Say which slice broke, then propagate: continuing would embed
                # stale inputs and silently misalign the output rows.
                print(f"failed to tokenize contexts [{i}:{i + len(batch)}]: {e}")
                raise
            embeddings.append(embed_batch(inputs, model))
    # Concatenate all batches back together
    return torch.cat(embeddings, dim=0)
def embed_batch(tokenized_batch, model):
    """Run the model on a tokenized batch and return one vector per entry.

    Args:
        tokenized_batch: Mapping of keyword arguments accepted by ``model``.
        model: Callable whose result exposes ``last_hidden_state``.

    Returns:
        ``last_hidden_state[:, 0, :]``: index 0 along the second axis for
        every entry of the batch (presumably the first-token position of each
        sequence).
    """
    model_output = model(**tokenized_batch)
    hidden_states = model_output.last_hidden_state
    # Keep only position 0 of every sequence as that entry's embedding.
    return hidden_states[:, 0, :]
def embed_abstracts(abstract_title_list, model, tokenizer, batch_size=16):
    """Embed title/abstract records in batches.

    Args:
        abstract_title_list: List of dicts in the format expected by
            ``preprocess_batch`` (``title`` plus optional ``abstract``).
        model: Encoder passed to ``embed_batch``.
        tokenizer: Tokenizer passed to ``preprocess_batch``.
        batch_size: Number of records tokenized and encoded per step.

    Returns:
        One tensor holding the per-batch embeddings concatenated along the
        first axis, in input order.
    """
    print("Loaded specter2 model:")
    # Gradient tracking is disabled to save memory while encoding.
    with torch.no_grad():
        per_batch = [
            embed_batch(
                preprocess_batch(abstract_title_list[start:start + batch_size], tokenizer),
                model,
            )
            for start in tqdm(range(0, len(abstract_title_list), batch_size))
        ]
    # Stitch the batches back together in order.
    return torch.cat(per_batch, dim=0)
def calculate_distances(embeddings_a, embeddings_b, indices, batch_size=512):
    """Compute the L2 distance between paired rows of two embedding tables.

    Args:
        embeddings_a: Indexable table of vectors for the left side.
        embeddings_b: Indexable table of vectors for the right side.
        indices: Sequence of ``(a, b)`` pairs; ``a`` indexes ``embeddings_a``
            and ``b`` indexes ``embeddings_b``. A pair containing ``None`` has
            no distance.
        batch_size: Number of pairs handled per vectorized step.

    Returns:
        A list aligned with ``indices``: the Euclidean distance for each
        complete pair (computed in float32, stored as a 64-bit float) and
        ``nan`` for pairs containing ``None``.
    """
    total = len(indices)
    results = [None] * total

    for chunk_start in range(0, total, batch_size):
        rows_a, rows_b, slots = [], [], []
        for offset, (a, b) in enumerate(indices[chunk_start:chunk_start + batch_size]):
            slot = chunk_start + offset
            if a is None or b is None:
                results[slot] = np.nan  # Incomplete pair: mark in place
                continue
            rows_a.append(embeddings_a[a])
            rows_b.append(embeddings_b[b])
            slots.append(slot)

        if not slots:
            continue

        tensor_a = torch.from_numpy(np.array(rows_a)).float()
        tensor_b = torch.from_numpy(np.array(rows_b)).float()
        # Row-wise Euclidean norm of the difference.
        chunk_distances = torch.norm(tensor_a - tensor_b, p=2, dim=1).numpy().astype(float)
        # Write each distance back to the position of its pair.
        for position, value in zip(slots, chunk_distances):
            results[position] = value

    return results
def _first_positions(values):
    """Map each value to the position of its first occurrence in ``values``."""
    first_seen = {}
    for position, value in enumerate(values):
        first_seen.setdefault(value, position)
    return first_seen


def add_distances_to_df(df, index_left, index_right, embeddings, column_name):
    """Add a column of L2 distances between paired embeddings to ``df``.

    Two column names are recognised:

    * ``"abstract_abstract_l2_distance"``: pairs ``embeddings["original_abstract"]``
      (row looked up via ``df["cited_in_doi"]`` in ``index_left``) with
      ``embeddings["citation_abstract"]`` (row looked up via
      ``df["citation_id"]`` in ``index_right``).
    * ``"context_abstract_l2_distance"``: pairs
      ``embeddings["citation_context_base"]`` (row looked up via the row's
      position in ``df`` in ``index_left``) with
      ``embeddings["citation_abstract"]``. Rows whose position is not in
      ``index_left`` get ``nan``.

    Any other ``column_name`` leaves ``df`` untouched.

    Args:
        df: Citation-mention DataFrame; modified in place.
        index_left: List whose i-th entry identifies row i of the left
            embedding table. Entries must be hashable.
        index_right: List whose i-th entry identifies row i of
            ``embeddings["citation_abstract"]``. Entries must be hashable.
        embeddings: Mapping from embedding name to an indexable table.
        column_name: Name of the distance column to create.

    Returns:
        ``df`` itself.

    Raises:
        ValueError: When a key that must be present is missing from
            ``index_left`` / ``index_right`` (same exception type that
            ``list.index`` raised before).
    """
    if column_name == "abstract_abstract_l2_distance":
        left_key = "original_abstract"
    elif column_name == "context_abstract_l2_distance":
        left_key = "citation_context_base"
    else:
        return df

    # One dict per side replaces a linear ``list.index`` scan per row.
    left_positions = _first_positions(index_left)
    right_positions = _first_positions(index_right)

    def locate(table, key):
        try:
            return table[key]
        except KeyError:
            raise ValueError(f"{key!r} is not in list") from None

    if column_name == "abstract_abstract_l2_distance":
        indices = [
            (locate(left_positions, doi), locate(right_positions, cite_id))
            for doi, cite_id in zip(df["cited_in_doi"], df["citation_id"])
        ]
    else:
        # NOTE(review): ``i`` is the row's position, while the visible caller
        # passes index labels as ``index_left``; the two agree only for a
        # default 0..n-1 index - confirm.
        indices = [
            (left_positions[i], locate(right_positions, cite_id))
            if i in left_positions else (None, None)
            for i, cite_id in enumerate(df["citation_id"])
        ]

    print("calculate distances...")
    df[column_name] = calculate_distances(
        embeddings[left_key], embeddings["citation_abstract"], indices
    )
    return df
def add_pruned_contexts_to_df(df, df_name):
    """Add pruned citation contexts to ``df`` and write it to ``df_name``.

    Loads the ``allenai/specter2_base`` tokenizer and the ``en_core_web_lg``
    spaCy pipeline, runs ``prune_contexts`` over ``df`` and stores the results
    in the ``pruned_contexts`` and ``known_tokens_fraction`` columns.

    Args:
        df: Citation-mention DataFrame; modified in place.
        df_name: Path of the gzip-compressed parquet file to write.
    """
    specter_tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
    spacy_pipeline = spacy.load("en_core_web_lg")
    pruned, fractions = prune_contexts(df, spacy_pipeline, specter_tokenizer)
    df["pruned_contexts"] = pruned
    df["known_tokens_fraction"] = fractions
    df.to_parquet(df_name, compression='gzip')
def main_specter(retracted, overwrite=True):
    """Compute (or reuse) SPECTER2 embeddings and save them in one ``.npz`` archive.

    Four arrays are written:

    * ``original_abstract``: title+abstract of every paper in the source table.
    * ``citation_abstract``: title+abstract of every unique ``citation_id`` in
      the citation-mention table.
    * ``citation_context_base``: pruned citation contexts, encoded before any
      adapter is loaded.
    * ``citation_context``: the same contexts, encoded after activating the
      ``allenai/specter2_adhoc_query`` adapter.

    Only contexts with ``known_tokens_fraction >= 0.7`` and a non-null
    ``pruned_contexts`` value are embedded.

    Args:
        retracted: Truthy selects the retraction input/output files, falsy the
            reference ("most cited") ones.
        overwrite: When false, arrays already present in the existing archive
            are reused and only the missing ones are computed; a missing
            archive means everything is computed. When true, everything is
            recomputed.
    """
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
    model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
    # model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)

    archive_path = f'{("retractions" if retracted else "reference")}_embeddings_specter.npz'

    # Copy cached arrays into a plain dict so the archive handle is closed
    # before the same file is rewritten further down.
    cached = {}
    if not overwrite:
        try:
            with np.load(archive_path) as archive:
                cached = {key: archive[key] for key in archive.files}
        except FileNotFoundError:
            print(f"no cached embeddings found at {archive_path}; computing all of them")

    ## Paper abstracts
    if retracted:
        papers = pd.read_parquet("25_02_05_retractions_abstracts_cleaned.gzip")
    else:
        # papers = pd.read_parquet("24_11_30_reference_articles.gzip")
        papers = pd.read_parquet("24_12_31_reference_articles_most_cited.gzip")

    print("embedding original abstracts...")
    if "original_abstract" in cached:
        paper_abstract_embedding = cached["original_abstract"]
    else:
        paper_abstract_embedding = embed_abstracts(
            [
                {"title": r["Title"], "abstract": r["Abstract"]}
                for _, r in papers.iterrows()
            ],
            model,
            tokenizer,
            batch_size=4,
        ).detach().numpy()

    ## Cited papers abstracts
    if retracted:
        citations_df_name = "retraction_citation_mentions.gzip"
        citations_json_name = "retractions_citations.json"
    else:
        citations_df_name = "reference_mc_citation_mentions.gzip"
        # citations_json_name = "reference_citations.json"
        citations_json_name = "reference_most_cited_citations.json"
    with open(citations_json_name, encoding="utf-8") as jsonfile:
        cite_data = json.load(jsonfile)
    citations = pd.read_parquet(citations_df_name)
    # Flatten {paper: [cited work, ...]} into {cited work id: cited work}.
    cite_data = {entry["id"]: entry for cite in cite_data.values() for entry in cite}

    def usable_contexts():
        # Evaluated lazily so the filter columns are only required when
        # context embeddings actually have to be computed.
        keep = (
            (citations["known_tokens_fraction"] >= 0.7) &
            (~citations["pruned_contexts"].isna())
        )
        return citations[keep]["pruned_contexts"].to_list()

    print("embedding cited abstracts...")
    if "citation_abstract" in cached:
        citation_abstract_embedding = cached["citation_abstract"]
    else:
        citation_abstract_embedding = embed_abstracts(
            [
                extract_title_abstract(cite_data[cite])
                for cite in citations["citation_id"].unique()
            ],
            model,
            tokenizer,
            batch_size=4,
        ).detach().numpy()

    print("embedding citation contexts base...")
    if "citation_context_base" in cached:
        citation_context_embedding_base = cached["citation_context_base"]
    else:
        citation_context_embedding_base = embed_contexts(
            usable_contexts(),
            model,
            tokenizer,
        ).detach().numpy()

    print("embedding citation contexts...")
    if "citation_context" in cached:
        citation_context_embedding = cached["citation_context"]
    else:
        # The adapter is activated only now, so the "base" embeddings above
        # come from the model without it.
        model.load_adapter("allenai/specter2_adhoc_query", source="hf", load_as="adhoc", set_active=True)
        citation_context_embedding = embed_contexts(
            usable_contexts(),
            model,
            tokenizer,
        ).detach().numpy()

    # Save
    np.savez(
        archive_path,
        original_abstract=paper_abstract_embedding,
        citation_context=citation_context_embedding,
        citation_abstract=citation_abstract_embedding,
        citation_context_base=citation_context_embedding_base,
    )

    # Load the archive back as a sanity check; each array holds one row per
    # embedded item.
    with np.load(archive_path) as saved:
        for key in (
            "original_abstract",
            "citation_context",
            "citation_context_base",
            "citation_abstract",
        ):
            print(saved[key].shape)
def main_scibert(retracted, overwrite=True):
    """Compute (or reuse) SciBERT embeddings and save them in one ``.npz`` archive.

    Three arrays are written:

    * ``original_abstract``: title+abstract of every paper in the source table.
    * ``citation_abstract``: title+abstract of every unique ``citation_id`` in
      the citation-mention table.
    * ``citation_context``: pruned citation contexts.

    Only contexts with ``known_tokens_fraction >= 0.7`` and a non-null
    ``pruned_contexts`` value are embedded.

    Args:
        retracted: Truthy selects the retraction input/output files, falsy the
            reference ("most cited") ones.
        overwrite: When false, arrays already present in the existing archive
            are reused and only the missing ones are computed; a missing
            archive means everything is computed. When true, everything is
            recomputed.
    """
    tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

    archive_path = f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz'

    # Copy cached arrays into a plain dict so the archive handle is closed
    # before the same file is rewritten further down.
    cached = {}
    if not overwrite:
        try:
            with np.load(archive_path) as archive:
                cached = {key: archive[key] for key in archive.files}
        except FileNotFoundError:
            print(f"no cached embeddings found at {archive_path}; computing all of them")

    ## Paper abstracts
    if retracted:
        papers = pd.read_parquet("25_02_05_retractions_abstracts_cleaned.gzip")
    else:
        # papers = pd.read_parquet("24_11_30_reference_articles.gzip")
        papers = pd.read_parquet("24_12_31_reference_articles_most_cited.gzip")

    print("embedding original abstracts...")
    if "original_abstract" in cached:
        paper_abstract_embedding = cached["original_abstract"]
    else:
        paper_abstract_embedding = embed_abstracts(
            [
                {"title": r["Title"], "abstract": r["Abstract"]}
                for _, r in papers.iterrows()
            ],
            model,
            tokenizer,
            batch_size=4,
        ).detach().numpy()

    ## Cited papers abstracts
    if retracted:
        citations_df_name = "retraction_citation_mentions.gzip"
        citations_json_name = "retractions_citations.json"
    else:
        citations_df_name = "reference_mc_citation_mentions.gzip"
        # citations_json_name = "reference_citations.json"
        citations_json_name = "reference_most_cited_citations.json"
    with open(citations_json_name, encoding="utf-8") as jsonfile:
        cite_data = json.load(jsonfile)
    citations = pd.read_parquet(citations_df_name)
    # Flatten {paper: [cited work, ...]} into {cited work id: cited work}.
    cite_data = {entry["id"]: entry for cite in cite_data.values() for entry in cite}

    print("embedding cited abstracts...")
    if "citation_abstract" in cached:
        citation_abstract_embedding = cached["citation_abstract"]
    else:
        citation_abstract_embedding = embed_abstracts(
            [
                extract_title_abstract(cite_data[cite])
                for cite in citations["citation_id"].unique()
            ],
            model,
            tokenizer,
            batch_size=4,
        ).detach().numpy()

    print("embedding citation contexts...")
    if "citation_context" in cached:
        citation_context_embedding = cached["citation_context"]
    else:
        keep = (
            (citations["known_tokens_fraction"] >= 0.7) &
            (~citations["pruned_contexts"].isna())
        )
        citation_context_embedding = embed_contexts(
            citations[keep]["pruned_contexts"].to_list(),
            model,
            tokenizer,
        ).detach().numpy()

    # Save
    np.savez(
        archive_path,
        original_abstract=paper_abstract_embedding,
        citation_context=citation_context_embedding,
        citation_abstract=citation_abstract_embedding,
    )

    # Load the archive back as a sanity check; each array holds one row per
    # embedded item. Only the keys saved above are printed: this archive has
    # no "citation_context_base" entry, so reading it raised KeyError.
    with np.load(archive_path) as saved:
        for key in ("original_abstract", "citation_context", "citation_abstract"):
            print(saved[key].shape)
if __name__=="__main__":
    # Script entry point: refresh the embeddings, then add the
    # context-to-abstract L2 distance column to the citation-mention table and
    # write the table back to the file it was read from.
    import sys

    # "retracted" as first argument selects the retraction files; anything
    # else, including no argument at all, selects the reference files.
    retracted = len(sys.argv) > 1 and sys.argv[1] == "retracted"
    if retracted:
        print("Running embedding pipeline for retractions.")
    else:
        print("Running embedding pipeline for reference.")

    mentions_path = f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip'
    df = pd.read_parquet(mentions_path)
    # add_pruned_contexts_to_df(df, mentions_path)
    main_scibert(retracted, overwrite=False)
    # main_specter(retracted, overwrite=False)

    # NOTE(review): the distances below are computed from the SPECTER archive
    # even though main_scibert is the pipeline run above - confirm intended.
    with np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz') as embeddings:
        # Each array holds one row per embedded item.
        print(embeddings["original_abstract"].shape)
        print(embeddings["citation_context"].shape)
        print(embeddings["citation_abstract"].shape)

        # Only needed by the disabled abstract-to-abstract call below.
        # original_dois = pd.read_parquet(f'{("25_02_05_retractions_abstracts_cleaned" if retracted else "24_11_30_reference_articles")}.gzip', columns = ["OriginalPaperDOI"])["OriginalPaperDOI"].tolist()
        original_dois = pd.read_parquet(f'{("25_02_05_retractions_abstracts_cleaned" if retracted else "24_12_31_reference_articles_most_cited")}.gzip', columns = ["OriginalPaperDOI"])["OriginalPaperDOI"].tolist()
        # df = add_distances_to_df(
        #     df,
        #     [doi.replace("https://doi.org/", "") for doi in original_dois],
        #     df["citation_id"].unique().tolist(),
        #     embeddings,
        #     "abstract_abstract_l2_distance"
        # )

        # The left index lists the rows that passed the same filter used when
        # the context embeddings were computed, in the same order.
        df = add_distances_to_df(
            df,
            df.index[
                (df["known_tokens_fraction"] >= 0.7) &
                (~df["pruned_contexts"].isna())
            ].tolist(),
            df["citation_id"].unique().tolist(),
            embeddings,
            "context_abstract_l2_distance"
        )

    df.to_parquet(mentions_path, compression='gzip')