Spaces:
Runtime error
Runtime error
| from keybert import KeyBERT | |
| import streamlit as st | |
| import streamlit.components.v1 as components | |
| from datasets import load_dataset | |
| import pandas as pd | |
# Page configuration and main title.
st.set_page_config(page_title="KeyBERT")
st.title("HF-KeyBERT A front end for KeyBERT")

# Credits for the author of this front end.
for credit_line in (
    "By Allen Roush",
    "github: https://github.com/Hellisotherpeople",
    "Linkedin: https://www.linkedin.com/in/allen-roush-27721011b/",
):
    st.caption(credit_line)

# Credits for the KeyBERT library itself; the logo is rendered between the
# byline and the profile links.
st.header("KeyBERT")
st.caption("By Maarten Grootendorst")
st.image(
    "https://raw.githubusercontent.com/MaartenGr/KeyBERT/master/images/logo.png",
    width=200,
)
for credit_line in (
    "github: https://github.com/MaartenGr",
    "Linkedin: https://www.linkedin.com/in/mgrootendorst/",
):
    st.caption(credit_line)
# Sidebar form that selects the document source and the embedding model.
form = st.sidebar.form("choose_settings")
form.header("Main Settings")

# NOTE: despite its name, `custom_doc` is True when the document comes from a
# Hugging Face dataset and False when the user types a document by hand.
custom_doc = form.checkbox("Use a document from an existing dataset?", value=True)
if custom_doc:
    # Text areas are multi-line, so surrounding whitespace/newlines are
    # stripped before the values are used as dataset/config/split/column names.
    dataset_name = form.text_area(
        "Enter the name of the huggingface Dataset to do analysis of:",
        value="Hellisotherpeople/DebateSum",
    ).strip()
    dataset_name_2 = form.text_area(
        "Enter the name of the config for the dataset if it has one",
        value="",
    ).strip()
    split_name = form.text_area(
        "Enter the name of the split of the dataset that you want to use",
        value="train",
    ).strip()
    # At least one record is required; zero records would leave nothing to
    # select the analysis column from.
    number_of_records = form.number_input(
        "Enter the number of documents that you want to analyze from the dataset",
        value=200,
        min_value=1,
    )
    column_name = form.text_area(
        "Enter the name of the column that we are doing analysis on (the X value)",
        value="Full-Document",
    ).strip()
    # Start/end are used later as a slice over the loaded column.
    index_to_analyze_start = form.number_input(
        "Enter the index start of the document that you want to analyze of the dataset",
        value=0,
    )
    index_to_analyze_end = form.number_input(
        "Enter the index end of the document that you want to analyze of the dataset",
        value=2,
    )
else:
    # Rendered with `st`, i.e. outside the sidebar form.
    doc = st.text_area("Enter a custom document")

model_name = form.text_area(
    "Enter the name of the pre-trained model from sentence transformers that we are using for featurization",
    value="all-MiniLM-L6-v2",
).strip()
form.caption("This will download a new model, so it may take awhile or even break if the model is too large")
form.caption("See the list of pre-trained models that are available here! https://www.sbert.net/docs/pretrained_models.html")
form.form_submit_button("Submit")
def load_and_process_data(path, name, streaming, split_name, number_of_records):
    """Load the first records of a dataset split and return one column of them.

    Args:
        path: Dataset identifier passed to ``load_dataset``.
        name: Dataset config name; an empty string is treated as "no config".
        streaming: Forwarded to ``load_dataset``. The split is consumed with
            ``.take()``, so this presumably needs to be True -- TODO confirm
            for non-streaming datasets.
        split_name: Key of the split to read from the loaded dataset.
        number_of_records: How many records to take from the start of the split.

    Returns:
        The ``column_name`` column of a DataFrame built from the taken records.

    Note:
        ``column_name`` is not a parameter; it is read from module scope and
        must be defined before this function is called.
    """
    # The config text box defaults to "", which is not a valid config name;
    # pass None instead so the dataset's default configuration is used.
    dataset = load_dataset(path=path, name=name or None, streaming=streaming)
    dataset_head = dataset[split_name].take(number_of_records)
    df = pd.DataFrame.from_dict(dataset_head)
    return df[column_name]
def load_model(model_name):
    """Return a KeyBERT instance constructed with ``model_name`` as its model."""
    return KeyBERT(model=model_name)
# Build the keyword extractor for the model chosen in the sidebar.
model = load_model(model_name=model_name)

if custom_doc:
    # NOTE(review): the original indentation was lost in extraction; all of the
    # display calls below are assumed to belong to this branch -- confirm.
    st.header("Original Dataset")
    df = load_and_process_data(
        dataset_name,
        dataset_name_2,
        True,  # streaming
        split_name,
        number_of_records,
    )
    # Keep only the requested slice of the column as the documents to analyze.
    selected_rows = df[index_to_analyze_start:index_to_analyze_end]
    doc = list(selected_rows)
    st.write(df)
    st.header("Indexed Documents")
    st.write(doc)
# Sidebar form with the parameters forwarded to KeyBERT's keyword extraction.
form2 = st.sidebar.form("KeyBERT Settings")
form2.header("KeyBERT Settings")

keyphrase_min = form2.number_input("KeyPhrase ngram range minimum", value=1, min_value=1)
keyphrase_max = form2.number_input("KeyPhrase ngram range maximum", value=2, min_value=1)
form2.caption("Use the keyphrase min and max to set the length of the resulting keywords/keyphrases")

use_maxsum = form2.checkbox("Use Max Sum Similarity?", value=False)
form2.caption("Max sum modifies the keyphrase algorithm in the following way: we take the 2 x top_n most similar words/phrases to the document. Then, we take all top_n combinations from the 2 x top_n words and extract the combination that are the least similar to each other by cosine similarity.")
nr_candidates = form2.number_input("Enter the number of candidates to consider if maxsum is True", value=10)
form2.caption("Only meaningful if Max Sum Similarity is selected")

use_mmr = form2.checkbox("Use Maximal Marginal Relevance?", value=False)
form2.caption("Maximal Marginal Relevance modifies the keyphrase algorithm in the following way: Instead of simply ranking the cosine similarity of the keyphrases to the document, keyphrases are also ranked against already selected keyphrases")
# Bounded to [0, 1]; presumably the range KeyBERT expects for diversity --
# confirm against the KeyBERT documentation.
diversity = form2.number_input("Enter the diversity", value=0.7, min_value=0.0, max_value=1.0)
form2.caption("Diversity only is meaningful if Maximal Marginal Relevance is turned on. This modifies how much the MMR algorithm weighs the results")

top_n = form2.number_input("Enter the number of returned keyphrases", value=10)

# `doc` is a list of documents in dataset mode but a single string in custom
# mode. len() of a string counts characters, and an empty string or empty
# list gives 0, which is below the default value of 1 and makes number_input
# raise. Count documents instead and never let the upper bound drop below 1.
document_count = len(doc) if isinstance(doc, (list, tuple)) else 1
min_df = form2.number_input(
    "Enter the minimum document frequency of a word",
    value=1,
    min_value=1,
    max_value=max(1, document_count),
)
form2.caption("Only meaningful if extracting the keyphrases of multiple documents")

seed_keywords = form2.text_area("Enter a list of keyword (separated with space) which will personalize/guide the extracted keywords", value="")
form2.caption("Due to the implementation details of this in KeyBERT, this doesn't usually heavily impact results")
form2.form_submit_button("Submit")
# An empty text box splits to [], and KeyBERT only skips keyword guidance when
# seed_keywords is None; pass None so an empty list is never treated as seeds.
seed_keyword_list = seed_keywords.split() or None

# Run KeyBERT on the selected document(s) with the sidebar settings.
keywords = model.extract_keywords(
    doc,
    keyphrase_ngram_range=(keyphrase_min, keyphrase_max),
    use_maxsum=use_maxsum,
    use_mmr=use_mmr,
    diversity=diversity,
    top_n=top_n,
    min_df=min_df,
    nr_candidates=nr_candidates,
    seed_keywords=seed_keyword_list,
)

st.header("Extracted Keywords/Keyphrases")
st.caption("Output is sorted in reverse order (so the final element is the strongest keyphrase and the first element is the nth strongest)")
st.caption("That means you should read from the bottom up")
st.write(keywords)