Spaces:

answerdotai
/

zotero-weekly

Sleeping

App Files Files Community

rbiswasfc commited on Sep 10, 2024

Commit

d2071ed

1 Parent(s): 75d46f7

cleanup

Browse files

Files changed (6) hide show

Dockerfile +4 -23
app.py +6 -1
main.py +0 -618
profile_app.py +0 -52
requirements.txt +1 -3
supervisord.conf +0 -20

Dockerfile CHANGED Viewed

@@ -1,32 +1,13 @@
 FROM python:3.10
-RUN useradd -m -u 1000 user
-USER user
-ENV HOME=/home/user \
-	PATH=/home/user/.local/bin:$PATH
-# Set the working directory
-WORKDIR $HOME/app
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-RUN git config --global credential.helper store
-COPY . .
-COPY supervisord.conf .
-# Set permissions on the log file
-USER root
-RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
 RUN mkdir -p /tmp/cache/
-RUN mkdir -p /.cache
 RUN chmod a+rwx -R /tmp/cache/
-RUN chmod a+rwx -R /.cache
 ENV HF_HUB_CACHE=HF_HOME
 ENV PYTHONUNBUFFERED=1 PORT=7860
-CMD ["python", "app.py"]
-# # Run supervisord
-# CMD ["supervisord", "-c", "supervisord.conf"]

 FROM python:3.10
+WORKDIR /code
+COPY --link --chown=1000 . .
 RUN mkdir -p /tmp/cache/
 RUN chmod a+rwx -R /tmp/cache/
 ENV HF_HUB_CACHE=HF_HOME
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
 ENV PYTHONUNBUFFERED=1 PORT=7860
+CMD ["python", "app.py"]

app.py CHANGED Viewed

@@ -159,4 +159,9 @@ def get(date: str):
         return Div(f"Error displaying articles: {str(e)}")
-serve()

         return Div(f"Error displaying articles: {str(e)}")
+# serve()
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))

main.py DELETED Viewed

@@ -1,618 +0,0 @@
-import os
-import re
-import shutil
-import time
-import dotenv
-import fitz  # PyMuPDF
-import pandas as pd
-import requests
-import schedule
-import srsly
-from bs4 import BeautifulSoup
-from datasets import Dataset, Image, concatenate_datasets, load_dataset
-from huggingface_hub import create_repo, login, whoami
-from PIL import Image as PILImage
-from retry import retry
-from tqdm.auto import tqdm
-dotenv.load_dotenv()
-login(token=os.environ.get("HF_TOKEN"))
-hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
-HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
-########################################################
-### GET ZOTERO ITEMS
-########################################################
-@retry(tries=3, delay=8)
-def _fetch_one_zotero_batch(url, headers, params):
-    """
-    Fetch articles from Zotero API
-    """
-    response = requests.get(url, headers=headers, params=params)
-    response.raise_for_status()
-    return response.json()
-def get_zotero_items(debug=False):
-    """
-    fetch items from zotero library
-    """
-    GROUP_ID = os.getenv("GROUP_ID")
-    API_KEY = os.getenv("API_KEY")
-    BASE_URL = f"https://api.zotero.org/groups/{GROUP_ID}/items"
-    LIMIT = 100
-    headers = {"Zotero-API-Key": API_KEY, "Content-Type": "application/json"}
-    items = []
-    start = 0
-    i = 1
-    while True:
-        i += 1
-        params = {"limit": LIMIT, "start": start}
-        page_items = _fetch_one_zotero_batch(BASE_URL, headers, params)
-        if not page_items:
-            break
-        items.extend(page_items)
-        start += LIMIT
-        print(f"# items fetched {len(items)}")
-        if debug:
-            if len(items) > 1600:
-                break
-    return items
-########################################################
-### EXTRACT ARXIV LINKS AND PDFs
-########################################################
-def get_arxiv_items(items):
-    visited = set()
-    arxiv_items = []
-    arxiv_pattern = re.compile(r"arxiv.org/abs/(\d+\.\d+)")
-    for item in items:
-        data = item.get("data", {})
-        attachments = item.get("links", {}).get("attachment", {})
-        arxiv_url = None
-        pdf_url = None
-        if "url" in data and "arxiv.org" in data["url"]:
-            arxiv_match = arxiv_pattern.search(data["url"])
-            if arxiv_match:
-                arxiv_url = data["url"]
-        if attachments:
-            pdf_url = attachments["href"]
-        if arxiv_url:
-            arxiv_id = arxiv_url.split("/")[-1]
-            if arxiv_id in visited:
-                continue
-            authors = []
-            for author in data.get("creators", []):
-                authors.append(f"{author.get('firstName', '')} {author.get('lastName', '')}")
-            arxiv_items.append(
-                {
-                    "arxiv_id": arxiv_id,
-                    "arxiv_url": arxiv_url,
-                    "title": data.get("title", ""),
-                    "authors": authors,
-                    "pdf_url": pdf_url,
-                    "date_published": data.get("date", ""),
-                    "added_by": item["meta"]["createdByUser"]["username"],
-                    "date_added": data.get("dateAdded", ""),
-                }
-            )
-            visited.add(arxiv_id)
-    return arxiv_items
-@retry(tries=3, delay=15, backoff=2)
-def fetch_arxiv_html(arxiv_id):
-    url = f"https://ar5iv.labs.arxiv.org/html/{arxiv_id.split('v')[0]}"
-    response = requests.get(url)
-    return response.text if response.status_code == 200 else None
-def fetch_arxiv_htmls(arxiv_items):
-    for item in tqdm(arxiv_items):
-        html = fetch_arxiv_html(item["arxiv_id"])
-        if html:
-            item["raw_content"] = html
-        else:
-            print(f"failed to fetch html for {item['arxiv_id']}")
-            item["raw_content"] = "Error"
-    return arxiv_items
-########################################################
-### PARSE CONTENT FROM ARXIV HTML #
-########################################################
-def parse_html_content(html):
-    """
-    Parse content from arxiv html
-    """
-    arxiv_id_match = re.search(r"\[(\d+\.\d+(v\d+)?)\]", html)
-    arxiv_id = arxiv_id_match.group(1) if arxiv_id_match else None
-    soup = BeautifulSoup(html, "html.parser")
-    result = []
-    # Extract paper title
-    try:
-        paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(strip=True)
-    except Exception:
-        paper_title = soup.find("title").get_text(strip=True)
-        paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)
-    for math in soup.find_all("math"):
-        math.decompose()
-    for cite in soup.find_all("cite"):
-        cite.decompose()
-    # Extract abstract
-    abstract = soup.find("div", class_="ltx_abstract")
-    if abstract:
-        result.append(
-            {
-                "content": " ".join(p.get_text(strip=True) for p in abstract.find_all("p")).replace(")", ") "),
-                "title": "Abstract",
-                "paper_title": paper_title,
-                "content_type": "abstract",
-            }
-        )
-    # Extract sections
-    sections = soup.find_all("section", class_="ltx_section")
-    for index, section in enumerate(sections):
-        section_title = section.find("h2", class_="ltx_title ltx_title_section")
-        section_title = section_title.get_text(strip=True) if section_title else f"Section {index + 1}"
-        section_content = section.get_text(strip=True).replace(")", ") ")
-        content_type = "body"
-        if index == 0:
-            content_type = "introduction"
-        elif index == len(sections) - 1:
-            content_type = "conclusion"
-        result.append(
-            {
-                "content": section_content,
-                "title": section_title,
-                "paper_title": paper_title,
-                "content_type": content_type,
-            }
-        )
-    for c in result:
-        c["arxiv_id"] = arxiv_id
-    return result
-########################################################
-### GET TEXTS FROM PDF & PARSE
-########################################################
-def get_pdf_text(arxiv_id):
-    url = "http://147.189.194.113:80/extract"  # fix: currently down
-    try:
-        response = requests.get(url, params={"arxiv_id": arxiv_id})
-        response = response.json()
-        if "text" in response:
-            return response["text"]
-        return None
-    except Exception as e:
-        print(e)
-        return None
-def get_content_type(section_type, section_count):
-    """Determine the content type based on the section type and count"""
-    if section_type == "abstract":
-        return "abstract"
-    elif section_type == "introduction" or section_count == 1:
-        return "introduction"
-    elif section_type == "conclusion" or section_type == "references":
-        return section_type
-    else:
-        return "body"
-def get_section_type(title):
-    """Determine the section type based on the title"""
-    title_lower = title.lower()
-    if "abstract" in title_lower:
-        return "abstract"
-    elif "introduction" in title_lower:
-        return "introduction"
-    elif "conclusion" in title_lower:
-        return "conclusion"
-    elif "reference" in title_lower:
-        return "references"
-    else:
-        return "body"
-def parse_markdown_content(md_content, arxiv_id):
-    """
-    Parses markdown content to identify and extract sections based on headers.
-    """
-    lines = md_content.split("\n")
-    parsed = []
-    current_section = None
-    content = []
-    paper_title = None
-    current_title = None
-    # identify sections based on headers
-    for line in lines:
-        if line.startswith("#"):
-            if paper_title is None:
-                paper_title = line.lstrip("#").strip()
-                continue
-            if content:
-                if current_title:
-                    parsed.append(
-                        {
-                            "content": " ".join(content),
-                            "title": current_title,
-                            "paper_title": paper_title,
-                            "content_type": get_content_type(current_section, len(parsed)),
-                            "arxiv_id": arxiv_id,
-                        }
-                    )
-                content = []
-            current_title = line.lstrip("#").lstrip("#").lstrip()
-            if "bit" not in current_title:
-                current_title = (
-                    current_title.lstrip("123456789")
-                    .lstrip()
-                    .lstrip(".")
-                    .lstrip()
-                    .lstrip("123456789")
-                    .lstrip()
-                    .lstrip(".")
-                    .lstrip()
-                )
-            current_section = get_section_type(current_title)
-        else:
-            content.append(line)
-    # Add the last section
-    if content and current_title:
-        parsed.append(
-            {
-                "content": " ".join(content).replace(")", ") "),
-                "title": current_title,
-                "paper_title": paper_title,
-                "content_type": get_content_type(current_section, len(parsed)),
-                "arxiv_id": arxiv_id,
-            }
-        )
-    return parsed
-########################################################
-### Image Dataset
-########################################################
-def download_arxiv_pdf(arxiv_id):
-    arxiv_id = arxiv_id.split("v")[0]
-    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
-    response = requests.get(url)
-    if response.status_code == 200:
-        return response.content
-    else:
-        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
-def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
-    # Create output folder if it doesn't exist
-    os.makedirs(output_folder, exist_ok=True)
-    # Open the PDF
-    doc = fitz.open(stream=pdf_content, filetype="pdf")
-    # Iterate through pages
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)
-        # Convert page to image
-        pix = page.get_pixmap()
-        # Save image as JPEG
-        image_path = os.path.join(output_folder, f"page_{page_num + 1}.jpg")
-        pix.save(image_path)
-        # print(f"Saved {image_path}")
-        if page_num >= max_pages:
-            break
-    doc.close()
-def save_arxiv_article_images(arxiv_id):
-    output_folder = os.path.join("data", "arxiv_images", arxiv_id)
-    try:
-        pdf_content = download_arxiv_pdf(arxiv_id)
-        pdf_to_jpegs(pdf_content, output_folder)
-    except Exception as e:
-        print(f"An error occurred: {str(e)}")
-def create_hf_image_dataset(base_dir):
-    data = []
-    # Walk through the directory
-    for root, dirs, files in os.walk(base_dir):
-        for file in files:
-            if file.endswith(".jpg"):
-                # Extract arxiv_id from the path
-                arxiv_id = os.path.basename(root)
-                # Extract page number from the filename
-                match = re.search(r"page_(\d+)", file)
-                if match:
-                    page_number = int(match.group(1))
-                else:
-                    continue  # Skip if page number can't be extracted
-                # Full path to the image
-                image_path = os.path.join(root, file)
-                # Open the image to get its size
-                with PILImage.open(image_path) as img:
-                    width, height = img.size
-                # Add the data
-                data.append(
-                    {"image": image_path, "arxiv_id": arxiv_id, "page_number": page_number, "width": width, "height": height}
-                )
-    # Create the dataset
-    dataset = Dataset.from_dict(
-        {
-            "image": [d["image"] for d in data],
-            "arxiv_id": [d["arxiv_id"] for d in data],
-            "page_number": [d["page_number"] for d in data],
-        }
-    )
-    # Cast the image column to Image
-    dataset = dataset.cast_column("image", Image())
-    return dataset
-########################################################
-### HF UPLOAD
-########################################################
-def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
-    # repo_id = HF_REPO_ID
-    create_repo(
-        repo_id=HF_REPO_ID_TXT,
-        token=os.environ.get("HF_TOKEN"),
-        private=True,
-        repo_type="dataset",
-        exist_ok=True,
-    )
-    create_repo(
-        repo_id=HF_REPO_ID_IMG,
-        token=os.environ.get("HF_TOKEN"),
-        private=True,
-        repo_type="dataset",
-        exist_ok=True,
-    )
-    # upload image dataset
-    try:
-        img_ds = create_hf_image_dataset("data/arxiv_images")
-        try:
-            old_img_ds = load_dataset(HF_REPO_ID_IMG, "images")["train"]
-            img_ds = concatenate_datasets([old_img_ds, img_ds])
-        except Exception as e:
-            print(e)
-        img_ds.push_to_hub(HF_REPO_ID_IMG, "images", token=os.environ.get("HF_TOKEN"))
-    except Exception as e:
-        print(e)
-    # upload first pages only
-    try:
-        img_ds = img_ds.filter(lambda x: x["page_number"] == 1)
-        img_ds.push_to_hub(HF_REPO_ID_IMG, "images_first_page", token=os.environ.get("HF_TOKEN"))
-    except Exception as e:
-        print(e)
-    try:
-        # push id_to_abstract
-        abstract_ds = Dataset.from_pandas(abstract_df)
-        abstract_ds.push_to_hub(HF_REPO_ID_TXT, "abstracts", token=os.environ.get("HF_TOKEN"))
-        # push arxiv_items
-        arxiv_ds = Dataset.from_pandas(contents_df)
-        arxiv_ds.push_to_hub(HF_REPO_ID_TXT, "articles", token=os.environ.get("HF_TOKEN"))
-        # push processed_arxiv_ids
-        processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
-        processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
-        processed_arxiv_ids_ds.push_to_hub(HF_REPO_ID_TXT, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
-    except Exception as e:
-        print(e)
-########################################################
-### MAIN
-########################################################
-def main():
-    # items = get_zotero_items(debug=True)
-    items = get_zotero_items(debug=False)
-    print(f"# of items fetched from zotero: {len(items)}")
-    arxiv_items = get_arxiv_items(items)
-    print(f"# of arxiv papers: {len(arxiv_items)}")
-    # get already processed arxiv ids from HF
-    try:
-        existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
-    except Exception as e:
-        print(e)
-        existing_arxiv_ids = []
-    existing_arxiv_ids = set(existing_arxiv_ids)
-    print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
-    # new arxiv items
-    arxiv_items = [item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids]
-    arxiv_items = fetch_arxiv_htmls(arxiv_items)
-    print(f"# of new arxiv items: {len(arxiv_items)}")
-    if len(arxiv_items) == 0:
-        print("No new arxiv items to process")
-        return
-    processed_arxiv_ids = set()
-    pbar = tqdm(range(len(arxiv_items)))
-    # remove "data" directory if it exists
-    if os.path.exists("data"):
-        try:
-            shutil.rmtree("data")
-        except Exception as e:
-            print(e)
-    for item in arxiv_items:
-        # download images --
-        save_arxiv_article_images(item["arxiv_id"])
-        # parse html
-        try:
-            item["contents"] = parse_html_content(item["raw_content"])
-        except Exception as e:
-            print(f"Failed to parse html for {item['arxiv_id']}: {e}")
-            item["contents"] = []
-        if len(item["contents"]) == 0:
-            print("Extracting from pdf...")
-            md_content = get_pdf_text(item["arxiv_id"])  # fix this
-            item["raw_content"] = md_content
-            if md_content:
-                item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
-            else:
-                item["contents"] = []
-        if len(item["contents"]) > 0:
-            processed_arxiv_ids.add(item["arxiv_id"])
-            if len(item["authors"]) == 0:
-                item["authors"] = []  # ["unknown"]
-                item["title"] = item["contents"][0]["paper_title"]
-        pbar.update(1)
-    pbar.close()
-    # save contents ---
-    processed_arxiv_ids = list(processed_arxiv_ids)
-    print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
-    # save abstracts ---
-    id_to_abstract = {}
-    for item in arxiv_items:
-        for entry in item["contents"]:
-            if entry["content_type"] == "abstract":
-                id_to_abstract[item["arxiv_id"]] = entry["content"]
-                break
-    print(f"# of abstracts: {len(id_to_abstract)}")
-    abstract_df = pd.Series(id_to_abstract).reset_index().rename(columns={"index": "arxiv_id", 0: "abstract"})
-    print(abstract_df.head())
-    # add to existing dataset
-    try:
-        old_abstract_df = load_dataset(HF_REPO_ID_TXT, "abstracts")["train"].to_pandas()
-    except Exception as e:
-        print(e)
-        old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
-    print(old_abstract_df.head())
-    abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
-    abstract_df = abstract_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
-    # contents
-    contents_df = pd.DataFrame(arxiv_items)
-    print(contents_df.head())
-    try:
-        old_contents_df = load_dataset(HF_REPO_ID_TXT, "articles")["train"].to_pandas()
-    except Exception as e:
-        print(e)
-        old_contents_df = pd.DataFrame(columns=contents_df.columns)
-    if len(old_contents_df) > 0:
-        print(old_contents_df.sample().T)
-    contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
-    contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
-    # upload to hf
-    processed_arxiv_ids = list(set(processed_arxiv_ids + list(existing_arxiv_ids)))
-    upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)
-    # save as local copy
-    os.makedirs("data", exist_ok=True)
-    abstract_df.to_parquet("data/abstracts.parquet")
-    contents_df.to_parquet("data/contents.parquet")
-    srsly.write_json("data/processed_arxiv_ids.json", processed_arxiv_ids)
-def schedule_periodic_task():
-    """
-    Schedule the main task to run at the user-defined frequency
-    """
-    # main()  # run once initially
-    frequency = "daily"  # TODO: env
-    if frequency == "hourly":
-        print("Scheduling tasks to run every hour at the top of the hour")
-        schedule.every().hour.at(":00").do(main)
-    elif frequency == "daily":
-        start_time = "10:00"
-        print("Scheduling tasks to run every day at: {start_time} UTC+00")
-        schedule.every().day.at(start_time).do(main)
-    while True:
-        schedule.run_pending()
-        time.sleep(1)
-if __name__ == "__main__":
-    schedule_periodic_task()

profile_app.py DELETED Viewed

@@ -1,52 +0,0 @@
-# import cProfile
-# import multiprocessing
-# import time
-# import requests
-# from fasthtml.common import *
-# from app import serve, weeks
-# PORT = 7860  # Update this to match the port your app is using
-# def run_server():
-#     serve()  # This should start your FastHTML app
-# def make_requests():
-#     base_url = f"http://127.0.0.1:{PORT}"
-#     # Test home page
-#     try:
-#         requests.get(f"{base_url}/")
-#     except requests.exceptions.RequestException as e:
-#         print(f"Error accessing home page: {e}")
-#     n_weeks = 10
-#     for week in weeks[: min(n_weeks, len(weeks))]:
-#         try:
-#             requests.get(f"{base_url}/week/{week}")
-#         except requests.exceptions.RequestException as e:
-#             print(f"Error accessing week {week}: {e}")
-#     # Add more requests here to cover other parts of your application
-# def profile_app():
-#     server_process = multiprocessing.Process(target=run_server)
-#     server_process.start()
-#     # Wait for the server to start
-#     time.sleep(60)
-#     try:
-#         make_requests()
-#     finally:
-#         server_process.terminate()
-#         server_process.join()
-# if __name__ == "__main__":
-#     cProfile.run("profile_app()", "profile_output.prof")
-#     print("Profiling complete. Run 'snakeviz profile_output.prof' to view results.")

requirements.txt CHANGED Viewed

@@ -1,9 +1,7 @@
 fasthtml-hf>=0.1.1
-python-fasthtml>=0.0.8
 huggingface-hub>=0.20.0
 uvicorn>=0.29
-schedule==1.2.0
-supervisor==4.2.5
 requests
 srsly
 python-dotenv

 fasthtml-hf>=0.1.1
+python-fasthtml>=0.5.2
 huggingface-hub>=0.20.0
 uvicorn>=0.29
 requests
 srsly
 python-dotenv

supervisord.conf DELETED Viewed

@@ -1,20 +0,0 @@
-[supervisord]
-nodaemon=true
-# [program:main]
-# command=python main.py
-# stdout_logfile=/dev/stdout
-# stdout_logfile_maxbytes=0
-# stderr_logfile=/dev/stderr
-# stderr_logfile_maxbytes=0
-# autostart=true
-# autorestart=true
-[program:app]
-command=python app.py
-stdout_logfile=/dev/null
-stdout_logfile_maxbytes=0
-stderr_logfile=/dev/stderr
-stderr_logfile_maxbytes=0
-autostart=true
-autorestart=true