Spaces:

DataQuests
/

DeepCritical

Running

App Files Files Community

DeepCritical / src /tools /biorxiv.py

VibecoderMcSwaggins

feat: Phase 11 bioRxiv Preprint Integration (#19)

e67c99f unverified 13 days ago

raw

history blame

5.07 kB

	"""bioRxiv/medRxiv preprint search tool."""

	import re
	from datetime import datetime, timedelta
	from typing import Any

	import httpx
	from tenacity import retry, stop_after_attempt, wait_exponential

	from src.utils.exceptions import SearchError
	from src.utils.models import Citation, Evidence


	class BioRxivTool:
	    """Search tool for bioRxiv and medRxiv preprints.

	    The tool requests a date-bounded listing from the "details" endpoint and
	    performs keyword matching client-side on each paper's title and abstract.
	    """

	    BASE_URL = "https://api.biorxiv.org/details"
	    # Use medRxiv for medical/clinical content (more relevant for drug repurposing)
	    DEFAULT_SERVER = "medrxiv"
	    # Fetch papers from last N days
	    DEFAULT_DAYS = 90
	    # Words ignored when turning a query into search terms (built once, not per call)
	    _STOP_WORDS = frozenset({"the", "a", "an", "in", "on", "for", "and", "or", "of", "to"})

	    def __init__(self, server: str = DEFAULT_SERVER, days: int = DEFAULT_DAYS) -> None:
	        """
	        Initialize bioRxiv tool.

	        Args:
	            server: "biorxiv" or "medrxiv"
	            days: How many days back to search
	        """
	        self.server = server
	        self.days = days

	    @property
	    def name(self) -> str:
	        """Identifier of this tool; always "biorxiv" regardless of server."""
	        return "biorxiv"

	    @retry(
	        stop=stop_after_attempt(3),
	        wait=wait_exponential(multiplier=1, min=1, max=10),
	        reraise=True,
	    )
	    async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
	        """
	        Search bioRxiv/medRxiv for preprints matching query.

	        Note: bioRxiv API doesn't support keyword search directly.
	        We fetch recent papers and filter client-side.

	        The call is attempted up to 3 times with exponential backoff; the
	        last error is re-raised unchanged.

	        Args:
	            query: Search query (keywords)
	            max_results: Maximum results to return

	        Returns:
	            List of Evidence objects from preprints, best keyword match first

	        Raises:
	            SearchError: On an HTTP error status, a connection failure, or a
	                response body that is not valid JSON.
	        """
	        # Read the clock once so both ends of the interval come from the same
	        # instant (two separate reads could straddle midnight).
	        now = datetime.now()
	        end_date = now.strftime("%Y-%m-%d")
	        start_date = (now - timedelta(days=self.days)).strftime("%Y-%m-%d")
	        interval = f"{start_date}/{end_date}"

	        # Only the listing at cursor 0 is requested.
	        # NOTE(review): if the API paginates, later pages are never searched --
	        # confirm against the bioRxiv API documentation.
	        url = f"{self.BASE_URL}/{self.server}/{interval}/0/json"

	        async with httpx.AsyncClient(timeout=30.0) as client:
	            try:
	                response = await client.get(url)
	                response.raise_for_status()
	            except httpx.HTTPStatusError as e:
	                raise SearchError(f"bioRxiv search failed: {e}") from e
	            except httpx.RequestError as e:
	                raise SearchError(f"bioRxiv connection failed: {e}") from e

	            # A non-JSON body (e.g. an HTML error page) must surface as
	            # SearchError too, not as a bare ValueError.
	            try:
	                data = response.json()
	            except ValueError as e:
	                raise SearchError(f"bioRxiv returned invalid JSON: {e}") from e

	        # Tolerate an unexpected payload shape or a null collection.
	        papers = (data.get("collection") or []) if isinstance(data, dict) else []

	        # Filter papers by query keywords
	        query_terms = self._extract_terms(query)
	        matching = self._filter_by_keywords(papers, query_terms, max_results)

	        return [self._paper_to_evidence(paper) for paper in matching]

	    def _extract_terms(self, query: str) -> list[str]:
	        """Extract search terms from query.

	        Lowercases the query, splits it into word tokens, and drops stop words
	        and tokens of two characters or fewer. Repeated words are returned
	        once (first occurrence order) so they are not double-counted when
	        papers are scored.
	        """
	        # Simple tokenization, lowercase
	        tokens = re.findall(r"\b\w+\b", query.lower())
	        kept = (t for t in tokens if t not in self._STOP_WORDS and len(t) > 2)
	        # dict.fromkeys de-duplicates while preserving order
	        return list(dict.fromkeys(kept))

	    def _filter_by_keywords(
	        self, papers: list[dict[str, Any]], terms: list[str], max_results: int
	    ) -> list[dict[str, Any]]:
	        """Filter papers that contain query terms in title or abstract.

	        Each paper is scored by how many of ``terms`` occur as substrings of
	        its lowercased title + abstract. Papers with no match are dropped; the
	        rest are returned by descending score (ties keep their input order),
	        capped at ``max_results``.
	        """
	        # A non-positive limit means "nothing"; a negative slice bound would
	        # otherwise silently drop items from the tail instead.
	        if max_results <= 0 or not terms:
	            return []

	        scored_papers = []

	        for paper in papers:
	            # `or ""` guards against explicit JSON nulls, which .get()'s
	            # default does not cover.
	            title = (paper.get("title") or "").lower()
	            abstract = (paper.get("abstract") or "").lower()
	            text = f"{title} {abstract}"

	            # Count matching terms
	            matches = sum(1 for term in terms if term in text)

	            if matches > 0:
	                scored_papers.append((matches, paper))

	        # Sort by match count (descending); the sort is stable
	        scored_papers.sort(key=lambda x: x[0], reverse=True)

	        return [paper for _, paper in scored_papers[:max_results]]

	    def _paper_to_evidence(self, paper: dict[str, Any]) -> Evidence:
	        """Convert a preprint paper to Evidence.

	        Missing, null, or empty fields fall back to placeholder values. The
	        abstract is cut to 1800 characters (with "..." appended when cut), at
	        most 5 authors are kept, and the content is prefixed with a preprint
	        warning.
	        """
	        doi = paper.get("doi") or ""
	        title = paper.get("title") or "Untitled"
	        authors_str = paper.get("authors") or "Unknown"
	        date = paper.get("date") or "Unknown"
	        abstract = paper.get("abstract") or "No abstract available."
	        category = paper.get("category") or ""

	        # Parse authors (format: "Smith, J; Jones, A"), skipping blank
	        # fragments such as those left by a trailing ";"
	        authors = [a.strip() for a in authors_str.split(";") if a.strip()][:5]

	        # Truncate abstract if needed
	        truncated_abstract = abstract[:1800]
	        suffix = "..." if len(abstract) > 1800 else ""

	        # Note this is a preprint in the content
	        content = (
	            f"[PREPRINT - Not peer-reviewed] {truncated_abstract}{suffix} Category: {category}."
	        )

	        # Without a DOI, link to the home page of the configured server
	        # rather than always to medRxiv.
	        url = f"https://doi.org/{doi}" if doi else f"https://www.{self.server}.org/"

	        return Evidence(
	            content=content[:2000],
	            citation=Citation(
	                source="biorxiv",
	                title=title[:500],
	                url=url,
	                date=date,
	                authors=authors,
	            ),
	            relevance=0.75,  # Slightly lower than peer-reviewed
	        )