"""bioRxiv/medRxiv preprint search tool.""" import re from datetime import datetime, timedelta from typing import Any import httpx from tenacity import retry, stop_after_attempt, wait_exponential from src.utils.exceptions import SearchError from src.utils.models import Citation, Evidence class BioRxivTool: """Search tool for bioRxiv and medRxiv preprints.""" BASE_URL = "https://api.biorxiv.org/details" # Use medRxiv for medical/clinical content (more relevant for drug repurposing) DEFAULT_SERVER = "medrxiv" # Fetch papers from last N days DEFAULT_DAYS = 90 def __init__(self, server: str = DEFAULT_SERVER, days: int = DEFAULT_DAYS) -> None: """ Initialize bioRxiv tool. Args: server: "biorxiv" or "medrxiv" days: How many days back to search """ self.server = server self.days = days @property def name(self) -> str: return "biorxiv" @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10), reraise=True, ) async def search(self, query: str, max_results: int = 10) -> list[Evidence]: """ Search bioRxiv/medRxiv for preprints matching query. Note: bioRxiv API doesn't support keyword search directly. We fetch recent papers and filter client-side. Args: query: Search query (keywords) max_results: Maximum results to return Returns: List of Evidence objects from preprints """ # Build date range for last N days end_date = datetime.now().strftime("%Y-%m-%d") start_date = (datetime.now() - timedelta(days=self.days)).strftime("%Y-%m-%d") interval = f"{start_date}/{end_date}" # Fetch recent papers url = f"{self.BASE_URL}/{self.server}/{interval}/0/json" async with httpx.AsyncClient(timeout=30.0) as client: try: response = await client.get(url) response.raise_for_status() except httpx.HTTPStatusError as e: raise SearchError(f"bioRxiv search failed: {e}") from e except httpx.RequestError as e: raise SearchError(f"bioRxiv connection failed: {e}") from e data = response.json() papers = data.get("collection", []) # Filter papers by query keywords query_terms = self._extract_terms(query) matching = self._filter_by_keywords(papers, query_terms, max_results) return [self._paper_to_evidence(paper) for paper in matching] def _extract_terms(self, query: str) -> list[str]: """Extract search terms from query.""" # Simple tokenization, lowercase terms = re.findall(r"\b\w+\b", query.lower()) # Filter out common stop words stop_words = {"the", "a", "an", "in", "on", "for", "and", "or", "of", "to"} return [t for t in terms if t not in stop_words and len(t) > 2] def _filter_by_keywords( self, papers: list[dict[str, Any]], terms: list[str], max_results: int ) -> list[dict[str, Any]]: """Filter papers that contain query terms in title or abstract.""" scored_papers = [] for paper in papers: title = paper.get("title", "").lower() abstract = paper.get("abstract", "").lower() text = f"{title} {abstract}" # Count matching terms matches = sum(1 for term in terms if term in text) if matches > 0: scored_papers.append((matches, paper)) # Sort by match count (descending) scored_papers.sort(key=lambda x: x[0], reverse=True) return [paper for _, paper in scored_papers[:max_results]] def _paper_to_evidence(self, paper: dict[str, Any]) -> Evidence: """Convert a preprint paper to Evidence.""" doi = paper.get("doi", "") title = paper.get("title", "Untitled") authors_str = paper.get("authors", "Unknown") date = paper.get("date", "Unknown") abstract = paper.get("abstract", "No abstract available.") category = paper.get("category", "") # Parse authors (format: "Smith, J; Jones, A") authors = [a.strip() for a in authors_str.split(";")][:5] # Truncate abstract if needed truncated_abstract = abstract[:1800] suffix = "..." if len(abstract) > 1800 else "" # Note this is a preprint in the content content = ( f"[PREPRINT - Not peer-reviewed] {truncated_abstract}{suffix} Category: {category}." ) return Evidence( content=content[:2000], citation=Citation( source="biorxiv", title=title[:500], url=f"https://doi.org/{doi}" if doi else "https://www.medrxiv.org/", date=date, authors=authors, ), relevance=0.75, # Slightly lower than peer-reviewed )