Merge pull request #21 from The-Obstacle-Is-The-Way/feat/phase13-modal-integration

Files changed:

- AGENTS.md +27 -11
- CLAUDE.md +27 -11
- GEMINI.md +57 -13
- examples/modal_demo/run_analysis.py +64 -0
- examples/modal_demo/verify_sandbox.py +74 -271
- src/agents/analysis_agent.py +43 -277
- src/app.py +17 -0
- src/mcp_tools.py +69 -0
- src/orchestrator.py +63 -0
- src/services/statistical_analyzer.py +255 -0
- src/utils/config.py +14 -0
- src/utils/models.py +5 -1
- tests/integration/test_modal.py +58 -0
- tests/unit/services/test_statistical_analyzer.py +104 -0
AGENTS.md
CHANGED

````diff
@@ -4,7 +4,9 @@ This file provides guidance to AI agents when working with code in this repository.
 
 ## Project Overview
 
-DeepCritical is an AI-native drug repurposing research agent for a HuggingFace hackathon. It uses a search-and-judge loop to autonomously search biomedical databases (PubMed) and synthesize evidence for queries like "What existing drugs might help treat long COVID fatigue?".
+DeepCritical is an AI-native drug repurposing research agent for a HuggingFace hackathon. It uses a search-and-judge loop to autonomously search biomedical databases (PubMed, ClinicalTrials.gov, bioRxiv) and synthesize evidence for queries like "What existing drugs might help treat long COVID fatigue?".
+
+**Current Status:** Phases 1-13 COMPLETE (Foundation through Modal sandbox integration).
 
 ## Development Commands
 
@@ -33,45 +35,53 @@ uv run pytest -m integration
 
 **Pattern**: Search-and-judge loop with multi-tool orchestration.
 
-```
+```text
 User Question → Orchestrator
        ↓
 Search Loop:
-  1. Query PubMed
+  1. Query PubMed, ClinicalTrials.gov, bioRxiv
   2. Gather evidence
   3. Judge quality ("Do we have enough?")
   4. If NO → Refine query, search more
-  5. If YES → Synthesize findings
+  5. If YES → Synthesize findings (+ optional Modal analysis)
        ↓
 Research Report with Citations
 ```
 
 **Key Components**:
+
 - `src/orchestrator.py` - Main agent loop
 - `src/tools/pubmed.py` - PubMed E-utilities search
+- `src/tools/clinicaltrials.py` - ClinicalTrials.gov API
+- `src/tools/biorxiv.py` - bioRxiv/medRxiv preprint search
+- `src/tools/code_execution.py` - Modal sandbox execution
 - `src/tools/search_handler.py` - Scatter-gather orchestration
 - `src/services/embeddings.py` - Semantic search & deduplication (ChromaDB)
+- `src/services/statistical_analyzer.py` - Statistical analysis via Modal
 - `src/agent_factory/judges.py` - LLM-based evidence assessment
 - `src/agents/` - Magentic multi-agent mode (SearchAgent, JudgeAgent, etc.)
+- `src/mcp_tools.py` - MCP tool wrappers for Claude Desktop
 - `src/utils/config.py` - Pydantic Settings (loads from `.env`)
 - `src/utils/models.py` - Evidence, Citation, SearchResult models
 - `src/utils/exceptions.py` - Exception hierarchy
-- `src/app.py` - Gradio UI (HuggingFace Spaces)
+- `src/app.py` - Gradio UI with MCP server (HuggingFace Spaces)
 
 **Break Conditions**: Judge approval, token budget (50K max), or max iterations (default 10).
 
 ## Configuration
 
 Settings via pydantic-settings from `.env`:
+
 - `LLM_PROVIDER`: "openai" or "anthropic"
 - `OPENAI_API_KEY` / `ANTHROPIC_API_KEY`: LLM keys
 - `NCBI_API_KEY`: Optional, for higher PubMed rate limits
+- `MODAL_TOKEN_ID` / `MODAL_TOKEN_SECRET`: For Modal sandbox (optional)
 - `MAX_ITERATIONS`: 1-50, default 10
 - `LOG_LEVEL`: DEBUG, INFO, WARNING, ERROR
 
 ## Exception Hierarchy
 
-```
+```text
 DeepCriticalError (base)
 ├── SearchError
 │   └── RateLimitError
@@ -95,8 +105,14 @@ DeepCriticalError (base)
 
 ## Git Workflow
 
-- `main`: Production-ready
-- `dev`: Development
-- `
-- Remote `
-
+- `main`: Production-ready (GitHub)
+- `dev`: Development integration (GitHub)
+- Remote `origin`: GitHub (source of truth for PRs/code review)
+- Remote `huggingface-upstream`: HuggingFace Spaces (deployment target)
+
+**HuggingFace Spaces Collaboration:**
+
+- Each contributor should use their own dev branch: `yourname-dev` (e.g., `vcms-dev`, `mario-dev`)
+- **DO NOT push directly to `main` or `dev` on HuggingFace** - these can be overwritten easily
+- GitHub is the source of truth; HuggingFace is for deployment/demo
+- Consider using git hooks to prevent accidental pushes to protected branches
````
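The exception tree above maps directly onto plain subclassing. A minimal sketch for orientation (the repository's actual `src/utils/exceptions.py` is not touched by this PR and may carry extra context fields):

````python
# Minimal sketch of the hierarchy documented in AGENTS.md; not the repo's file.


class DeepCriticalError(Exception):
    """Base class for all DeepCritical errors."""


class SearchError(DeepCriticalError):
    """Raised when a search tool (PubMed, ClinicalTrials.gov, bioRxiv) fails."""


class RateLimitError(SearchError):
    """Raised when an upstream API throttles or rejects requests."""
````

Nesting `RateLimitError` under `SearchError` means a single `except SearchError` also covers throttling, which is the usual reason to structure the tree this way.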
CLAUDE.md
CHANGED

The CLAUDE.md diff is identical to the AGENTS.md diff above, applied line for line. The only differences are the first hunk's context line ("This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository") and the position of the Git Workflow hunk (`@@ -88,8 +98,14 @@ DeepCriticalError (base)`).
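The documented `.env` keys, together with the `settings.modal_available` checks used throughout this PR, suggest a pydantic-settings model along these lines. This is a sketch, not the actual `src/utils/config.py` (+14 lines, not shown); field names follow the documented keys, defaults are assumptions:

````python
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Sketch of the .env-backed settings; defaults here are hypothetical."""

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    llm_provider: str = "openai"          # "openai" or "anthropic"
    openai_api_key: str | None = None
    anthropic_api_key: str | None = None
    ncbi_api_key: str | None = None       # optional, higher PubMed rate limits
    modal_token_id: str | None = None
    modal_token_secret: str | None = None
    max_iterations: int = Field(default=10, ge=1, le=50)
    log_level: str = "INFO"

    @property
    def modal_available(self) -> bool:
        # Both Modal tokens must be present before sandbox execution is offered.
        return bool(self.modal_token_id and self.modal_token_secret)


settings = Settings()
````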
GEMINI.md
CHANGED

````diff
@@ -1,27 +1,31 @@
 # DeepCritical Context
 
 ## Project Overview
+
 **DeepCritical** is an AI-native Medical Drug Repurposing Research Agent.
-**Goal:** To accelerate the discovery of new uses for existing drugs by intelligently searching biomedical literature (PubMed), evaluating evidence, and hypothesizing potential applications.
+**Goal:** To accelerate the discovery of new uses for existing drugs by intelligently searching biomedical literature (PubMed, ClinicalTrials.gov, bioRxiv), evaluating evidence, and hypothesizing potential applications.
 
 **Architecture:**
 The project follows a **Vertical Slice Architecture** (Search -> Judge -> Orchestrator) and adheres to **Strict TDD** (Test-Driven Development).
 
 **Current Status:**
-
-- **
-- **
+
+- **Phases 1-9:** COMPLETE. Foundation, Search, Judge, UI, Orchestrator, Embeddings, Hypothesis, Report, Cleanup.
+- **Phases 10-11:** COMPLETE. ClinicalTrials.gov and bioRxiv integration.
+- **Phase 12:** COMPLETE. MCP Server integration (Gradio MCP at `/gradio_api/mcp/`).
+- **Phase 13:** COMPLETE. Modal sandbox for statistical analysis.
 
 ## Tech Stack & Tooling
+
 - **Language:** Python 3.11 (Pinned)
 - **Package Manager:** `uv` (Rust-based, extremely fast)
-- **Frameworks:** `pydantic`, `pydantic-ai`, `httpx`, `gradio`
+- **Frameworks:** `pydantic`, `pydantic-ai`, `httpx`, `gradio[mcp]`
 - **Vector DB:** `chromadb` with `sentence-transformers` for semantic search
+- **Code Execution:** `modal` for secure sandboxed Python execution
 - **Testing:** `pytest`, `pytest-asyncio`, `respx` (for mocking)
 - **Quality:** `ruff` (linting/formatting), `mypy` (strict type checking), `pre-commit`
 
 ## Building & Running
-We use a `Makefile` to standardize developer commands.
 
 | Command | Description |
 | :--- | :--- |
@@ -34,21 +38,61 @@ We use a `Makefile` to standardize developer commands.
 | `make clean` | Clean up cache and artifacts. |
 
 ## Directory Structure
+
 - `src/`: Source code
   - `utils/`: Shared utilities (`config.py`, `exceptions.py`, `models.py`)
-  - `tools/`: Search tools (`pubmed.py`, `
-  - `services/`: Services (`embeddings.py`
+  - `tools/`: Search tools (`pubmed.py`, `clinicaltrials.py`, `biorxiv.py`, `code_execution.py`)
+  - `services/`: Services (`embeddings.py`, `statistical_analyzer.py`)
   - `agents/`: Magentic multi-agent mode agents
   - `agent_factory/`: Agent definitions (judges, prompts)
+  - `mcp_tools.py`: MCP tool wrappers for Claude Desktop integration
+  - `app.py`: Gradio UI with MCP server
 - `tests/`: Test suite
   - `unit/`: Isolated unit tests (Mocked)
   - `integration/`: Real API tests (Marked as slow/integration)
 - `docs/`: Documentation and Implementation Specs
 - `examples/`: Working demos for each phase
 
+## Key Components
+
+- `src/orchestrator.py` - Main agent loop
+- `src/tools/pubmed.py` - PubMed E-utilities search
+- `src/tools/clinicaltrials.py` - ClinicalTrials.gov API
+- `src/tools/biorxiv.py` - bioRxiv/medRxiv preprint search
+- `src/tools/code_execution.py` - Modal sandbox execution
+- `src/services/statistical_analyzer.py` - Statistical analysis via Modal
+- `src/mcp_tools.py` - MCP tool wrappers
+- `src/app.py` - Gradio UI (HuggingFace Spaces) with MCP server
+
+## Configuration
+
+Settings via pydantic-settings from `.env`:
+
+- `LLM_PROVIDER`: "openai" or "anthropic"
+- `OPENAI_API_KEY` / `ANTHROPIC_API_KEY`: LLM keys
+- `NCBI_API_KEY`: Optional, for higher PubMed rate limits
+- `MODAL_TOKEN_ID` / `MODAL_TOKEN_SECRET`: For Modal sandbox (optional)
+- `MAX_ITERATIONS`: 1-50, default 10
+- `LOG_LEVEL`: DEBUG, INFO, WARNING, ERROR
+
 ## Development Conventions
-
-
-
-
-
+
+1. **Strict TDD:** Write failing tests in `tests/unit/` *before* implementing logic in `src/`.
+2. **Type Safety:** All code must pass `mypy --strict`. Use Pydantic models for data exchange.
+3. **Linting:** Zero tolerance for Ruff errors.
+4. **Mocking:** Use `respx` or `unittest.mock` for all external API calls in unit tests.
+5. **Vertical Slices:** Implement features end-to-end rather than layer-by-layer.
+
+## Git Workflow
+
+- `main`: Production-ready (GitHub)
+- `dev`: Development integration (GitHub)
+- Remote `origin`: GitHub (source of truth for PRs/code review)
+- Remote `huggingface-upstream`: HuggingFace Spaces (deployment target)
+
+**HuggingFace Spaces Collaboration:**
+
+- Each contributor should use their own dev branch: `yourname-dev` (e.g., `vcms-dev`, `mario-dev`)
+- **DO NOT push directly to `main` or `dev` on HuggingFace** - these can be overwritten easily
+- GitHub is the source of truth; HuggingFace is for deployment/demo
+- Consider using git hooks to prevent accidental pushes to protected branches
````
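Convention 4 in the diff above (mock all external calls with `respx`) is easy to picture with a small example. The endpoint, payload shape, and test name below are invented for illustration; the real tests in `tests/unit/` mock the actual PubMed, ClinicalTrials.gov, and bioRxiv URLs:

````python
import httpx
import pytest
import respx


@pytest.mark.asyncio
@respx.mock
async def test_search_returns_parsed_results() -> None:
    # Hypothetical endpoint and response shape, purely for illustration.
    route = respx.get("https://api.example.org/search").mock(
        return_value=httpx.Response(200, json={"results": [{"title": "Paper A"}]})
    )

    async with httpx.AsyncClient() as client:
        resp = await client.get("https://api.example.org/search")

    # respx intercepts the request, so no network traffic ever leaves the test.
    assert route.called
    assert resp.json()["results"][0]["title"] == "Paper A"
````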
examples/modal_demo/run_analysis.py
ADDED

@@ -0,0 +1,64 @@

````python
#!/usr/bin/env python3
"""Demo: Modal-powered statistical analysis.

This script uses StatisticalAnalyzer directly (NO agent_framework dependency).

Usage:
    uv run python examples/modal_demo/run_analysis.py "metformin alzheimer"
"""

import argparse
import asyncio
import os
import sys

from src.services.statistical_analyzer import get_statistical_analyzer
from src.tools.pubmed import PubMedTool
from src.utils.config import settings


async def main() -> None:
    """Run the Modal analysis demo."""
    parser = argparse.ArgumentParser(description="Modal Analysis Demo")
    parser.add_argument("query", help="Research query")
    args = parser.parse_args()

    if not settings.modal_available:
        print("Error: Modal credentials not configured.")
        sys.exit(1)

    if not (os.getenv("OPENAI_API_KEY") or os.getenv("ANTHROPIC_API_KEY")):
        print("Error: No LLM API key found.")
        sys.exit(1)

    print(f"\n{'=' * 60}")
    print("DeepCritical Modal Analysis Demo")
    print(f"Query: {args.query}")
    print(f"{'=' * 60}\n")

    # Step 1: Gather Evidence
    print("Step 1: Gathering evidence from PubMed...")
    pubmed = PubMedTool()
    evidence = await pubmed.search(args.query, max_results=5)
    print(f"  Found {len(evidence)} papers\n")

    # Step 2: Run Modal Analysis
    print("Step 2: Running statistical analysis in Modal sandbox...")
    analyzer = get_statistical_analyzer()
    result = await analyzer.analyze(query=args.query, evidence=evidence)

    # Step 3: Display Results
    print("\n" + "=" * 60)
    print("ANALYSIS RESULTS")
    print("=" * 60)
    print(f"\nVerdict: {result.verdict}")
    print(f"Confidence: {result.confidence:.0%}")
    print("\nKey Findings:")
    for finding in result.key_findings:
        print(f"  - {finding}")

    print("\n[Demo Complete - Code executed in Modal, not locally]")


if __name__ == "__main__":
    asyncio.run(main())
````
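Neither `src/services/statistical_analyzer.py` (+255 lines) nor the `analyze` signature appears in this diff view, but the demo above and the `AnalysisAgent` rewrite below pin down its surface. A rough sketch under those constraints; everything beyond the field names and the `analyze` call shape is an assumption:

````python
# Sketch only: the real StatisticalAnalyzer is not shown in this PR's diff.
# The field names mirror the AnalysisResult model this PR removes from
# src/agents/analysis_agent.py; the analyze() flow follows the removed code.
from typing import Any

from pydantic import BaseModel, Field


class AnalysisResult(BaseModel):
    verdict: str = Field(description="SUPPORTED, REFUTED, or INCONCLUSIVE")
    confidence: float = Field(ge=0.0, le=1.0)
    statistical_evidence: str
    code_generated: str
    execution_output: str
    key_findings: list[str] = Field(default_factory=list)
    limitations: list[str] = Field(default_factory=list)


class StatisticalAnalyzer:
    async def analyze(
        self,
        query: str,
        evidence: list[Any],
        hypothesis: dict[str, Any] | None = None,
    ) -> AnalysisResult:
        # Presumed flow, per the logic removed from AnalysisAgent:
        # 1. prompt an LLM to generate pandas/scipy analysis code,
        # 2. execute it in the Modal sandbox (get_code_executor().execute),
        # 3. extract verdict/confidence from the captured stdout.
        raise NotImplementedError("sketch only")


_analyzer: StatisticalAnalyzer | None = None


def get_statistical_analyzer() -> StatisticalAnalyzer:
    """Module-level singleton accessor, as the call sites suggest."""
    global _analyzer
    if _analyzer is None:
        _analyzer = StatisticalAnalyzer()
    return _analyzer
````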
examples/modal_demo/verify_sandbox.py
CHANGED

@@ -1,298 +1,101 @@

The previous 298-line synchronous script ran six ad-hoc checks (hostname isolation, filesystem isolation, container detection, library versions, destructive operations, network access), each printing pass/warn verdicts into a summary table. It is replaced wholesale with a leaner async version:

````python
#!/usr/bin/env python3
"""Verify that Modal sandbox is properly isolated.

This script proves to judges that code runs in Modal, not locally.
NO agent_framework dependency - uses only src.tools.code_execution.

Usage:
    uv run python examples/modal_demo/verify_sandbox.py
"""

import asyncio
from functools import partial

from src.tools.code_execution import CodeExecutionError, get_code_executor
from src.utils.config import settings


def print_result(result: dict) -> None:
    """Print execution result, surfacing errors when they occur."""
    if result.get("success"):
        print(f"  {result['stdout'].strip()}\n")
    else:
        error = result.get("error") or result.get("stderr", "").strip() or "Unknown error"
        print(f"  ERROR: {error}\n")


async def main() -> None:
    """Verify Modal sandbox isolation."""
    if not settings.modal_available:
        print("Error: Modal credentials not configured.")
        print("Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in .env")
        return

    try:
        executor = get_code_executor()
        loop = asyncio.get_running_loop()

        print("=" * 60)
        print("Modal Sandbox Isolation Verification")
        print("=" * 60 + "\n")

        # Test 1: Hostname
        print("Test 1: Check hostname (should NOT be your machine)")
        code1 = "import socket; print(f'Hostname: {socket.gethostname()}')"
        result1 = await loop.run_in_executor(None, partial(executor.execute, code1))
        print_result(result1)

        # Test 2: Scientific libraries
        print("Test 2: Verify scientific libraries")
        code2 = """
import pandas as pd
import numpy as np
import scipy
print(f"pandas: {pd.__version__}")
print(f"numpy: {np.__version__}")
print(f"scipy: {scipy.__version__}")
"""
        result2 = await loop.run_in_executor(None, partial(executor.execute, code2))
        print_result(result2)

        # Test 3: Network blocked
        print("Test 3: Verify network isolation")
        code3 = """
import urllib.request
try:
    urllib.request.urlopen("https://google.com", timeout=2)
    print("Network: ALLOWED (unexpected!)")
except Exception:
    print("Network: BLOCKED (as expected)")
"""
        result3 = await loop.run_in_executor(None, partial(executor.execute, code3))
        print_result(result3)

        # Test 4: Real statistics
        print("Test 4: Execute statistical analysis")
        code4 = """
import pandas as pd
import scipy.stats as stats

data = pd.DataFrame({'effect': [0.42, 0.38, 0.51]})
mean = data['effect'].mean()
t_stat, p_val = stats.ttest_1samp(data['effect'], 0)

print(f"Mean Effect: {mean:.3f}")
print(f"P-value: {p_val:.4f}")
print(f"Verdict: {'SUPPORTED' if p_val < 0.05 else 'INCONCLUSIVE'}")
"""
        result4 = await loop.run_in_executor(None, partial(executor.execute, code4))
        print_result(result4)

        print("=" * 60)
        print("All tests complete - Modal sandbox verified!")
        print("=" * 60)

    except CodeExecutionError as e:
        print(f"Error: Modal code execution failed: {e}")
        print("Hint: Ensure Modal SDK is installed and credentials are valid.")


if __name__ == "__main__":
    asyncio.run(main())
````
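`executor.execute(code)` is synchronous and returns a plain dict (`success`, `stdout`, plus `error`/`stderr` on failure), which is why both demo scripts push it through `run_in_executor` with `partial`. A minimal sketch of an executor honoring that contract follows; the specific `modal.Sandbox` calls and parameters are assumptions from the Modal SDK's documented surface and may not match the repository's `src/tools/code_execution.py`:

````python
# Hedged sketch of the execute() contract consumed by verify_sandbox.py and
# StatisticalAnalyzer. The Modal SDK calls below are assumptions, not the
# repo's code; depending on SDK version, Sandbox.create may also need app=.
import modal


class CodeExecutionError(Exception):
    """Raised when sandboxed execution cannot be started."""


class ModalCodeExecutor:
    def __init__(self) -> None:
        # Pin the scientific stack the generated analysis code relies on.
        self._image = modal.Image.debian_slim().pip_install("pandas", "numpy", "scipy")

    def execute(self, code: str, timeout: int = 120) -> dict:
        try:
            sb = modal.Sandbox.create(
                image=self._image,
                timeout=timeout,
                block_network=True,  # matches Test 3's "Network: BLOCKED" expectation
            )
            proc = sb.exec("python", "-c", code)
            proc.wait()
            result = {
                "success": proc.returncode == 0,
                "stdout": proc.stdout.read(),
                "stderr": proc.stderr.read(),
                "error": None if proc.returncode == 0 else "non-zero exit code",
            }
            sb.terminate()
            return result
        except Exception as e:  # startup failures, not user-code failures
            raise CodeExecutionError(str(e)) from e
````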
src/agents/analysis_agent.py
CHANGED

This file shrinks from roughly 350 lines to about 120. Everything specific to the analysis itself moves into the new `src/services/statistical_analyzer.py`: the `AnalysisResult` model, lazy construction of the Modal code executor and of a pydantic-ai code-generation `Agent`, the biomedical-data-scientist system prompt (pandas/numpy/scipy.stats, effect sizes, confidence intervals, code under 50 lines, a `result` variable carrying the verdict), the code-generation prompt builder and evidence summarizer, sandboxed execution via `run_in_executor`, word-boundary verdict extraction (guarding against "NOT SUPPORTED" and "UNSUPPORTED" false positives), and p-value-based confidence scoring. The agent also stops requiring a hypothesis up front (it now analyzes with or without one), and the markdown response drops its "Generated Code" and "Limitations" sections. What remains is a thin wrapper. In the reconstruction below, context the diff viewer elided is marked with `# ...` comments, and the `lines = [` / `lines.extend(` scaffolding is restored from the surrounding brackets:

````python
"""Analysis agent for statistical analysis using Modal code execution.

This agent wraps StatisticalAnalyzer for use in magentic multi-agent mode.
The core logic is in src/services/statistical_analyzer.py to avoid
coupling agent_framework to the simple orchestrator.
"""

from collections.abc import AsyncIterable
from typing import TYPE_CHECKING, Any

from agent_framework import (
    # ... (full import list elided in the diff view; these names are used below)
    AgentRunResponse,
    BaseAgent,
    ChatMessage,
    Role,
)

from src.services.statistical_analyzer import (
    AnalysisResult,
    get_statistical_analyzer,
)

if TYPE_CHECKING:
    from src.services.embeddings import EmbeddingService


class AnalysisAgent(BaseAgent):  # type: ignore[misc]
    """Wraps StatisticalAnalyzer for magentic multi-agent mode."""

    def __init__(
        self,
        # ... (parameters elided in the diff view: an evidence store and an
        # optional embedding service, per the assignments below)
    ) -> None:
        super().__init__(
            name="AnalysisAgent",
            description="Performs statistical analysis using Modal sandbox",
        )
        self._evidence_store = evidence_store
        self._embeddings = embedding_service
        self._analyzer = get_statistical_analyzer()

    async def run(
        self,
        # ... (message parameter elided in the diff view)
        **kwargs: Any,
    ) -> AgentRunResponse:
        """Analyze evidence and return verdict."""
        query = self._extract_query(messages)
        hypotheses = self._evidence_store.get("hypotheses", [])
        evidence = self._evidence_store.get("current", [])

        if not evidence:
            return self._error_response("No evidence available.")

        # Get primary hypothesis if available
        hypothesis_dict = None
        if hypotheses:
            h = hypotheses[0]
            hypothesis_dict = {
                "drug": getattr(h, "drug", "Unknown"),
                "target": getattr(h, "target", "?"),
                "pathway": getattr(h, "pathway", "?"),
                "effect": getattr(h, "effect", "?"),
                "confidence": getattr(h, "confidence", 0.5),
            }

        # Delegate to StatisticalAnalyzer
        result = await self._analyzer.analyze(
            query=query,
            evidence=evidence,
            hypothesis=hypothesis_dict,
        )

        # Store in shared context
        self._evidence_store["analysis"] = result.model_dump()

        # Format response
        response_text = self._format_response(result)

        return AgentRunResponse(
            messages=[ChatMessage(role=Role.ASSISTANT, text=response_text)],
            response_id=f"analysis-{result.verdict.lower()}",
            additional_properties={"analysis": result.model_dump()},
        )

    def _format_response(self, result: AnalysisResult) -> str:
        """Format analysis result as markdown."""
        lines = [
            # ... (verdict header lines elided in the diff view)
            f"**Confidence**: {result.confidence:.0%}\n",
            "### Key Findings",
        ]
        for finding in result.key_findings:
            lines.append(f"- {finding}")
        lines.extend(
            [
                # ... (section header elided in the diff view)
                "```",
                result.statistical_evidence,
                "```",
            ]
        )
        return "\n".join(lines)

    def _error_response(self, message: str) -> AgentRunResponse:
        """Create error response."""
        return AgentRunResponse(
            messages=[ChatMessage(role=Role.ASSISTANT, text=f"**Error**: {message}")],
            response_id="analysis-error",
        )

    def _extract_query(
        self,
        messages: str | ChatMessage | list[str] | list[ChatMessage] | None,
    ) -> str:
        """Extract query from messages."""
        if isinstance(messages, str):
            # ... (remainder elided in the diff view)
````
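The verdict and confidence heuristics dropped from this file are worth keeping in view, since the core logic now lives in `StatisticalAnalyzer`. This standalone reproduction follows the removed lines; the fallback return of `calculate_confidence` was truncated in the diff, so the 0.5 default is an assumption:

````python
import re


def extract_verdict(stdout: str) -> str:
    """Word-boundary matching avoids false positives like 'UNSUPPORTED'."""
    upper = stdout.upper()
    if re.search(r"\bSUPPORTED\b", upper) and not re.search(
        r"\b(?:NOT|UN)SUPPORTED\b", upper
    ):
        return "SUPPORTED"
    if re.search(r"\bREFUTED\b", upper):
        return "REFUTED"
    return "INCONCLUSIVE"  # default when nothing decisive is printed


def calculate_confidence(output: str) -> float:
    """Map the smallest reported p-value onto a coarse confidence score."""
    p_values = re.findall(r"p[-\s]?value[:\s]+(\d+\.?\d*)", output.lower())
    try:
        min_p = min(float(p) for p in p_values)  # ValueError if list is empty
    except ValueError:
        return 0.5  # assumption: the removed fallback is not shown in the diff
    if min_p < 0.001:
        return 0.95
    if min_p < 0.01:
        return 0.90
    if min_p < 0.05:
        return 0.80
    return 0.60
````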
src/app.py
CHANGED
```diff
@@ -8,6 +8,7 @@ import gradio as gr
 
 from src.agent_factory.judges import JudgeHandler, MockJudgeHandler
 from src.mcp_tools import (
+    analyze_hypothesis,
     search_all_sources,
     search_biorxiv,
     search_clinical_trials,
@@ -211,6 +212,22 @@ def create_demo() -> Any:
         api_name="search_all",
     )
 
+    with gr.Tab("Analyze Hypothesis"):
+        gr.Interface(
+            fn=analyze_hypothesis,
+            inputs=[
+                gr.Textbox(label="Drug", placeholder="metformin"),
+                gr.Textbox(label="Condition", placeholder="Alzheimer's disease"),
+                gr.Textbox(
+                    label="Evidence Summary",
+                    placeholder="Studies show metformin reduces tau phosphorylation...",
+                    lines=5,
+                ),
+            ],
+            outputs=gr.Markdown(label="Analysis Result"),
+            api_name="analyze_hypothesis",
+        )
+
     gr.Markdown("""
     ---
     **Note**: This is a research tool and should not be used for medical decisions.
```
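With the tab registered under `api_name="analyze_hypothesis"`, the endpoint is also reachable programmatically. A minimal sketch using `gradio_client`, assuming the Space is served locally on the default port:

```python
# Sketch only: assumes a local server and the api_name shown above.
from gradio_client import Client

client = Client("http://localhost:7860")
report = client.predict(
    "metformin",                # Drug
    "Alzheimer's disease",      # Condition
    "Studies show metformin reduces tau phosphorylation...",  # Evidence Summary
    api_name="/analyze_hypothesis",
)
print(report)  # markdown report with verdict, findings, and generated code
```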
src/mcp_tools.py
CHANGED
````diff
@@ -154,3 +154,72 @@ async def search_all_sources(query: str, max_per_source: int = 5) -> str:
         formatted.append(f"## Preprints\n*Error: {biorxiv_results}*\n")
 
     return "\n---\n".join(formatted)
+
+
+async def analyze_hypothesis(
+    drug: str,
+    condition: str,
+    evidence_summary: str,
+) -> str:
+    """Perform statistical analysis of a drug repurposing hypothesis using Modal.
+
+    Executes AI-generated Python code in a secure Modal sandbox to analyze
+    the statistical evidence for a drug repurposing hypothesis.
+
+    Args:
+        drug: The drug being evaluated (e.g., "metformin")
+        condition: The target condition (e.g., "Alzheimer's disease")
+        evidence_summary: Summary of evidence to analyze
+
+    Returns:
+        Analysis result with verdict (SUPPORTED/REFUTED/INCONCLUSIVE) and statistics
+    """
+    from src.services.statistical_analyzer import get_statistical_analyzer
+    from src.utils.config import settings
+    from src.utils.models import Citation, Evidence
+
+    if not settings.modal_available:
+        return "Error: Modal credentials not configured. Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET."
+
+    # Create evidence from the user-provided summary
+    evidence = [
+        Evidence(
+            content=evidence_summary,
+            citation=Citation(
+                source="pubmed",
+                title=f"Evidence for {drug} in {condition}",
+                url="https://example.com",
+                date="2024-01-01",
+                authors=["User Provided"],
+            ),
+            relevance=0.9,
+        )
+    ]
+
+    analyzer = get_statistical_analyzer()
+    result = await analyzer.analyze(
+        query=f"Can {drug} treat {condition}?",
+        evidence=evidence,
+        hypothesis={"drug": drug, "target": "unknown", "pathway": "unknown", "effect": condition},
+    )
+
+    return f"""## Statistical Analysis: {drug} for {condition}
+
+### Verdict: **{result.verdict}**
+**Confidence**: {result.confidence:.0%}
+
+### Key Findings
+{chr(10).join(f"- {f}" for f in result.key_findings) or "- No specific findings extracted"}
+
+### Execution Output
+```
+{result.execution_output}
+```
+
+### Generated Code
+```python
+{result.code_generated}
+```
+
+**Executed in Modal Sandbox** - Isolated, secure, reproducible.
+"""
````
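Since the tool is an ordinary async function, it can also be exercised without the UI. A minimal sketch, assuming Modal credentials and an LLM key are present in `.env`:

```python
# Sketch only: requires MODAL_TOKEN_ID / MODAL_TOKEN_SECRET and an LLM API key.
import asyncio

from src.mcp_tools import analyze_hypothesis


async def main() -> None:
    report = await analyze_hypothesis(
        drug="metformin",
        condition="Alzheimer's disease",
        evidence_summary="Two cohort studies report lower dementia incidence.",
    )
    print(report)


asyncio.run(main())
```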
src/orchestrator.py
CHANGED
```diff
@@ -6,6 +6,7 @@ from typing import Any, Protocol
 
 import structlog
 
+from src.utils.config import settings
 from src.utils.models import (
     AgentEvent,
     Evidence,
@@ -41,6 +42,7 @@ class Orchestrator:
         search_handler: SearchHandlerProtocol,
         judge_handler: JudgeHandlerProtocol,
         config: OrchestratorConfig | None = None,
+        enable_analysis: bool = False,
     ):
         """
         Initialize the orchestrator.
@@ -49,11 +51,68 @@
             search_handler: Handler for executing searches
             judge_handler: Handler for assessing evidence
             config: Optional configuration (uses defaults if not provided)
+            enable_analysis: Whether to perform statistical analysis (if Modal is available)
         """
         self.search = search_handler
         self.judge = judge_handler
         self.config = config or OrchestratorConfig()
         self.history: list[dict[str, Any]] = []
+        self._enable_analysis = enable_analysis and settings.modal_available
+
+        # Lazy-load analysis (NO agent_framework dependency!)
+        self._analyzer: Any = None
+
+    def _get_analyzer(self) -> Any:
+        """Lazy initialization of StatisticalAnalyzer.
+
+        Note: This imports from src.services, NOT src.agents,
+        so it works without the magentic optional dependency.
+        """
+        if self._analyzer is None:
+            from src.services.statistical_analyzer import get_statistical_analyzer
+
+            self._analyzer = get_statistical_analyzer()
+        return self._analyzer
+
+    async def _run_analysis_phase(
+        self, query: str, evidence: list[Evidence], iteration: int
+    ) -> AsyncGenerator[AgentEvent, None]:
+        """Run the optional analysis phase."""
+        if not self._enable_analysis:
+            return
+
+        yield AgentEvent(
+            type="analyzing",
+            message="Running statistical analysis in Modal sandbox...",
+            data={},
+            iteration=iteration,
+        )
+
+        try:
+            analyzer = self._get_analyzer()
+
+            # Run Modal analysis (no agent_framework needed!)
+            analysis_result = await analyzer.analyze(
+                query=query,
+                evidence=evidence,
+                hypothesis=None,  # Could add hypothesis generation later
+            )
+
+            yield AgentEvent(
+                type="analysis_complete",
+                message=f"Analysis verdict: {analysis_result.verdict}",
+                data=analysis_result.model_dump(),
+                iteration=iteration,
+            )
+
+        except Exception as e:
+            logger.error("Modal analysis failed", error=str(e))
+            yield AgentEvent(
+                type="error",
+                message=f"Modal analysis failed: {e}",
+                data={"error": str(e)},
+                iteration=iteration,
+            )
 
     async def run(self, query: str) -> AsyncGenerator[AgentEvent, None]:
         """
@@ -176,6 +235,10 @@
 
         # === DECISION PHASE ===
         if assessment.sufficient and assessment.recommendation == "synthesize":
+            # Optional Analysis Phase
+            async for event in self._run_analysis_phase(query, all_evidence, iteration):
+                yield event
+
             yield AgentEvent(
                 type="synthesizing",
                 message="Evidence sufficient! Preparing synthesis...",
```
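Callers opt in at construction time, and the flag silently downgrades to off when credentials are missing. A minimal sketch, assuming `search_handler` and `judge_handler` are already wired up elsewhere:

```python
# Sketch only: handler construction is elided.
orchestrator = Orchestrator(
    search_handler=search_handler,
    judge_handler=judge_handler,
    enable_analysis=True,  # no-op unless settings.modal_available is True
)

async for event in orchestrator.run("Can metformin slow Alzheimer's progression?"):
    # "analyzing" and "analysis_complete" events arrive just before "synthesizing"
    print(event.type, event.message)
```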
src/services/statistical_analyzer.py
ADDED
```python
"""Statistical analysis service using Modal code execution.

This module provides Modal-based statistical analysis WITHOUT depending on
agent_framework. This allows it to be used in the simple orchestrator mode
without requiring the magentic optional dependency.

The AnalysisAgent (in src/agents/) wraps this service for magentic mode.
"""

import asyncio
import re
from functools import lru_cache, partial
from typing import Any, Literal

from pydantic import BaseModel, Field
from pydantic_ai import Agent

from src.agent_factory.judges import get_model
from src.tools.code_execution import (
    CodeExecutionError,
    get_code_executor,
    get_sandbox_library_prompt,
)
from src.utils.models import Evidence

# Type alias for verdict values
VerdictType = Literal["SUPPORTED", "REFUTED", "INCONCLUSIVE"]


class AnalysisResult(BaseModel):
    """Result of statistical analysis."""

    verdict: VerdictType = Field(
        description="SUPPORTED, REFUTED, or INCONCLUSIVE",
    )
    confidence: float = Field(ge=0.0, le=1.0, description="Confidence in verdict (0-1)")
    statistical_evidence: str = Field(
        description="Summary of statistical findings from code execution"
    )
    code_generated: str = Field(description="Python code that was executed")
    execution_output: str = Field(description="Output from code execution")
    key_findings: list[str] = Field(default_factory=list, description="Key takeaways")
    limitations: list[str] = Field(default_factory=list, description="Limitations")


class StatisticalAnalyzer:
    """Performs statistical analysis using Modal code execution.

    This service:
    1. Generates Python code for statistical analysis using an LLM
    2. Executes the code in a Modal sandbox
    3. Interprets the results
    4. Returns a verdict (SUPPORTED/REFUTED/INCONCLUSIVE)

    Note: This class has NO agent_framework dependency, making it safe
    to use in the simple orchestrator without the magentic extra.
    """

    def __init__(self) -> None:
        """Initialize the analyzer."""
        self._code_executor: Any = None
        self._agent: Agent[None, str] | None = None

    def _get_code_executor(self) -> Any:
        """Lazy initialization of the code executor."""
        if self._code_executor is None:
            self._code_executor = get_code_executor()
        return self._code_executor

    def _get_agent(self) -> Agent[None, str]:
        """Lazy initialization of the LLM agent for code generation."""
        if self._agent is None:
            library_versions = get_sandbox_library_prompt()
            self._agent = Agent(
                model=get_model(),
                output_type=str,
                system_prompt=f"""You are a biomedical data scientist.

Generate Python code to analyze research evidence and test hypotheses.

Guidelines:
1. Use pandas, numpy, scipy.stats for analysis
2. Print clear, interpretable results
3. Include statistical tests (t-tests, chi-square, etc.)
4. Calculate effect sizes and confidence intervals
5. Keep code concise (<50 lines)
6. Set 'result' variable to SUPPORTED, REFUTED, or INCONCLUSIVE

Available libraries:
{library_versions}

Output format: Return ONLY executable Python code, no explanations.""",
            )
        return self._agent

    async def analyze(
        self,
        query: str,
        evidence: list[Evidence],
        hypothesis: dict[str, Any] | None = None,
    ) -> AnalysisResult:
        """Run statistical analysis on evidence.

        Args:
            query: The research question
            evidence: List of Evidence objects to analyze
            hypothesis: Optional hypothesis dict with drug, target, pathway, effect

        Returns:
            AnalysisResult with verdict and statistics
        """
        # Build analysis prompt (method handles slicing internally)
        evidence_summary = self._summarize_evidence(evidence)
        hypothesis_text = ""
        if hypothesis:
            hypothesis_text = (
                f"\nHypothesis: {hypothesis.get('drug', 'Unknown')} → "
                f"{hypothesis.get('target', '?')} → "
                f"{hypothesis.get('pathway', '?')} → "
                f"{hypothesis.get('effect', '?')}\n"
                f"Confidence: {hypothesis.get('confidence', 0.5):.0%}\n"
            )

        prompt = f"""Generate Python code to statistically analyze:

**Research Question**: {query}
{hypothesis_text}

**Evidence Summary**:
{evidence_summary}

Generate executable Python code to analyze this evidence."""

        try:
            # Generate code
            agent = self._get_agent()
            code_result = await agent.run(prompt)
            generated_code = code_result.output

            # Execute in Modal sandbox
            loop = asyncio.get_running_loop()
            executor = self._get_code_executor()
            execution = await loop.run_in_executor(
                None, partial(executor.execute, generated_code, timeout=120)
            )

            if not execution["success"]:
                return AnalysisResult(
                    verdict="INCONCLUSIVE",
                    confidence=0.0,
                    statistical_evidence=(
                        f"Execution failed: {execution.get('error', 'Unknown error')}"
                    ),
                    code_generated=generated_code,
                    execution_output=execution.get("stderr", ""),
                    key_findings=[],
                    limitations=["Code execution failed"],
                )

            # Interpret results
            return self._interpret_results(generated_code, execution)

        except CodeExecutionError as e:
            return AnalysisResult(
                verdict="INCONCLUSIVE",
                confidence=0.0,
                statistical_evidence=str(e),
                code_generated="",
                execution_output="",
                key_findings=[],
                limitations=[f"Analysis error: {e}"],
            )

    def _summarize_evidence(self, evidence: list[Evidence]) -> str:
        """Summarize evidence for the code generation prompt."""
        if not evidence:
            return "No evidence available."

        lines = []
        for i, ev in enumerate(evidence[:5], 1):
            content = ev.content
            truncated = content[:200] + ("..." if len(content) > 200 else "")
            lines.append(f"{i}. {truncated}")
            lines.append(f"   Source: {ev.citation.title}")
            lines.append(f"   Relevance: {ev.relevance:.0%}\n")

        return "\n".join(lines)

    def _interpret_results(
        self,
        code: str,
        execution: dict[str, Any],
    ) -> AnalysisResult:
        """Interpret code execution results."""
        stdout = execution["stdout"]
        stdout_upper = stdout.upper()

        # Extract verdict with robust word-boundary matching
        verdict: VerdictType = "INCONCLUSIVE"
        if re.search(r"\bSUPPORTED\b", stdout_upper) and not re.search(
            r"\b(?:NOT|UN)SUPPORTED\b", stdout_upper
        ):
            verdict = "SUPPORTED"
        elif re.search(r"\bREFUTED\b", stdout_upper):
            verdict = "REFUTED"

        # Extract key findings
        key_findings = []
        for line in stdout.split("\n"):
            line_lower = line.lower()
            if any(kw in line_lower for kw in ["p-value", "significant", "effect", "mean"]):
                key_findings.append(line.strip())

        # Calculate confidence from p-values
        confidence = self._calculate_confidence(stdout)

        return AnalysisResult(
            verdict=verdict,
            confidence=confidence,
            statistical_evidence=stdout.strip(),
            code_generated=code,
            execution_output=stdout,
            key_findings=key_findings[:5],
            limitations=[
                "Analysis based on summary data only",
                "Limited to available evidence",
                "Statistical tests assume data independence",
            ],
        )

    def _calculate_confidence(self, output: str) -> float:
        """Calculate confidence based on statistical results."""
        p_values = re.findall(r"p[-\s]?value[:\s]+(\d+\.?\d*)", output.lower())

        if p_values:
            try:
                min_p = min(float(p) for p in p_values)
                if min_p < 0.001:
                    return 0.95
                elif min_p < 0.01:
                    return 0.90
                elif min_p < 0.05:
                    return 0.80
                else:
                    return 0.60
            except ValueError:
                pass

        return 0.70  # Default


@lru_cache(maxsize=1)
def get_statistical_analyzer() -> StatisticalAnalyzer:
    """Get or create the singleton StatisticalAnalyzer instance (thread-safe via lru_cache)."""
    return StatisticalAnalyzer()
```
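The confidence heuristic maps the smallest printed p-value onto fixed bands. A quick illustration against hypothetical sandbox output (note it calls the private `_calculate_confidence` directly, illustration only):

```python
# Illustration only: the stdout string below is made up.
from src.services.statistical_analyzer import StatisticalAnalyzer

analyzer = StatisticalAnalyzer()
stdout = "t-test: t=2.81\np-value: 0.004\nresult: SUPPORTED"
print(analyzer._calculate_confidence(stdout))  # 0.9, the 0.001 <= p < 0.01 band
```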
src/utils/config.py
CHANGED
```diff
@@ -56,6 +56,20 @@ class Settings(BaseSettings):
 
     modal_token_id: str | None = Field(default=None, description="Modal token ID")
     modal_token_secret: str | None = Field(default=None, description="Modal token secret")
     chroma_db_path: str = Field(default="./chroma_db", description="ChromaDB storage path")
+    enable_modal_analysis: bool = Field(
+        default=False,
+        description="Opt-in flag to enable Modal analysis. Must also have modal_available=True.",
+    )
+
+    @property
+    def modal_available(self) -> bool:
+        """Check whether Modal credentials are configured (credentials check only).
+
+        Note: This is a credentials check, NOT an opt-in flag.
+        Use `enable_modal_analysis` to opt in, then check `modal_available` for credentials.
+        Typical usage: `if settings.enable_modal_analysis and settings.modal_available`
+        """
+        return bool(self.modal_token_id and self.modal_token_secret)
 
     def get_api_key(self) -> str:
         """Get the API key for the configured provider."""
```
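In practice both gates are read from `.env` by pydantic-settings. A minimal sketch, assuming the default field-name-to-env-var mapping (`ENABLE_MODAL_ANALYSIS`, `MODAL_TOKEN_ID`, `MODAL_TOKEN_SECRET`):

```python
from src.utils.config import settings

# Typical gate: the opt-in flag AND the credentials check.
if settings.enable_modal_analysis and settings.modal_available:
    ...  # safe to run Modal analysis
```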
src/utils/models.py
CHANGED
```diff
@@ -111,7 +111,9 @@ class AgentEvent(BaseModel):
         "complete",
         "error",
         "streaming",
-        "hypothesizing",
+        "hypothesizing",
+        "analyzing",  # NEW for Phase 13
+        "analysis_complete",  # NEW for Phase 13
     ]
     message: str
     data: Any = None
@@ -132,6 +134,8 @@ class AgentEvent(BaseModel):
         "error": "❌",
         "streaming": "📡",
         "hypothesizing": "🔬",  # NEW
+        "analyzing": "📊",  # NEW
+        "analysis_complete": "📈",  # NEW
     }
     icon = icons.get(self.type, "•")
     return f"{icon} **{self.type.upper()}**: {self.message}"
```
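The new event types render through the same icon table. A minimal sketch, assuming `AgentEvent` takes the fields the orchestrator passes above:

```python
from src.utils.models import AgentEvent

evt = AgentEvent(
    type="analysis_complete",
    message="Analysis verdict: SUPPORTED",
    data={"confidence": 0.9},
    iteration=3,
)
# Renders with the 📈 icon from the mapping above.
```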
tests/integration/test_modal.py
ADDED
```python
"""Integration tests for Modal (requires credentials)."""

import pytest

from src.utils.config import settings

# Check if any LLM API key is available
_llm_available = bool(settings.openai_api_key or settings.anthropic_api_key)


@pytest.mark.integration
@pytest.mark.skipif(not settings.modal_available, reason="Modal not configured")
class TestModalIntegration:
    """Integration tests requiring Modal credentials."""

    @pytest.mark.asyncio
    async def test_sandbox_executes_code(self) -> None:
        """Modal sandbox should execute Python code."""
        import asyncio
        from functools import partial

        from src.tools.code_execution import get_code_executor

        executor = get_code_executor()
        code = "import pandas as pd; print(pd.DataFrame({'a': [1,2,3]})['a'].sum())"

        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, partial(executor.execute, code, timeout=30))

        assert result["success"]
        assert "6" in result["stdout"]

    @pytest.mark.asyncio
    @pytest.mark.skipif(not _llm_available, reason="LLM API key not configured")
    async def test_statistical_analyzer_works(self) -> None:
        """StatisticalAnalyzer should work end-to-end (requires Modal + LLM)."""
        from src.services.statistical_analyzer import get_statistical_analyzer
        from src.utils.models import Citation, Evidence

        evidence = [
            Evidence(
                content="Drug shows 40% improvement in trial.",
                citation=Citation(
                    source="pubmed",
                    title="Test",
                    url="https://test.com",
                    date="2024-01-01",
                    authors=["Test"],
                ),
                relevance=0.9,
            )
        ]

        analyzer = get_statistical_analyzer()
        result = await analyzer.analyze("test drug efficacy", evidence)

        assert result.verdict in ["SUPPORTED", "REFUTED", "INCONCLUSIVE"]
        assert 0.0 <= result.confidence <= 1.0
```
tests/unit/services/test_statistical_analyzer.py
ADDED
```python
"""Unit tests for StatisticalAnalyzer service."""

from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from src.services.statistical_analyzer import (
    AnalysisResult,
    StatisticalAnalyzer,
    get_statistical_analyzer,
)
from src.utils.models import Citation, Evidence


@pytest.fixture
def sample_evidence() -> list[Evidence]:
    """Sample evidence for testing."""
    return [
        Evidence(
            content="Metformin shows effect size of 0.45.",
            citation=Citation(
                source="pubmed",
                title="Metformin Study",
                url="https://pubmed.ncbi.nlm.nih.gov/12345/",
                date="2024-01-15",
                authors=["Smith J"],
            ),
            relevance=0.9,
        )
    ]


class TestStatisticalAnalyzer:
    """Tests for StatisticalAnalyzer (no agent_framework dependency)."""

    def test_no_agent_framework_import(self) -> None:
        """StatisticalAnalyzer must NOT import agent_framework."""
        import src.services.statistical_analyzer as module

        # Check the module source never imports agent_framework
        with open(module.__file__) as f:
            source = f.read()
        assert "from agent_framework" not in source
        assert "import agent_framework" not in source
        assert "BaseAgent" not in source

    @pytest.mark.asyncio
    async def test_analyze_returns_result(self, sample_evidence: list[Evidence]) -> None:
        """analyze() should return AnalysisResult."""
        analyzer = StatisticalAnalyzer()

        with (
            patch.object(analyzer, "_get_agent") as mock_agent,
            patch.object(analyzer, "_get_code_executor") as mock_executor,
        ):
            # Mock the LLM
            mock_agent.return_value.run = AsyncMock(
                return_value=MagicMock(output="print('SUPPORTED')")
            )

            # Mock Modal
            mock_executor.return_value.execute.return_value = {
                "stdout": "SUPPORTED\np-value: 0.01",
                "stderr": "",
                "success": True,
            }

            result = await analyzer.analyze("test query", sample_evidence)

            assert isinstance(result, AnalysisResult)
            assert result.verdict == "SUPPORTED"

    def test_singleton(self) -> None:
        """get_statistical_analyzer should return a singleton."""
        a1 = get_statistical_analyzer()
        a2 = get_statistical_analyzer()
        assert a1 is a2


class TestAnalysisResult:
    """Tests for the AnalysisResult model."""

    def test_verdict_values(self) -> None:
        """Verdict should be one of the expected values."""
        for verdict in ["SUPPORTED", "REFUTED", "INCONCLUSIVE"]:
            result = AnalysisResult(
                verdict=verdict,
                confidence=0.8,
                statistical_evidence="test",
                code_generated="print('test')",
                execution_output="test",
            )
            assert result.verdict == verdict

    def test_confidence_bounds(self) -> None:
        """Confidence must be 0.0-1.0."""
        with pytest.raises(ValueError):
            AnalysisResult(
                verdict="SUPPORTED",
                confidence=1.5,  # Invalid
                statistical_evidence="test",
                code_generated="test",
                execution_output="test",
            )
```
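One caveat when extending these tests: because `get_statistical_analyzer` is backed by `functools.lru_cache`, a test that needs a fresh instance can clear the cache instead of reaching into module state. A hypothetical sketch:

```python
from src.services.statistical_analyzer import get_statistical_analyzer


def test_fresh_instance_after_cache_clear() -> None:
    first = get_statistical_analyzer()
    get_statistical_analyzer.cache_clear()  # standard lru_cache API
    assert get_statistical_analyzer() is not first
```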