"""Prompts for Report Agent."""

from typing import TYPE_CHECKING, Any

from src.utils.text_utils import select_diverse_evidence, truncate_at_sentence

if TYPE_CHECKING:
    from src.services.embeddings import EmbeddingService
    from src.utils.models import Evidence, MechanismHypothesis

SYSTEM_PROMPT = """You are a scientific writer specializing in drug repurposing research reports.

Your role is to synthesize evidence and hypotheses into a clear, structured report.

A good report:
1. Has a clear EXECUTIVE SUMMARY (one paragraph, key takeaways)
2. States the RESEARCH QUESTION clearly
3. Describes METHODOLOGY (what was searched, how)
4. Evaluates HYPOTHESES with evidence counts
5. Separates MECHANISTIC and CLINICAL findings
6. Lists specific DRUG CANDIDATES
7. Acknowledges LIMITATIONS honestly
8. Provides a balanced CONCLUSION
9. Includes properly formatted REFERENCES

Write in scientific but accessible language. Be specific about evidence strength.

─────────────────────────────────────────────────────────────────────────────
🚨 CRITICAL: REQUIRED JSON STRUCTURE 🚨
─────────────────────────────────────────────────────────────────────────────

The `hypotheses_tested` field MUST be a LIST of objects, each with these fields:
- "hypothesis": the hypothesis text
- "supported": count of supporting evidence (integer)
- "contradicted": count of contradicting evidence (integer)

Example:
  hypotheses_tested: [
    {"hypothesis": "Metformin -> AMPK -> reduced inflammation", "supported": 3, "contradicted": 1},
    {"hypothesis": "Aspirin inhibits COX-2 pathway", "supported": 5, "contradicted": 0}
  ]

The `references` field MUST be a LIST of objects, each with these fields:
- "title": paper title (string)
- "authors": author names (string)
- "source": "pubmed" or "web" (string)
- "url": the EXACT URL from evidence (string)

Example:
  references: [
    {"title": "Metformin and Cancer", "authors": "Smith et al.", "source": "pubmed", "url": "https://pubmed.ncbi.nlm.nih.gov/12345678/"}
  ]

─────────────────────────────────────────────────────────────────────────────
🚨 CRITICAL CITATION REQUIREMENTS 🚨
─────────────────────────────────────────────────────────────────────────────

You MUST follow these rules for the References section:

1. You may ONLY cite papers that appear in the Evidence section above
2. Every reference URL must EXACTLY match a provided evidence URL
3. Do NOT invent, fabricate, or hallucinate any references
4. Do NOT modify paper titles, authors, dates, or URLs
5. If unsure about a citation, OMIT it rather than guess
6. Copy URLs exactly as provided - do not create similar-looking URLs

VIOLATION OF THESE RULES PRODUCES DANGEROUS MISINFORMATION.
─────────────────────────────────────────────────────────────────────────────"""


async def format_report_prompt(
    query: str,
    evidence: list["Evidence"],
    hypotheses: list["MechanismHypothesis"],
    assessment: dict[str, Any],
    metadata: dict[str, Any],
    embeddings: "EmbeddingService | None" = None,
) -> str:
    """Format prompt for report generation.

    Includes full evidence details for accurate citation.
    """
    # Select diverse evidence (not arbitrary truncation)
    selected = await select_diverse_evidence(evidence, n=20, query=query, embeddings=embeddings)

    # Include FULL citation details for each evidence item
    # This helps the LLM create accurate references
    evidence_lines = []
    for e in selected:
        authors = ", ".join(e.citation.authors or ["Unknown"])
        evidence_lines.append(
            f"- **Title**: {e.citation.title}\n"
            f"  **URL**: {e.citation.url}\n"
            f"  **Authors**: {authors}\n"
            f"  **Date**: {e.citation.date or 'n.d.'}\n"
            f"  **Source**: {e.citation.source}\n"
            f"  **Content**: {truncate_at_sentence(e.content, 200)}\n"
        )
    evidence_summary = "\n".join(evidence_lines)

    if hypotheses:
        hypotheses_lines = []
        for h in hypotheses:
            hypotheses_lines.append(
                f"- {h.drug} -> {h.target} -> {h.pathway} -> {h.effect} "
                f"(Confidence: {h.confidence:.0%})"
            )
        hypotheses_summary = "\n".join(hypotheses_lines)
    else:
        hypotheses_summary = "No hypotheses generated yet."

    sources = ", ".join(metadata.get("sources", []))

    return f"""Generate a structured research report for the following query.

## Original Query
{query}

## Evidence Collected ({len(selected)} papers, selected for diversity)

{evidence_summary}

## Hypotheses Generated
{hypotheses_summary}

## Assessment Scores
- Mechanism Score: {assessment.get("mechanism_score", "N/A")}/10
- Clinical Evidence Score: {assessment.get("clinical_score", "N/A")}/10
- Overall Confidence: {assessment.get("confidence", 0):.0%}

## Metadata
- Sources Searched: {sources}
- Search Iterations: {metadata.get("iterations", 0)}

Generate a complete ResearchReport with all sections filled in.

REMINDER: Only cite papers from the Evidence section above. Copy URLs exactly."""
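

# ---------------------------------------------------------------------------
# Usage sketch (commented out, illustrative only): roughly how an agent might
# call format_report_prompt. The variable names below are hypothetical, and
# the Evidence / MechanismHypothesis objects are assumed to expose the
# attributes accessed above (citation.title, citation.url, drug, target,
# pathway, effect, confidence); the actual constructors live in
# src.utils.models and may differ.
#
#   import asyncio
#
#   prompt = asyncio.run(
#       format_report_prompt(
#           query="Can metformin be repurposed for colorectal cancer?",
#           evidence=collected_evidence,          # list[Evidence] from search agents
#           hypotheses=generated_hypotheses,      # list[MechanismHypothesis]
#           assessment={"mechanism_score": 7, "clinical_score": 5, "confidence": 0.6},
#           metadata={"sources": ["pubmed", "web"], "iterations": 2},
#           embeddings=None,                      # optional EmbeddingService (None is the default)
#       )
#   )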