File size: 2,915 Bytes
3139749
 
 
 
 
 
 
 
 
 
 
 
 
 
f1e4e5b
 
 
3139749
 
 
 
 
 
f1e4e5b
 
 
 
3139749
f1e4e5b
3139749
 
 
f1e4e5b
3139749
 
 
f1e4e5b
3139749
 
 
 
 
 
 
 
 
 
 
 
f1e4e5b
 
3139749
 
 
f1e4e5b
 
 
 
 
3139749
 
f1e4e5b
3139749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""Citation validation to prevent LLM hallucination.

CRITICAL: Medical research requires accurate citations.
This module validates that all references exist in collected evidence.
"""

import logging
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from src.utils.models import Evidence, ResearchReport

logger = logging.getLogger(__name__)

# Max characters to display for URLs in log messages
_MAX_URL_DISPLAY_LENGTH = 80


def validate_references(report: "ResearchReport", evidence: list["Evidence"]) -> "ResearchReport":
    """Drop every reference that cannot be traced back to collected evidence.

    CRITICAL: Prevents LLM hallucination of citations.

    A reference is kept when its ``url`` equals the URL of some evidence
    citation, or, as a fallback, when its ``title`` equals an evidence
    citation title (case-insensitive, exact match). Missing, ``None`` or empty
    URLs/titles never count as a match, on either side of the comparison.

    Note:
        This function MUTATES report.references in-place (the attribute is
        rebound to the filtered list) and returns the same report object.
        This is intentional for efficiency.

    Args:
        report: The generated research report (will be mutated). Its
            ``references`` are dict-like entries read via ``.get("url")`` and
            ``.get("title")``.
        evidence: All evidence collected during research.

    Returns:
        The same report object with references updated in-place.
    """
    # Empty/None values are excluded from the lookup sets so that a reference
    # with a missing field can never "match" evidence with the same gap.
    valid_urls = {e.citation.url for e in evidence if e.citation.url}
    valid_titles = {e.citation.title.lower() for e in evidence if e.citation.title}

    validated_refs = []
    removed_count = 0

    for ref in report.references:
        # `or ""` also covers keys that are present but explicitly None.
        ref_url = ref.get("url") or ""
        ref_title = (ref.get("title") or "").lower()

        # URL match first; exact case-insensitive title match as fallback.
        if ref_url in valid_urls or ref_title in valid_titles:
            validated_refs.append(ref)
            continue

        removed_count += 1
        # Truncate URL for display so one bogus reference cannot flood the log
        if len(ref_url) > _MAX_URL_DISPLAY_LENGTH:
            url_display = ref_url[:_MAX_URL_DISPLAY_LENGTH] + "..."
        else:
            url_display = ref_url
        logger.warning(
            "Removed hallucinated reference: '%s' (URL: %s)",
            ref.get("title", "Unknown"),
            url_display,
        )

    if removed_count > 0:
        logger.info(
            "Citation validation removed %s hallucinated references. "
            "%s valid references remain.",
            removed_count,
            len(validated_refs),
        )

    # Update report with validated references
    report.references = validated_refs
    return report


def build_reference_from_evidence(evidence: "Evidence") -> dict[str, str]:
    """Create a reference dict directly from an evidence item's citation.

    Building references this way keeps them identical to the original
    evidence. A falsy author list is rendered as "Unknown" and a falsy date
    as "n.d.".
    """
    citation = evidence.citation

    reference = {"title": citation.title}

    # Authors are flattened into one comma-separated string.
    author_names = citation.authors
    if not author_names:
        author_names = ["Unknown"]
    reference["authors"] = ", ".join(author_names)

    reference["source"] = citation.source

    published = citation.date
    reference["date"] = published if published else "n.d."

    reference["url"] = citation.url
    return reference