Commit ad56f36
Parent(s): 7e3f286
Add system
- Dockerfile +10 -0
- README.md +6 -7
- app.py +342 -0
- requirements.txt +25 -0
- system/.DS_Store +0 -0
- system/__init__.py +0 -0
- system/__pycache__/augmented_searching.cpython-312.pyc +0 -0
- system/__pycache__/ee.cpython-312.pyc +0 -0
- system/__pycache__/generate_output.cpython-312.pyc +0 -0
- system/__pycache__/hero_pipeline.cpython-312.pyc +0 -0
- system/__pycache__/html2lines.cpython-312.pyc +0 -0
- system/__pycache__/initial_searching.cpython-312.pyc +0 -0
- system/__pycache__/process_time.cpython-312.pyc +0 -0
- system/__pycache__/scraper.cpython-312.pyc +0 -0
- system/augmented_searching.py +98 -0
- system/baseline/hyde_fc_generation_optimized.py +163 -0
- system/baseline/question_generation_optimized.py +244 -0
- system/baseline/reranking_optimized.py +230 -0
- system/baseline/retrieval_optimized.py +244 -0
- system/baseline/train.json +0 -0
- system/date_verifier.py +58 -0
- system/ee.py +104 -0
- system/generate_output.py +75 -0
- system/hero_pipeline.py +150 -0
- system/html2lines.py +105 -0
- system/initial_searching.py +100 -0
- system/pledge_tracking.py +247 -0
- system/process_time.py +267 -0
- system/scraper.py +95 -0
- test.html +553 -0
Dockerfile
ADDED
@@ -0,0 +1,10 @@
FROM python:3.10-slim

WORKDIR /app
COPY . /app

RUN pip install --no-cache-dir flask flask-cors pandas openpyxl

EXPOSE 7860

CMD ["python", "app.py"]
README.md
CHANGED
@@ -1,12 +1,11 @@
 ---
 title: Pledge Tracker
-
-
-colorTo: yellow
+colorFrom: purple
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.34.0
 app_file: app.py
 pinned: false
-
-
-
+license: cc-by-nc-4.0
+short_description: Track and fact-check pledges with supporting evidence.
+---
app.py
ADDED
@@ -0,0 +1,342 @@
from flask import Flask, jsonify, send_file, request, send_from_directory
from flask_cors import CORS
import os, json, uuid, time
import pandas as pd
from datetime import datetime, timedelta
from huggingface_hub import HfApi
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from system.pledge_tracking import run_pipeline
from huggingface_hub import hf_hub_download
import spacy
import traceback
import threading

nlp = spacy.load("en_core_web_sm")

app = Flask(__name__, static_folder='.')
CORS(app)

HF_DATASET_REPO = "PledgeTracker/demo_feedback"
HF_TOKEN = os.environ.get("HF_TOKEN")
TMP_DIR = "tmp"
FEEDBACK_DIR = "feedback_logs"
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(FEEDBACK_DIR, exist_ok=True)

REFERENCE_PLEDGES = []

REFERENCE_PLEDGE_PATH = hf_hub_download(
    repo_id="PledgeTracker/demo_feedback",
    filename="existing_pledges.txt",
    repo_type="dataset",
    token=os.environ["HF_TOKEN"]
)

if os.path.exists(REFERENCE_PLEDGE_PATH):
    with open(REFERENCE_PLEDGE_PATH, "r") as f:
        REFERENCE_PLEDGES = [line.strip() for line in f if line.strip()]
else:
    print(f"Missing reference pledge file: {REFERENCE_PLEDGE_PATH}")


def lemmatize(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])


@app.route("/api/similar-pledges", methods=["POST"])
def similar_pledges():
    data = request.get_json()
    claim = data.get("claim", "").strip()
    if not claim or not REFERENCE_PLEDGES:
        return jsonify({"suggestions": []})

    all_pledges = [claim] + REFERENCE_PLEDGES
    lemmatized_pledges = [lemmatize(p) for p in all_pledges]

    vectorizer = TfidfVectorizer().fit_transform(lemmatized_pledges)
    similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
    filtered = [(i, similarities[i]) for i in range(len(similarities)) if similarities[i] > 0.3]
    top_filtered = sorted(filtered, key=lambda x: x[1], reverse=True)[:5]

    suggestions = [
        {"text": REFERENCE_PLEDGES[i], "index": int(i)}
        for i, score in top_filtered
    ]

    return jsonify({"suggestions": suggestions})


def calculate_time_range(option: str, pledge_date: str = None):
    today = datetime.today()
    # pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")

    if isinstance(pledge_date, str):
        pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
    elif not isinstance(pledge_date, datetime):
        raise ValueError("pledge_date must be a str or datetime")

    if option == "week":
        one_week_ago = today - timedelta(days=7)
        start = max(one_week_ago, pledge_date)
    elif option == "month":
        one_month_ago = today - timedelta(days=30)
        start = max(one_month_ago, pledge_date)
    elif option == "since_pledge_date":
        if not pledge_date:
            raise ValueError("Pledge date is required for 'since_pledge_date' option")
        start = pledge_date
    else:
        raise ValueError("Invalid time range option")
    print(start, pledge_date)
    return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")

@app.route("/")
def serve_html():
    return send_from_directory('.', 'test.html')

@app.route("/api/status")
def check_status():
    user_id = request.args.get("user_id")
    timestamp = request.args.get("timestamp")
    log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")
    if not os.path.exists(log_file_path):
        return jsonify({"status": {}}), 200
    try:
        with open(log_file_path, "r") as f:
            status = json.load(f)
    except Exception:
        status = {}

    return jsonify({"status": status})


@app.route("/api/run-model", methods=["POST"])
def run_model():
    data = request.get_json()
    claim = data.get("claim", "no input")
    time_range_option = data.get("time_range", "month")
    system_start_time = datetime.now()

    suggestion_meta = data.get("suggestion_meta")
    pledge_date = data.get("pledge_date", "")
    pledge_author = data.get("pledge_author", "")
    timestamp = data.get("timestamp") or time.strftime("%Y-%m-%d_%H-%M-%S")
    user_id = data.get("user_id") or str(uuid.uuid4())[:8]

    log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")

    status_lock = threading.Lock()

    def update_status(step_id, msg):
        print(f"[STATUS] Step {step_id}: {msg}")
        with status_lock:
            if os.path.exists(log_file_path):
                try:
                    with open(log_file_path, "r") as f:
                        current = json.load(f)
                except Exception:
                    current = {}
            else:
                current = {}
            current[str(step_id)] = f"{msg}"
            with open(log_file_path, "w") as f:
                json.dump(current, f, indent=2)

    try:
        time_start, time_end = calculate_time_range(time_range_option, pledge_date=pledge_date)
        print(f"[DEMO] Received claim: {claim}")
        print(f"[DEMO] Time range: {time_start} ~ {time_end}")
        print(f"[DEMO] Pledge date range: {pledge_date}")

        # user_id = str(uuid.uuid4())[:8]
        # outputs = run_pipeline(claim, pledge_date, pledge_author, time_start, timestamp, user_id)

        update_status(0, "Starting the system ...")
        print(suggestion_meta)

        outputs = run_pipeline(
            claim, pledge_date, pledge_author, time_start, timestamp, user_id,
            update_fn=update_status, suggestion_meta=suggestion_meta
        )

        df = pd.read_excel(outputs["sorted_events"])
        json_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}.json")
        df.to_json(json_path, orient="records", indent=2)

        system_end_time = datetime.now()
        runtime = system_end_time - system_start_time

        events = df.to_dict(orient="records")
        log_entry = {
            "requested_time": timestamp,
            "user_id": user_id,
            "pledge": claim,
            "suggestion_meta": suggestion_meta,
            "time_start": time_start,
            "time_end": time_end,
            "runtime": runtime.total_seconds(),
            "pledge_author": pledge_author,
            "pledge_date": pledge_date,
            "events": events
        }
        default_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"

        with open(default_log_path, "w") as f:
            f.write(json.dumps(log_entry, indent=1))

        tsv_path = outputs["augmented_tsv_file"]

        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=default_log_path,
                path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
            api.upload_file(
                path_or_fileobj=tsv_path,
                path_in_repo=f"logs/augmented_{timestamp}_{user_id}.tsv",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )

        except Exception as e:
            traceback.print_exc()
            print(f"[Default Feedback Upload Error] {e}")

        return jsonify({
            "status": "success",
            "file": f"{timestamp}_{user_id}.json",
            "user_id": user_id,
            "timestamp": timestamp
        })
    except Exception as e:
        traceback.print_exc()
        return jsonify({"status": "error", "detail": str(e)}), 500

@app.route("/api/events")
def get_events():
    filename = request.args.get("file")
    file_path = os.path.join(TMP_DIR, filename)

    if not os.path.exists(file_path):
        return jsonify({"error": "File not found"}), 404

    with open(file_path, "r") as f:
        events = json.load(f)

    return jsonify(events)


@app.route("/api/feedback", methods=["POST"])
def receive_feedback():
    data = request.get_json()
    pledge = data.get("pledge", "no_pledge_text")
    feedback_list = data.get("feedback", [])
    filename = data.get("file")
    file_path = os.path.join(TMP_DIR, filename)

    timestamp = data.get("timestamp")
    user_id = data.get("user_id")

    if not user_id or not timestamp:
        return jsonify({'status': 'error', 'detail': 'Missing user_id or timestamp'}), 400

    if not os.path.exists(file_path):
        return jsonify({"error": "Event file not found"}), 400

    with open(file_path, "r") as f:
        events = json.load(f)

    suggestion_meta = None
    time_start = None
    time_end = None
    try:
        prev_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
        with open(prev_log_path, "r") as f:
            previous_log = json.load(f)
        suggestion_meta = previous_log.get("suggestion_meta")
        time_start = previous_log.get("time_start")
        time_end = previous_log.get("time_end")
        pledge_author = previous_log.get("pledge_author")
        pledge_date = previous_log.get("pledge_date")
        runtime = previous_log.get("runtime")
    except Exception:
        pass

    feedback_dict = {int(item['eventIndex']): item['answer'] for item in feedback_list}
    for idx, event in enumerate(events):
        event["user_feedback"] = feedback_dict.get(idx)

    log_entry = {
        "requested_time": timestamp,
        "user_id": user_id,
        "pledge": pledge,
        "suggestion_meta": suggestion_meta,
        "time_start": time_start,
        "time_end": time_end,
        "runtime": runtime,
        "pledge_author": pledge_author,
        "pledge_date": pledge_date,
        "events": events
    }

    local_filename = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
    with open(local_filename, "w") as f:
        f.write(json.dumps(log_entry, indent=1))

    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=local_filename,
            path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )
    except Exception as e:
        return jsonify({'status': 'partial_success', 'error': str(e)}), 500

    return jsonify({'status': 'success'})


@app.route("/download-feedback/<filename>")
def download_feedback_file(filename):
    return send_from_directory(FEEDBACK_DIR, filename, as_attachment=True)

@app.route("/feedback-files")
def list_feedback_files():
    files = os.listdir(FEEDBACK_DIR)
    return jsonify(sorted(files))

@app.route("/download")
def download_excel():
    file = request.args.get("file")
    if not file:
        return "Missing file param", 400

    json_path = os.path.join(TMP_DIR, file)
    if not os.path.exists(json_path):
        return "Event file not found", 404

    with open(json_path, "r") as f:
        data = json.load(f)

    df = pd.DataFrame(data)
    xlsx_path = os.path.join(TMP_DIR, file.replace(".json", ".xlsx"))
    df.to_excel(xlsx_path, index=False)

    return send_file(xlsx_path, as_attachment=True)


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=7860)
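A minimal client sketch for the Flask endpoints above; the front-end page (test.html) normally drives this flow, and the base URL, payload values, and claim text here are assumptions for illustration only.

# Hypothetical client sketch for the app.py endpoints (URL and payload values are assumptions).
import requests

BASE = "http://localhost:7860"  # assumed local run, matching EXPOSE 7860 in the Dockerfile

payload = {
    "claim": "Example pledge text",      # assumed example input
    "time_range": "since_pledge_date",
    "pledge_date": "2024-07-04",         # assumed example date
    "pledge_author": "Example author",
}
resp = requests.post(f"{BASE}/api/run-model", json=payload).json()

# The front-end polls /api/status concurrently via update_status(); here we read it once after the run.
status = requests.get(f"{BASE}/api/status",
                      params={"user_id": resp["user_id"], "timestamp": resp["timestamp"]}).json()

# Fetch the sorted events produced by the pipeline.
events = requests.get(f"{BASE}/api/events", params={"file": resp["file"]}).json()
print(len(events), "events", status)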
requirements.txt
ADDED
@@ -0,0 +1,25 @@
flask
flask_cors
pandas
openpyxl
huggingface_hub
PyMuPDF==1.23.25
huggingface_hub==0.30.2
lxml==5.3.1
nltk==3.9.1
numpy==2.2.6
openai==1.84.0
pandas==2.3.0
rank_bm25==0.2.2
Requests==2.32.3
scikit_learn==1.7.0
sentence_transformers==3.3.1
spacy==3.8.2
tiktoken==0.7.0
torch==2.6.0
tqdm
trafilatura==2.0.0
transformers==4.51.3
vllm==0.8.4
accelerate
system/.DS_Store
ADDED
Binary file (8.2 kB)

system/__init__.py
ADDED
File without changes

system/__pycache__/augmented_searching.cpython-312.pyc
ADDED
Binary file (4.73 kB)

system/__pycache__/ee.cpython-312.pyc
ADDED
Binary file (4.71 kB)

system/__pycache__/generate_output.cpython-312.pyc
ADDED
Binary file (3.47 kB)

system/__pycache__/hero_pipeline.cpython-312.pyc
ADDED
Binary file (6.22 kB)

system/__pycache__/html2lines.cpython-312.pyc
ADDED
Binary file (3.15 kB)

system/__pycache__/initial_searching.cpython-312.pyc
ADDED
Binary file (5.25 kB)

system/__pycache__/process_time.cpython-312.pyc
ADDED
Binary file (8.95 kB)

system/__pycache__/scraper.cpython-312.pyc
ADDED
Binary file (4.76 kB)
system/augmented_searching.py
ADDED
@@ -0,0 +1,98 @@
import json
import os
import requests
import pandas as pd
from pathlib import Path
from .date_verifier import is_after_start

def google_search(query, api_key, search_engine_id, start_date, end_date):
    # print(f"[SYSTEM] Calling Google Search API for: {query}")
    sort = f"date:r:{start_date}:{end_date}"  # e.g. 20241230:20250130
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,
        "sort": sort,
        "cr": "countryUK",
        "gl": "uk"
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.json().get("items", [])
    except Exception as e:
        print(f"[ERROR] Google Search Failed: {e}")
        return []

def save_tsv(file_name, id_value, string_value, value_list, query):

    data = {
        'ID': id_value,
        'String': string_value,
        'ListValue': value_list,
        'query': query
    }
    df = pd.DataFrame(data)
    df.to_csv(file_name, sep='\t', index=False, header=False)

def ensure_directory_exists(path):
    dir_path = Path(path).expanduser().resolve().parent
    if not str(dir_path).startswith("/home") and not str(dir_path).startswith("/data") and not str(dir_path).startswith("outputs"):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)

def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, pledge_date, start_date, end_date):
    if suggestion_meta is None:
        qa_lines = open(f"{qa_file}", "r").read()
        qa_lines = json.loads(qa_lines)
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        idx = 0
    else:
        # claim_text = suggestion_meta["text"]
        idx = suggestion_meta["index"]
        qa_lines = open(f"{qa_file}", "r").readlines()[idx]
        qa_lines = json.loads(qa_lines)
        claim_text = f"{qa_lines['claim']}"

    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")

    # base_dir = pipeline_base_dir

    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)

    urls = []
    string_values = []
    queries = []
    questions = []
    questions = [evidence["question"] for evidence in qa_lines["evidence"] if evidence["question"] not in questions]
    questions = questions[:10]

    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(f"{pledge_author}: {claim_text}")

    for question in questions:
        results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
        for result in results:
            if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
                string_values.append("question")
                urls.append(result["link"])
                queries.append(f"{question}")

    urls = list(dict.fromkeys(urls))

    save_tsv(str(tsv_file_path), [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
    return str(tsv_file_path)
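google_search above restricts results to a date window via the Custom Search sort=date:r:START:END parameter; a minimal sketch of calling it directly, assuming GOOGLE_API_KEY and GOOGLE_SEARCH_CX are set and with a made-up query and date range.

# Hypothetical direct call to google_search (query text and dates are assumptions).
import os
from system.augmented_searching import google_search

items = google_search(
    "government pledge housing targets",   # assumed example query
    os.environ["GOOGLE_API_KEY"],
    os.environ["GOOGLE_SEARCH_CX"],
    "20240101",                            # start_date in YYYYMMDD, as used by run_augmented_searching
    "20240201",                            # end_date in YYYYMMDD
)
print([item["link"] for item in items])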
system/baseline/hyde_fc_generation_optimized.py
ADDED
@@ -0,0 +1,163 @@
from vllm import LLM, SamplingParams
import json
import torch
import time
from datetime import datetime, timedelta
import argparse
from tqdm import tqdm
from typing import List, Dict, Any
import concurrent.futures

class VLLMGenerator:
    def __init__(self, model_name: str, n: int = 8, max_tokens: int = 512,
                 temperature: float = 0.7, top_p: float = 1.0,
                 frequency_penalty: float = 0.0, presence_penalty: float = 0.0,
                 stop: List[str] = ['\n\n\n'], batch_size: int = 32):
        self.device_count = torch.cuda.device_count()
        print(f"Initializing with {self.device_count} GPUs")
        self.llm = LLM(
            model=model_name,
            tensor_parallel_size=self.device_count,
            max_model_len=4096,
            gpu_memory_utilization=0.95,
            enforce_eager=True,
            trust_remote_code=True,
            # quantization="bitsandbytes",
            # dtype="half",
            # load_format="bitsandbytes",
            max_num_batched_tokens=4096,
            max_num_seqs=batch_size
        )
        self.sampling_params = SamplingParams(
            n=n,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            stop=stop,
            logprobs=1
        )
        self.batch_size = batch_size
        self.tokenizer = self.llm.get_tokenizer()
        print(f"Initialization complete. Batch size: {batch_size}")

    def parse_response(self, responses):
        all_outputs = []
        for response in responses:
            to_return = []
            for output in response.outputs:
                text = output.text.strip()
                try:
                    logprob = sum(logprob_obj.logprob for item in output.logprobs for logprob_obj in item.values())
                except:
                    logprob = 0  # Fallback if logprobs aren't available
                to_return.append((text, logprob))
            texts = [r[0] for r in sorted(to_return, key=lambda tup: tup[1], reverse=True)]
            all_outputs.append(texts)
        return all_outputs

    def prepare_prompt(self, claim: str, model_name: str) -> str:
        base_prompt = f"Please write a fact-checking article passage to support, refute, indicate not enough evidence, or present conflicting evidence regarding the claim.\nClaim: {claim}"

        if "OLMo" in model_name:
            return base_prompt
        else:
            messages = [{"role": "user", "content": base_prompt}]
            return self.tokenizer.apply_chat_template(messages, tokenize=False) + "<|start_header_id|>assistant<|end_header_id|>\n\nPassage: "

    def process_batch(self, batch: List[Dict[str, Any]], model_name: str) -> tuple[List[Dict[str, Any]], float]:
        start_time = time.time()
        prompts = [self.prepare_prompt(example["claim"], model_name) for example in batch]

        try:
            results = self.llm.generate(prompts, sampling_params=self.sampling_params)
            outputs = self.parse_response(results)

            for example, output in zip(batch, outputs):
                example['hypo_fc_docs'] = output

            batch_time = time.time() - start_time
            return batch, batch_time
        except Exception as e:
            print(f"Error processing batch: {str(e)}")
            return batch, time.time() - start_time

# def format_time(seconds: float) -> str:
#     return str(timedelta(seconds=int(seconds)))

# def estimate_completion_time(start_time: float, processed_examples: int, total_examples: int) -> str:
#     elapsed_time = time.time() - start_time
#     examples_per_second = processed_examples / elapsed_time
#     remaining_examples = total_examples - processed_examples
#     estimated_remaining_seconds = remaining_examples / examples_per_second
#     completion_time = datetime.now() + timedelta(seconds=int(estimated_remaining_seconds))
#     return completion_time.strftime("%Y-%m-%d %H:%M:%S")

def main(args):
    total_start_time = time.time()
    print(f"Script started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Load data
    print("Loading data...")
    with open(args.target_data, 'r', encoding='utf-8') as json_file:
        examples = json.load(json_file)
    print(f"Loaded {len(examples)} examples")

    # Initialize generator
    print("Initializing generator...")
    generator = VLLMGenerator(
        model_name=args.model,
        batch_size=32
    )

    # Process data in batches
    processed_data = []
    # batch_times = []
    batches = [examples[i:i + generator.batch_size] for i in range(0, len(examples), generator.batch_size)]

    print(f"\nProcessing {len(batches)} batches...")
    with tqdm(total=len(examples), desc="Processing examples") as pbar:
        for batch_idx, batch in enumerate(batches, 1):
            processed_batch, batch_time = generator.process_batch(batch, args.model)
            processed_data.extend(processed_batch)
            # batch_times.append(batch_time)

            # Update progress and timing information
            # examples_processed = len(processed_data)
            # avg_batch_time = sum(batch_times) / len(batch_times)
            # estimated_completion = estimate_completion_time(total_start_time, examples_processed, len(examples))

            # pbar.set_postfix({
            #     'Batch': f"{batch_idx}/{len(batches)}",
            #     'Avg Batch Time': f"{avg_batch_time:.2f}s",
            #     'ETA': estimated_completion
            # })
            # pbar.update(len(batch))

    # Calculate and display timing statistics
    # total_time = time.time() - total_start_time
    # avg_batch_time = sum(batch_times) / len(batch_times)
    # avg_example_time = total_time / len(examples)

    # print("\nTiming Statistics:")
    # print(f"Total Runtime: {format_time(total_time)}")
    # print(f"Average Batch Time: {avg_batch_time:.2f} seconds")
    # print(f"Average Time per Example: {avg_example_time:.2f} seconds")
    # print(f"Throughput: {len(examples)/total_time:.2f} examples/second")

    # Save results
    # print("\nSaving results...")
    with open(args.json_output, "w", encoding="utf-8") as output_json:
        json.dump(processed_data, output_json, ensure_ascii=False, indent=4)

    # print(f"Script completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    # print(f"Total runtime: {format_time(total_time)}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--target_data', default='data_store/averitec/dev.json')
    parser.add_argument('-o', '--json_output', default='data_store/hyde_fc.json')
    parser.add_argument('-m', '--model', default="meta-llama/Llama-3.1-8B-Instruct")
    args = parser.parse_args()
    main(args)
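VLLMGenerator generates hypothetical fact-checking passages per claim (a HyDE-style step) and stores them under hypo_fc_docs; a rough driver sketch follows, assuming the module is importable under this path, vllm and a GPU are available, and using a made-up input record.

# Hypothetical driver sketch for VLLMGenerator (import path, model name, and claim are assumptions).
from system.baseline.hyde_fc_generation_optimized import VLLMGenerator

examples = [{"claim": "The government pledged to build 1.5 million homes"}]  # assumed input shape
generator = VLLMGenerator(model_name="meta-llama/Llama-3.1-8B-Instruct", batch_size=2)
processed, elapsed = generator.process_batch(examples, "meta-llama/Llama-3.1-8B-Instruct")
print(processed[0]["hypo_fc_docs"][0], f"({elapsed:.1f}s)")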
system/baseline/question_generation_optimized.py
ADDED
@@ -0,0 +1,244 @@
import os
import argparse
import time
import json
import nltk
from rank_bm25 import BM25Okapi
import numpy as np
import torch
from vllm import LLM, SamplingParams
from datetime import datetime, timedelta
from itertools import islice


def download_nltk_data(package_name, download_dir='nltk_data'):
    # Ensure the download directory exists
    os.makedirs(download_dir, exist_ok=True)

    # Set NLTK data path
    nltk.data.path.append(download_dir)

    try:
        # Try to find the resource
        nltk.data.find(f'tokenizers/{package_name}')
        print(f"Package '{package_name}' is already downloaded")
    except LookupError:
        # If resource isn't found, download it
        print(f"Downloading {package_name}...")
        nltk.download(package_name, download_dir=download_dir)
        print(f"Successfully downloaded {package_name}")

# def format_time(seconds):
#     """Format time duration nicely."""
#     return str(timedelta(seconds=round(seconds)))

def claim2prompts(example):
    claim = example["claim"]
    claim_str = "Example [NUMBER]:||Claim: " + claim + "||Evidence: "

    for question in example["questions"]:
        q_text = question["question"].strip()
        if len(q_text) == 0:
            continue

        if not q_text[-1] == "?":
            q_text += "?"

        answer_strings = []

        for a in question["answers"]:
            if a["answer_type"] in ["Extractive", "Abstractive"]:
                answer_strings.append(a["answer"])
            if a["answer_type"] == "Boolean":
                answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())

        for a_text in answer_strings:
            if not a_text[-1] in [".", "!", ":", "?"]:
                a_text += "."

            prompt_lookup_str = a_text
            this_q_claim_str = claim_str + a_text.strip() + "||Question: " + q_text
            yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n")[:1500])

def main(args):
    # script_start = time.time()
    # start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # print(f"Script started at: {start_time}")
    # print(f"Loading model: {args.model}")

    download_nltk_data('punkt')
    download_nltk_data('punkt_tab')

    # Load and prepare reference corpus
    # corpus_start = time.time()
    with open(args.reference_corpus, "r", encoding="utf-8") as json_file:
        train_examples = json.load(json_file)

    prompt_corpus, tokenized_corpus = [], []
    for example in train_examples:
        for lookup_str, prompt in claim2prompts(example):
            entry = nltk.word_tokenize(lookup_str)
            tokenized_corpus.append(entry)
            prompt_corpus.append(prompt)

    prompt_bm25 = BM25Okapi(tokenized_corpus)
    # print(f"Reference corpus processed in: {format_time(time.time() - corpus_start)}")

    # Initialize vLLM with optimized settings
    gpu_count = torch.cuda.device_count()
    print(f"Using {gpu_count} GPU{'s' if gpu_count > 1 else ''}")

    # model_start = time.time()
    llm = LLM(
        model=args.model,
        tensor_parallel_size=gpu_count,
        max_model_len=4096,
        gpu_memory_utilization=0.95,
        enforce_eager=True,
        trust_remote_code=True,
        # dtype="half",
    )
    llm.get_tokenizer().pad_token = "<|end_of_text|>"
    # print(f"Model loaded in: {format_time(time.time() - model_start)}")

    sampling_params = SamplingParams(
        temperature=0.6,
        top_p=0.9,
        top_k=1,
        skip_special_tokens=False,
        max_tokens=512,
        stop=['<|end_of_text|>', '</s>', '<|im_end|>', '[INST]', '[/INST]', '<|eot_id|>', '<|end|>', '<|endoftext|>']
    )

    # processing_start = time.time()

    # Load target data
    target_examples = []
    with open(args.top_k_target_knowledge, "r", encoding="utf-8") as json_file:
        for line in json_file:
            target_examples.append(json.loads(line))

    if args.end == -1:
        args.end = len(target_examples)
    print(f"Processing {args.end} examples")

    # Process in batches
    with torch.no_grad():
        with open(args.output_questions, "w", encoding="utf-8") as output_file:
            for idx in range(0, args.end, args.batch_size):
                batch_end = min(idx + args.batch_size, args.end)
                current_batch = target_examples[idx:batch_end]
                print(f"\nProcessing batch {idx}-{batch_end}...")

                for example in current_batch:
                    # batch_start = time.time()
                    claim = example["claim"]
                    claim_id = example["claim_id"]
                    top_k_sentences_urls = example[f"top_{args.top_k}"]

                    batch_prompts = []
                    batch_metadata = []

                    # Prepare all prompts for current example
                    for sentences_urls in top_k_sentences_urls:
                        prompt_lookup_str = sentences_urls["sentence"]
                        url = sentences_urls["url"]

                        prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
                        prompt_n = 10
                        prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
                        prompt_docs = [prompt_corpus[i] for i in prompt_top_n]

                        temp_prompt = "\n\n".join(prompt_docs)
                        for k in range(1, temp_prompt.count("[NUMBER]") + 1):
                            temp_prompt = temp_prompt.replace("[NUMBER]", f"{k}", 1)

                        claim_prompt = "Your task is to generate a question based on the given claim and evidence. The question should clarify the relationship between the evidence and the claim\n\n"
                        evidence = prompt_lookup_str.replace("\n", " ")
                        full_prompt = claim_prompt + temp_prompt + "\n\nNow, generate a question that links the following claim and evidence:" + f"\n\nClaim: {claim}" + f"\nEvidence: {evidence}"

                        if "OLMo" in args.model:
                            inputs = [full_prompt]
                        else:
                            messages = [{"role": "user", "content": full_prompt}]
                            inputs = llm.get_tokenizer().apply_chat_template(messages, tokenize=False)
                            inputs += "<|start_header_id|>assistant<|end_header_id|>\n\nQuestion: "

                        batch_prompts.append(inputs)
                        batch_metadata.append((url, prompt_lookup_str))

                    # Process batch
                    outputs = llm.generate(batch_prompts, sampling_params)

                    # Process outputs
                    evidence = []
                    for output, (url, sent) in zip(outputs, batch_metadata):
                        question = output.outputs[0].text.strip().split("?")[0].replace("\n", " ") + "?"
                        evidence.append({
                            "question": question,
                            "answer": sent,
                            "url": url
                        })

                    # Write results
                    json_data = {
                        "claim_id": claim_id,
                        "claim": claim,
                        "evidence": evidence
                    }
                    output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                    output_file.flush()

                    # batch_time = time.time() - batch_start
                    # print(f"Processed example {claim_id}. Time elapsed: {batch_time:.2f}s")

    # Calculate and display timing information
    # total_time = time.time() - script_start
    # processing_time = time.time() - processing_start
    # end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # print("\nTiming Summary:")
    # print(f"Start time: {start_time}")
    # print(f"End time: {end_time}")
    # print(f"Total runtime: {format_time(total_time)}")
    # print(f"Setup time: {format_time(processing_start - script_start)}")
    # print(f"Processing time: {format_time(processing_time)}")
    # print(f"Results written to: {args.output_questions}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Use a prompt to generate questions that could be answered by top-k retrieved evidence. Output generated questions.")
    parser.add_argument("--model", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")
    parser.add_argument("--reference_corpus", default="baseline/train.json")
    parser.add_argument(
        "-i",
        "--top_k_target_knowledge",
        default="data_store/dev_reranking_top_k.json",
        help="Directory where the sentences for the scraped data is saved.",
    )
    parser.add_argument(
        "-o",
        "--output_questions",
        default="data_store/dev_top_k_qa.json",
        help="Directory where the sentences for the scraped data is saved.",
    )
    parser.add_argument(
        "--top_k",
        default=10,
        type=int
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=4,
        help="Number of examples to process in each batch"
    )
    parser.add_argument(
        "-e",
        "--end",
        type=int,
        default=-1
    )

    args = parser.parse_args()
    main(args)
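claim2prompts above flattens each reference claim's question/answer pairs into few-shot prompt snippets for BM25 lookup; a small illustration with a made-up record in the expected shape (all field values are assumptions, and the import path assumes the module and its dependencies are installed).

# Toy record in the shape claim2prompts expects (values are made up for illustration).
from system.baseline.question_generation_optimized import claim2prompts

example = {
    "claim": "Unemployment fell to 4% in 2023",
    "questions": [
        {
            "question": "What was the unemployment rate in 2023",
            "answers": [{"answer": "It was 4.0% in late 2023.", "answer_type": "Extractive"}],
        }
    ],
}
for lookup_str, prompt in claim2prompts(example):
    print(lookup_str)   # the answer text used as the BM25 lookup key
    print(prompt)       # the formatted few-shot example fed into the question-generation prompt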
system/baseline/reranking_optimized.py
ADDED
@@ -0,0 +1,230 @@
import os
import torch
import gc
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import argparse
import time
from datetime import datetime, timedelta
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def encode_text(model, tokenizer, texts, batch_size=8, max_length=512):
    """Encode texts to embeddings using AutoModel"""
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        # Tokenize
        encoded_input = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(model.device)

        # Compute token embeddings
        with torch.no_grad():
            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
                model_output = model(**encoded_input)
                # Use mean pooling
                attention_mask = encoded_input['attention_mask']
                token_embeddings = model_output[0]  # First element contains token embeddings
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
                embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
                all_embeddings.append(embeddings.cpu().numpy())

        # Clear some memory
        if i % (batch_size * 4) == 0:
            torch.cuda.empty_cache()
            gc.collect()

    return np.vstack(all_embeddings)

def compute_similarity(emb1, emb2):
    """Compute cosine similarity between embeddings"""
    return np.dot(emb1, emb2.T) / (
        np.linalg.norm(emb1, axis=1).reshape(-1, 1) *
        np.linalg.norm(emb2, axis=1).reshape(1, -1)
    )

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

def preprocess_sentences(sentence1, sentence2):
    vectorizer = TfidfVectorizer().fit_transform([sentence1, sentence2])
    vectors = vectorizer.toarray()

    cosine_sim = cosine_similarity(vectors)
    similarity_score = cosine_sim[0][1]
    return similarity_score

def remove_trailing_special_chars(text):
    return re.sub(r'[\W_]+$', '', text)

def remove_special_chars_except_spaces(text):
    return re.sub(r'[^\w\s]+', '', text)

def select_top_k(claim, results, top_k):
    '''
    remove sentence of similarity claim
    '''
    dup_check = set()
    top_k_sentences_urls = []

    i = 0
    # print(results)
    claim = remove_special_chars_except_spaces(claim).lower()
    while len(top_k_sentences_urls) < top_k and i < len(results):
        # print(i)
        sentence = remove_special_chars_except_spaces(results[i]['sentence']).lower()

        if sentence not in dup_check:
            if preprocess_sentences(claim, sentence) > 0.97:
                dup_check.add(sentence)
                continue

            if claim in sentence:
                if len(claim) / len(sentence) > 0.92:
                    dup_check.add(sentence)
                    continue

            top_k_sentences_urls.append({
                'sentence': results[i]['sentence'],
                'url': results[i]['url']}
            )
        i += 1

    return top_k_sentences_urls

# def format_time(seconds):
#     """Format time duration nicely."""
#     return str(timedelta(seconds=round(seconds)))


def compute_embeddings_batched(model, texts, batch_size=8):
    """Compute embeddings in smaller batches to manage memory"""
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        with torch.cuda.amp.autocast(dtype=torch.bfloat16):  # Use bfloat16
            emb = model.encode(batch, batch_size=len(batch), show_progress_bar=False)
            all_embeddings.append(emb)

        # Clear some memory
        if i % (batch_size * 4) == 0:
            torch.cuda.empty_cache()
            gc.collect()

    return np.vstack(all_embeddings)

def main(args):

    device = "cuda" if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Load model and tokenizer
    model = AutoModel.from_pretrained(
        "Salesforce/SFR-Embedding-2_R",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/SFR-Embedding-2_R")

    # Load target examples
    target_examples = []
    with open(args.target_data, "r", encoding="utf-8") as json_file:
        for i, line in enumerate(json_file):
            try:
                example = json.loads(r"{}".format(line))
                target_examples.append(example)
            except:
                print(f"CURRENT LINE broken {i}")

    if args.end == -1:
        args.end = len(target_examples)

    files_to_process = list(range(args.start, args.end))
    total = len(files_to_process)

    task = 'Given a web search query, retrieve relevant passages that answer the query'

    with open(args.json_output, "w", encoding="utf-8") as output_json:
        done = 0
        for idx, example in enumerate(target_examples):
            if idx in files_to_process:
                print(f"Processing claim {example['claim_id']}... Progress: {done + 1} / {total}")

                claim = example['claim']
                query = [get_detailed_instruct(task, claim)] + [
                    get_detailed_instruct(task, le)
                    for le in example['hypo_fc_docs']
                    if len(le.strip()) > 0
                ]
                query_length = len(query)
                sentences = [sent['sentence'] for sent in example[f'top_{5000}']][:args.retrieved_top_k]

                # st = time.time()
                try:
                    # Process query embeddings
                    query_embeddings = encode_text(model, tokenizer, query, batch_size=4)
                    avg_emb_q = np.mean(query_embeddings, axis=0)
                    hyde_vector = avg_emb_q.reshape((1, -1))

                    # Process sentence embeddings in smaller chunks
                    sentence_embeddings = encode_text(
                        model,
                        tokenizer,
                        sentences,
                        batch_size=args.batch_size
                    )

                    # Compute similarities in chunks to save memory
                    chunk_size = 1000
                    all_scores = []
                    for i in range(0, len(sentence_embeddings), chunk_size):
                        chunk = sentence_embeddings[i:i + chunk_size]
                        chunk_scores = compute_similarity(hyde_vector, chunk)[0]
                        all_scores.extend(chunk_scores)

                    scores = np.array(all_scores)
                    top_k_idx = np.argsort(scores)[::-1]
                    results = [example['top_5000'][i] for i in top_k_idx]
                    top_k_sentences_urls = select_top_k(claim, results, args.top_k)

                    # print(f"Top {args.top_k} retrieved. Time elapsed: {time.time() - st:.2f}s")

                    json_data = {
                        "claim_id": example['claim_id'],
                        "claim": claim,
                        f"top_{args.top_k}": top_k_sentences_urls
                    }
                    output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                    output_json.flush()

                except RuntimeError as e:
                    print(f"Error processing claim {example['claim_id']}: {e}")
                    continue

                done += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--target_data", default="data_store/dev_retrieval_top_k.json")
    parser.add_argument("--retrieved_top_k", type=int, default=5000)
    parser.add_argument("--top_k", type=int, default=10)
    parser.add_argument("-o", "--json_output", type=str, default="data_store/dev_reranking_top_k.json")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("-s", "--start", type=int, default=0)
    parser.add_argument("-e", "--end", type=int, default=-1)
    args = parser.parse_args()

    main(args)
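compute_similarity above is plain cosine similarity over numpy row vectors, so it can be sanity-checked without loading the embedding model; a tiny self-contained sketch with toy vectors (the function body is restated here only so the snippet runs on its own).

# Self-contained check of cosine similarity on toy vectors (no model needed).
import numpy as np

def compute_similarity(emb1, emb2):
    # Same definition as in reranking_optimized.py: cosine similarity between row vectors.
    return np.dot(emb1, emb2.T) / (
        np.linalg.norm(emb1, axis=1).reshape(-1, 1) *
        np.linalg.norm(emb2, axis=1).reshape(1, -1)
    )

query = np.array([[1.0, 0.0]])
sentences = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
print(compute_similarity(query, sentences))  # approx [[1.0, 0.0, 0.707]]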
system/baseline/retrieval_optimized.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import nltk
|
| 8 |
+
from rank_bm25 import BM25Okapi
|
| 9 |
+
from multiprocessing import Pool, cpu_count, Manager, Lock
|
| 10 |
+
from functools import partial
|
| 11 |
+
import heapq
|
| 12 |
+
from threading import Thread, Event
|
| 13 |
+
import queue
|
| 14 |
+
from datetime import datetime, timedelta
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def download_nltk_data(package_name, download_dir='nltk_data'):
|
| 18 |
+
# Ensure the download directory exists
|
| 19 |
+
os.makedirs(download_dir, exist_ok=True)
|
| 20 |
+
|
| 21 |
+
# Set NLTK data path
|
| 22 |
+
nltk.data.path.append(download_dir)
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
# Try to find the resource
|
| 26 |
+
nltk.data.find(f'tokenizers/{package_name}')
|
| 27 |
+
print(f"Package '{package_name}' is already downloaded")
|
| 28 |
+
except LookupError:
|
| 29 |
+
# If resource isn't found, download it
|
| 30 |
+
print(f"Downloading {package_name}...")
|
| 31 |
+
nltk.download(package_name, download_dir=download_dir)
|
| 32 |
+
print(f"Successfully downloaded {package_name}")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def combine_all_sentences(knowledge_file):
|
| 36 |
+
sentences, urls = [], []
|
| 37 |
+
|
| 38 |
+
with open(knowledge_file, "r", encoding="utf-8") as json_file:
|
| 39 |
+
for i, line in enumerate(json_file):
|
| 40 |
+
data = json.loads(line)
|
| 41 |
+
sentences.extend(data["url2text"])
|
| 42 |
+
urls.extend([data["url"] for _ in range(len(data["url2text"]))])
|
| 43 |
+
return sentences, urls, i + 1
|
| 44 |
+
|
| 45 |
+
def remove_duplicates(sentences, urls):
|
| 46 |
+
df = pd.DataFrame({"document_in_sentences":sentences, "sentence_urls":urls})
|
| 47 |
+
df['sentences'] = df['document_in_sentences'].str.strip().str.lower()
|
| 48 |
+
df = df.drop_duplicates(subset="sentences").reset_index()
|
| 49 |
+
return df['document_in_sentences'].tolist(), df['sentence_urls'].tolist()
|
| 50 |
+
|
| 51 |
+
def retrieve_top_k_sentences(query, document, urls, top_k):
|
| 52 |
+
tokenized_docs = [nltk.word_tokenize(doc) for doc in document[:top_k]]
|
| 53 |
+
bm25 = BM25Okapi(tokenized_docs)
|
| 54 |
+
|
| 55 |
+
scores = bm25.get_scores(nltk.word_tokenize(query))
|
| 56 |
+
top_k_idx = np.argsort(scores)[::-1][:top_k]
|
| 57 |
+
|
| 58 |
+
return [document[i] for i in top_k_idx], [urls[i] for i in top_k_idx]
|
| 59 |
+
|
| 60 |
+
def process_single_example(idx, example, args, result_queue, counter, lock):
|
| 61 |
+
try:
|
| 62 |
+
with lock:
|
| 63 |
+
current_count = counter.value + 1
|
| 64 |
+
counter.value = current_count
|
| 65 |
+
print(f"\nProcessing claim {idx}... Progress: {current_count} / {args.total_examples}")
|
| 66 |
+
|
| 67 |
+
# start_time = time.time()
|
| 68 |
+
|
| 69 |
+
document_in_sentences, sentence_urls, num_urls_this_claim = combine_all_sentences(
|
| 70 |
+
os.path.join(args.knowledge_store_dir, f"{idx}.jsonl")
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
print(f"Obtained {len(document_in_sentences)} sentences from {num_urls_this_claim} urls.")
|
| 74 |
+
|
| 75 |
+
document_in_sentences, sentence_urls = remove_duplicates(document_in_sentences, sentence_urls)
|
| 76 |
+
|
| 77 |
+
query = example["claim"] + " " + " ".join(example['hypo_fc_docs'])
|
| 78 |
+
top_k_sentences, top_k_urls = retrieve_top_k_sentences(
|
| 79 |
+
query, document_in_sentences, sentence_urls, args.top_k
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
result = {
|
| 84 |
+
"claim_id": idx,
|
| 85 |
+
"claim": example["claim"],
|
| 86 |
+
f"top_{args.top_k}": [
|
| 87 |
+
{"sentence": sent, "url": url}
|
| 88 |
+
for sent, url in zip(top_k_sentences, top_k_urls)
|
| 89 |
+
],
|
| 90 |
+
"hypo_fc_docs": example['hypo_fc_docs']
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
result_queue.put((idx, result))
|
| 94 |
+
return True
|
| 95 |
+
except Exception as e:
|
| 96 |
+
print(f"Error processing example {idx}: {str(e)}")
|
| 97 |
+
result_queue.put((idx, None))
|
| 98 |
+
return False
|
| 99 |
+
|
| 100 |
+
def writer_thread(output_file, result_queue, total_examples, stop_event):
|
| 101 |
+
next_index = 0
|
| 102 |
+
pending_results = []
|
| 103 |
+
|
| 104 |
+
with open(output_file, "w", encoding="utf-8") as f:
|
| 105 |
+
while not (stop_event.is_set() and result_queue.empty()):
|
| 106 |
+
try:
|
| 107 |
+
idx, result = result_queue.get(timeout=1)
|
| 108 |
+
|
| 109 |
+
if result is not None:
|
| 110 |
+
heapq.heappush(pending_results, (idx, result))
|
| 111 |
+
|
| 112 |
+
while pending_results and pending_results[0][0] == next_index:
|
| 113 |
+
_, result_to_write = heapq.heappop(pending_results)
|
| 114 |
+
f.write(json.dumps(result_to_write, ensure_ascii=False) + "\n")
|
| 115 |
+
f.flush()
|
| 116 |
+
next_index += 1
|
| 117 |
+
|
| 118 |
+
except queue.Empty:
|
| 119 |
+
continue
|
| 120 |
+
|
| 121 |
+
# def format_time(seconds):
|
| 122 |
+
# """Format time duration nicely."""
|
| 123 |
+
# return str(timedelta(seconds=round(seconds)))
|
| 124 |
+
|
| 125 |
+
def main(args):
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
download_nltk_data('punkt')
|
| 130 |
+
download_nltk_data('punkt_tab')
|
| 131 |
+
|
| 132 |
+
with open(args.target_data, "r", encoding="utf-8") as json_file:
|
| 133 |
+
target_examples = json.load(json_file)
|
| 134 |
+
|
| 135 |
+
if args.end == -1:
|
| 136 |
+
args.end = len(target_examples)
|
| 137 |
+
|
| 138 |
+
print(f"Total examples to process: {args.end - args.start}")
|
| 139 |
+
|
| 140 |
+
files_to_process = list(range(args.start, args.end))
|
| 141 |
+
examples_to_process = [(idx, target_examples[idx]) for idx in files_to_process]
|
| 142 |
+
|
| 143 |
+
num_workers = min(args.workers if args.workers > 0 else cpu_count(), len(files_to_process))
|
| 144 |
+
print(f"Using {num_workers} workers to process {len(files_to_process)} examples")
|
| 145 |
+
|
| 146 |
+
with Manager() as manager:
|
| 147 |
+
counter = manager.Value('i', 0)
|
| 148 |
+
lock = manager.Lock()
|
| 149 |
+
args.total_examples = len(files_to_process)
|
| 150 |
+
|
| 151 |
+
result_queue = manager.Queue()
|
| 152 |
+
|
| 153 |
+
stop_event = Event()
|
| 154 |
+
writer = Thread(
|
| 155 |
+
target=writer_thread,
|
| 156 |
+
args=(args.json_output, result_queue, len(files_to_process), stop_event)
|
| 157 |
+
)
|
| 158 |
+
writer.start()
|
| 159 |
+
|
| 160 |
+
process_func = partial(
|
| 161 |
+
process_single_example,
|
| 162 |
+
args=args,
|
| 163 |
+
result_queue=result_queue,
|
| 164 |
+
counter=counter,
|
| 165 |
+
lock=lock
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
with Pool(num_workers) as pool:
|
| 169 |
+
results = pool.starmap(process_func, examples_to_process)
|
| 170 |
+
|
| 171 |
+
stop_event.set()
|
| 172 |
+
writer.join()
|
| 173 |
+
|
| 174 |
+
# successful = sum(1 for r in results if r)
|
| 175 |
+
# print(f"\nSuccessfully processed {successful} out of {len(files_to_process)} examples")
|
| 176 |
+
# print(f"Results written to {args.json_output}")
|
| 177 |
+
|
| 178 |
+
# # Calculate and display timing information
|
| 179 |
+
# total_time = time.time() - script_start
|
| 180 |
+
# avg_time = total_time / len(files_to_process)
|
| 181 |
+
# end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
| 182 |
+
|
| 183 |
+
# print("\nTiming Summary:")
|
| 184 |
+
# print(f"Start time: {start_time}")
|
| 185 |
+
# print(f"End time: {end_time}")
|
| 186 |
+
# print(f"Total runtime: {format_time(total_time)} (HH:MM:SS)")
|
| 187 |
+
# print(f"Average time per example: {avg_time:.2f} seconds")
|
| 188 |
+
# if successful > 0:
|
| 189 |
+
# print(f"Processing speed: {successful / total_time:.2f} examples per second")
|
| 190 |
+
|
| 191 |
+
if __name__ == "__main__":
|
| 192 |
+
parser = argparse.ArgumentParser(
|
| 193 |
+
description="Get top 10000 sentences with BM25 in the knowledge store using parallel processing."
|
| 194 |
+
)
|
| 195 |
+
parser.add_argument(
|
| 196 |
+
"-k",
|
| 197 |
+
"--knowledge_store_dir",
|
| 198 |
+
type=str,
|
| 199 |
+
default="data_store/knowledge_store",
|
| 200 |
+
help="The path of the knowledge_store_dir containing json files with all the retrieved sentences.",
|
| 201 |
+
)
|
| 202 |
+
parser.add_argument(
|
| 203 |
+
"--target_data",
|
| 204 |
+
type=str,
|
| 205 |
+
default="data_store/hyde_fc.json",
|
| 206 |
+
help="The path of the file that stores the claim.",
|
| 207 |
+
)
|
| 208 |
+
parser.add_argument(
|
| 209 |
+
"-o",
|
| 210 |
+
"--json_output",
|
| 211 |
+
type=str,
|
| 212 |
+
default="data_store/dev_retrieval_top_k.json",
|
| 213 |
+
help="The output dir for JSON files to save the top 100 sentences for each claim.",
|
| 214 |
+
)
|
| 215 |
+
parser.add_argument(
|
| 216 |
+
"--top_k",
|
| 217 |
+
default=5000,
|
| 218 |
+
type=int,
|
| 219 |
+
help="How many documents should we pick out with BM25.",
|
| 220 |
+
)
|
| 221 |
+
parser.add_argument(
|
| 222 |
+
"-s",
|
| 223 |
+
"--start",
|
| 224 |
+
type=int,
|
| 225 |
+
default=0,
|
| 226 |
+
help="Starting index of the files to process.",
|
| 227 |
+
)
|
| 228 |
+
parser.add_argument(
|
| 229 |
+
"-e",
|
| 230 |
+
"--end",
|
| 231 |
+
type=int,
|
| 232 |
+
default=-1,
|
| 233 |
+
help="End index of the files to process.",
|
| 234 |
+
)
|
| 235 |
+
parser.add_argument(
|
| 236 |
+
"-w",
|
| 237 |
+
"--workers",
|
| 238 |
+
type=int,
|
| 239 |
+
default=0,
|
| 240 |
+
help="Number of worker processes (default: number of CPU cores)",
|
| 241 |
+
)
|
| 242 |
+
|
| 243 |
+
args = parser.parse_args()
|
| 244 |
+
main(args)
|
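For readers skimming the diff, the core of the retrieval step above is a single BM25 pass over the deduplicated sentences. Below is a minimal, self-contained sketch of that same rank_bm25 usage (not part of the commit; the sentences and query are invented toy data standing in for the scraped knowledge store and the HyDE-augmented query):

import nltk
import numpy as np
from rank_bm25 import BM25Okapi

nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# Toy stand-ins for the knowledge-store sentences and the query.
sentences = [
    "The government pledged free breakfast clubs in every primary school.",
    "An unrelated sentence about the weather.",
    "Breakfast clubs will be introduced in primary schools.",
]
query = "free breakfast clubs in primary schools"

bm25 = BM25Okapi([nltk.word_tokenize(s) for s in sentences])
scores = bm25.get_scores(nltk.word_tokenize(query))
ranked = [sentences[i] for i in np.argsort(scores)[::-1]]
print(ranked)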
system/baseline/train.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
system/date_verifier.py
ADDED
|
@@ -0,0 +1,58 @@
| 1 |
+
from datetime import datetime
|
| 2 |
+
import re, trafilatura
|
| 3 |
+
from trafilatura.settings import DEFAULT_CONFIG
|
| 4 |
+
|
| 5 |
+
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
|
| 6 |
+
|
| 7 |
+
_URL_DATE_PATS = [
|
| 8 |
+
re.compile(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})"), # 2025-07-03
|
| 9 |
+
re.compile(r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})"), # 2025/07/03
|
| 10 |
+
re.compile(r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})"), # 20250703
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
def _meta_date(url: str):
|
| 14 |
+
|
| 15 |
+
page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
|
| 16 |
+
if not page:
|
| 17 |
+
return None
|
| 18 |
+
meta = trafilatura.extract_metadata(page)
|
| 19 |
+
if not meta or not meta.date:
|
| 20 |
+
return None
|
| 21 |
+
try:
|
| 22 |
+
return datetime.fromisoformat(meta.date)
|
| 23 |
+
except ValueError:
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
return datetime.fromisoformat(meta.date.split("T")[0])
|
| 27 |
+
except Exception:
|
| 28 |
+
return None
|
| 29 |
+
|
| 30 |
+
def _regex_date(url: str):
|
| 31 |
+
|
| 32 |
+
for pat in _URL_DATE_PATS:
|
| 33 |
+
m = pat.search(url)
|
| 34 |
+
if m:
|
| 35 |
+
try:
|
| 36 |
+
return datetime(
|
| 37 |
+
int(m.group("y")), int(m.group("m")), int(m.group("d"))
|
| 38 |
+
)
|
| 39 |
+
except ValueError:
|
| 40 |
+
pass
|
| 41 |
+
return None
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def is_after_start(url: str, start_ymd: str) -> bool:
|
| 45 |
+
"""
|
| 46 |
+
- start_ymd: 'YYYYMMDD'
|
| 47 |
+
"""
|
| 48 |
+
t0 = datetime.strptime(start_ymd, "%Y%m%d")
|
| 49 |
+
|
| 50 |
+
pub_dt = _meta_date(url)
|
| 51 |
+
|
| 52 |
+
if pub_dt is None:
|
| 53 |
+
pub_dt = _regex_date(url)
|
| 54 |
+
|
| 55 |
+
if pub_dt is None:
|
| 56 |
+
return True
|
| 57 |
+
|
| 58 |
+
return pub_dt >= t0
|
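A quick usage sketch for the module above (not part of the commit; the URL is a hypothetical example of the YYYY/MM/DD pattern the regex fallback recognises). is_after_start returns True when the page's publication date is on or after the start date, and defaults to keeping the URL when no date can be recovered:

from system.date_verifier import is_after_start

url = "https://example.com/news/2025/07/03/spending-review"  # hypothetical URL
print(is_after_start(url, "20250504"))  # True: 2025-07-03 falls after 2025-05-04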
system/ee.py
ADDED
|
@@ -0,0 +1,104 @@
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import argparse
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
import tiktoken
|
| 6 |
+
from openai import OpenAI
|
| 7 |
+
from huggingface_hub import hf_hub_download
|
| 8 |
+
|
| 9 |
+
def gpt_4o(input_text):
|
| 10 |
+
client=OpenAI(api_key=os.environ.get("OAI"))
|
| 11 |
+
response = client.chat.completions.create(
|
| 12 |
+
model="gpt-4o",
|
| 13 |
+
messages=[
|
| 14 |
+
{"role": "user", "content": [{"type": "text", "text": input_text}]}
|
| 15 |
+
],
|
| 16 |
+
response_format={"type": "json_object"},
|
| 17 |
+
temperature=0,
|
| 18 |
+
max_tokens=4096,
|
| 19 |
+
top_p=0,
|
| 20 |
+
frequency_penalty=0,
|
| 21 |
+
presence_penalty=0
|
| 22 |
+
)
|
| 23 |
+
return response.choices[0].message.content
|
| 24 |
+
|
| 25 |
+
def run_gpt4_event_extraction(data_dir, max_tokens=100000):
|
| 26 |
+
|
| 27 |
+
all_info_path = os.path.join(data_dir, "all_info_with_txt.json")
|
| 28 |
+
output_dir = os.path.join(data_dir, "gpt4_event_extraction")
|
| 29 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 30 |
+
icl_path = hf_hub_download(
|
| 31 |
+
repo_id="PledgeTracker/demo_feedback",
|
| 32 |
+
filename="icl.txt",
|
| 33 |
+
repo_type="dataset",
|
| 34 |
+
token=os.environ["HF_TOKEN"]
|
| 35 |
+
)
|
| 36 |
+
ICL = open(icl_path, "r").read()
|
| 37 |
+
all_info = open(all_info_path, "r").readlines()
|
| 38 |
+
|
| 39 |
+
enc = tiktoken.encoding_for_model("gpt-4o")
|
| 40 |
+
|
| 41 |
+
for i, line in enumerate(all_info):
|
| 42 |
+
ID = i
|
| 43 |
+
urls = []
|
| 44 |
+
results = []
|
| 45 |
+
|
| 46 |
+
data = json.loads(line)
|
| 47 |
+
docs = data["evidence"]
|
| 48 |
+
claim = data["claim"]
|
| 49 |
+
|
| 50 |
+
output_path = os.path.join(output_dir, f"gpt4o_results_{ID}_claim.json")
|
| 51 |
+
if os.path.exists(output_path):
|
| 52 |
+
print(f"Already exist: {output_path}")
|
| 53 |
+
|
| 54 |
+
else:
|
| 55 |
+
|
| 56 |
+
for doc in tqdm(docs):
|
| 57 |
+
if doc["url"] in urls:
|
| 58 |
+
continue
|
| 59 |
+
|
| 60 |
+
text = " ".join(doc["text"])
|
| 61 |
+
input_text = (
|
| 62 |
+
f"{ICL}\nNow please only summarize events that are useful for verifying the pledge '{claim}', and their dates in the JSON format.\n\nInput:\n\nTitle: {doc['metadata']['title']}\n"
|
| 63 |
+
f"Date: {doc['metadata']['date']}\nArticle: {text}\nPledge: {claim}\n\n"
|
| 64 |
+
f"Output:\n"
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
urls.append(doc["url"])
|
| 68 |
+
text_tokens = enc.encode(input_text)
|
| 69 |
+
if len(text_tokens) > max_tokens:
|
| 70 |
+
input_text = enc.decode(text_tokens[:max_tokens])
|
| 71 |
+
|
| 72 |
+
try:
|
| 73 |
+
output = gpt_4o(input_text)
|
| 74 |
+
# print(f"GPT-4o Response: {output}")
|
| 75 |
+
results.append({
|
| 76 |
+
"url": doc["url"],
|
| 77 |
+
"title": doc["metadata"]["title"],
|
| 78 |
+
"date": doc["metadata"]["date"],
|
| 79 |
+
"article": text,
|
| 80 |
+
"output": json.loads(output)
|
| 81 |
+
})
|
| 82 |
+
except Exception as e:
|
| 83 |
+
print(f"Error processing doc: {e}")
|
| 84 |
+
continue
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 88 |
+
json.dump(results, f, ensure_ascii=False, indent=4)
|
| 89 |
+
|
| 90 |
+
return output_path
|
| 91 |
+
|
| 92 |
+
if __name__ == "__main__":
|
| 93 |
+
parser = argparse.ArgumentParser(description="Run GPT-4o event extraction")
|
| 94 |
+
parser.add_argument("--data_dir", type=str, required=True, help="Root data directory")
|
| 95 |
+
parser.add_argument("--icl_path", type=str, required=True, help="Path to ICL prompt file")
|
| 96 |
+
parser.add_argument("--max_tokens", type=int, default=100000, help="Maximum token limit for input")
|
| 97 |
+
|
| 98 |
+
args = parser.parse_args()
|
| 99 |
+
|
| 100 |
+
run_gpt4_event_extraction(
|
| 101 |
+
data_dir=args.data_dir,
|
| 102 |
+
# the ICL prompt is fetched from the Hugging Face Hub inside run_gpt4_event_extraction
|
| 103 |
+
max_tokens=args.max_tokens
|
| 104 |
+
)
|
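For orientation, a sketch of the shape of each gpt4o_results_<id>_claim.json entry, inferred from how the results are assembled above and read back in pledge_tracking.py and process_time.py; all field values here are invented placeholders:

example_entry = {
    "url": "https://example.com/article",        # source document URL (placeholder)
    "title": "Example article title",
    "date": "2025-07-03",                        # publication date from metadata
    "article": "Full article text ...",
    "output": {                                  # parsed GPT-4o JSON
        "events": [
            {"event": "A short event summary.", "date": "2025-07-01"},
        ]
    },
}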
system/generate_output.py
ADDED
|
@@ -0,0 +1,75 @@
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import argparse
|
| 4 |
+
from system.html2lines import html2metadata
|
| 5 |
+
from lxml.etree import tostring
|
| 6 |
+
import lxml.etree
|
| 7 |
+
|
| 8 |
+
def process_manifesto_data_with_metadata(input_base_dir: str):
|
| 9 |
+
|
| 10 |
+
input_file_path = os.path.join(input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json")
|
| 11 |
+
output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")
|
| 12 |
+
|
| 13 |
+
url2text_dir = os.path.join(input_base_dir, "augmented_data_store")
|
| 14 |
+
|
| 15 |
+
with open(input_file_path, "r", encoding="utf-8") as f:
|
| 16 |
+
input_file = f.readlines()
|
| 17 |
+
|
| 18 |
+
out_file = open(output_file_path, "w", encoding="utf-8")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
i = 0
|
| 22 |
+
|
| 23 |
+
for id, line in enumerate(input_file):
|
| 24 |
+
line = json.loads(line)
|
| 25 |
+
claim = line["claim"]
|
| 26 |
+
QAs = line["top_50"]
|
| 27 |
+
new_line = {"claim": claim, "evidence": []}
|
| 28 |
+
|
| 29 |
+
json_path = os.path.join(url2text_dir, f"{id}.jsonl")
|
| 30 |
+
if not os.path.exists(json_path):
|
| 31 |
+
print(f"Warning: {json_path} not found")
|
| 32 |
+
continue
|
| 33 |
+
|
| 34 |
+
with open(json_path, "r", encoding="utf-8") as f:
|
| 35 |
+
try:
|
| 36 |
+
data_store = json.load(f)
|
| 37 |
+
except json.JSONDecodeError:
|
| 38 |
+
f.seek(0)
|
| 39 |
+
data_store = [json.loads(line) for line in f]
|
| 40 |
+
|
| 41 |
+
url_txt = {data["url"]: data["url2text"] for data in data_store}
|
| 42 |
+
|
| 43 |
+
URLs = []
|
| 44 |
+
for j, QA in enumerate(QAs):
|
| 45 |
+
newQA = QA.copy()
|
| 46 |
+
URL = QA["url"]
|
| 47 |
+
newQA["text"] = url_txt.get(URL, "")
|
| 48 |
+
|
| 49 |
+
if URL not in URLs:
|
| 50 |
+
try:
|
| 51 |
+
meta = html2metadata(URL)
|
| 52 |
+
if isinstance(meta, lxml.etree._Element):
|
| 53 |
+
meta = tostring(meta, encoding="unicode", pretty_print=True)
|
| 54 |
+
meta_save = {
|
| 55 |
+
"title": meta["title"],
|
| 56 |
+
"date": meta["date"]
|
| 57 |
+
}
|
| 58 |
+
except Exception as e:
|
| 59 |
+
print(f"Metadata extraction failed for URL: {URL}, error: {e}")
|
| 60 |
+
meta_save = {
|
| 61 |
+
"title": "",
|
| 62 |
+
"date": ""
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
newQA["metadata"] = meta_save
|
| 67 |
+
new_line["evidence"].append(newQA)
|
| 68 |
+
|
| 69 |
+
out_file.write(json.dumps(new_line) + "\n")
|
| 70 |
+
|
| 71 |
+
out_file.close()
|
| 72 |
+
return output_file_path
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
system/hero_pipeline.py
ADDED
|
@@ -0,0 +1,150 @@
| 1 |
+
import os
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
import subprocess
|
| 4 |
+
from huggingface_hub import hf_hub_download
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
def run_hero_reranking(pipeline_base_dir, suggestion_meta):
|
| 8 |
+
base_dir = f"{pipeline_base_dir}"
|
| 9 |
+
hero_dir = os.path.join(base_dir, "hero")
|
| 10 |
+
os.makedirs(hero_dir, exist_ok=True)
|
| 11 |
+
|
| 12 |
+
if suggestion_meta:
|
| 13 |
+
hyde_path = hf_hub_download(
|
| 14 |
+
repo_id="PledgeTracker/demo_feedback",
|
| 15 |
+
filename="manifesto_icl_hyde_fc.json",
|
| 16 |
+
repo_type="dataset",
|
| 17 |
+
token=os.environ["HF_TOKEN"]
|
| 18 |
+
)
|
| 19 |
+
with open(hyde_path, "r", encoding="utf-8") as f:
|
| 20 |
+
all_hyde_data = json.load(f)
|
| 21 |
+
|
| 22 |
+
idx = suggestion_meta["index"]
|
| 23 |
+
single_hyde = [all_hyde_data[idx]]
|
| 24 |
+
save_path = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
|
| 25 |
+
with open(save_path, "w", encoding="utf-8") as f:
|
| 26 |
+
json.dump(single_hyde, f, indent=2)
|
| 27 |
+
|
| 28 |
+
hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
|
| 29 |
+
|
| 30 |
+
def safe_run(cmd, timeout=600):
|
| 31 |
+
try:
|
| 32 |
+
print(f"π Running: {' '.join(str(x) for x in cmd)}")
|
| 33 |
+
subprocess.run(cmd, check=True, timeout=timeout)
|
| 34 |
+
except subprocess.CalledProcessError as e:
|
| 35 |
+
print(f"[β ERROR] Subprocess failed: {e}")
|
| 36 |
+
if e.stderr:
|
| 37 |
+
print("[stderr]:", e.stderr.decode())
|
| 38 |
+
raise
|
| 39 |
+
except subprocess.TimeoutExpired:
|
| 40 |
+
print(f"[β TIMEOUT] Command timed out: {' '.join(cmd)}")
|
| 41 |
+
raise
|
| 42 |
+
|
| 43 |
+
# Step 3.2: retrieval
|
| 44 |
+
print("π Step 3.2: Retrieval from knowledge store ...")
|
| 45 |
+
knowledge_store_dir = os.path.join(base_dir, "augmented_data_store")
|
| 46 |
+
retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k_QA.json")
|
| 47 |
+
|
| 48 |
+
if not os.path.exists(retrieval_output):
|
| 49 |
+
safe_run([
|
| 50 |
+
"python", "system/baseline/retrieval_optimized.py",
|
| 51 |
+
"--knowledge_store_dir", knowledge_store_dir,
|
| 52 |
+
"--target_data", hyde_output,
|
| 53 |
+
"--json_output", retrieval_output,
|
| 54 |
+
])
|
| 55 |
+
|
| 56 |
+
# Step 3.3: reranking
|
| 57 |
+
print("π·οΈ Step 3.3: Reranking retrieved evidence ...")
|
| 58 |
+
rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k_QA.json")
|
| 59 |
+
|
| 60 |
+
if not os.path.exists(rerank_output):
|
| 61 |
+
safe_run([
|
| 62 |
+
"python", "system/baseline/reranking_optimized.py",
|
| 63 |
+
"--target_data", retrieval_output,
|
| 64 |
+
"--json_output", rerank_output,
|
| 65 |
+
"--top_k", str(50),
|
| 66 |
+
])
|
| 67 |
+
|
| 68 |
+
return {
|
| 69 |
+
"hyde": hyde_output,
|
| 70 |
+
"retrieved": retrieval_output,
|
| 71 |
+
"reranked": rerank_output,
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def run_hero_pipeline(pipeline_base_dir):
|
| 76 |
+
base_dir = f"{pipeline_base_dir}"
|
| 77 |
+
hero_dir = os.path.join(base_dir, "hero")
|
| 78 |
+
os.makedirs(hero_dir, exist_ok=True)
|
| 79 |
+
|
| 80 |
+
target_data = os.path.join(base_dir, "claim.json")
|
| 81 |
+
hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
|
| 82 |
+
|
| 83 |
+
def safe_run(cmd, timeout=600):
|
| 84 |
+
try:
|
| 85 |
+
print(f"π Running: {' '.join(cmd)}")
|
| 86 |
+
subprocess.run(cmd, check=True, timeout=timeout)
|
| 87 |
+
except subprocess.CalledProcessError as e:
|
| 88 |
+
print(f"[β ERROR] Subprocess failed: {e}")
|
| 89 |
+
if e.stderr:
|
| 90 |
+
print("[stderr]:", e.stderr.decode())
|
| 91 |
+
raise
|
| 92 |
+
except subprocess.TimeoutExpired:
|
| 93 |
+
print(f"[β TIMEOUT] Command timed out: {' '.join(cmd)}")
|
| 94 |
+
raise
|
| 95 |
+
|
| 96 |
+
# Step 3.1: hyde_fc_generation
|
| 97 |
+
if not os.path.exists(hyde_output):
|
| 98 |
+
print("π§ Step 3.1: HyDE ICL generation ...")
|
| 99 |
+
safe_run([
|
| 100 |
+
"python", "system/baseline/hyde_fc_generation_optimized.py",
|
| 101 |
+
"--target_data", target_data,
|
| 102 |
+
"--json_output", hyde_output
|
| 103 |
+
])
|
| 104 |
+
|
| 105 |
+
# Step 3.2: retrieval
|
| 106 |
+
print("π Step 3.2: Retrieval from knowledge store ...")
|
| 107 |
+
knowledge_store_dir = os.path.join(base_dir, "initial_data_store")
|
| 108 |
+
retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k.json")
|
| 109 |
+
|
| 110 |
+
if not os.path.exists(retrieval_output):
|
| 111 |
+
safe_run([
|
| 112 |
+
"python", "system/baseline/retrieval_optimized.py",
|
| 113 |
+
"--knowledge_store_dir", knowledge_store_dir,
|
| 114 |
+
"--target_data", hyde_output,
|
| 115 |
+
"--json_output", retrieval_output
|
| 116 |
+
])
|
| 117 |
+
|
| 118 |
+
# Step 3.3: reranking
|
| 119 |
+
print("π·οΈ Step 3.3: Reranking retrieved evidence ...")
|
| 120 |
+
rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k.json")
|
| 121 |
+
|
| 122 |
+
if not os.path.exists(rerank_output):
|
| 123 |
+
safe_run([
|
| 124 |
+
"python", "system/baseline/reranking_optimized.py",
|
| 125 |
+
"--target_data", retrieval_output,
|
| 126 |
+
"--json_output", rerank_output
|
| 127 |
+
])
|
| 128 |
+
|
| 129 |
+
# Step 3.4: question generation
|
| 130 |
+
print("β Step 3.4: Generating QA pairs ...")
|
| 131 |
+
reference_corpus = "system/baseline/train.json"
|
| 132 |
+
qa_output = os.path.join(hero_dir, "manifesto_icl_top_k_qa.json")
|
| 133 |
+
|
| 134 |
+
if not os.path.exists(qa_output):
|
| 135 |
+
safe_run([
|
| 136 |
+
"python", "system/baseline/question_generation_optimized.py",
|
| 137 |
+
"--reference_corpus", reference_corpus,
|
| 138 |
+
"--top_k_target_knowledge", rerank_output,
|
| 139 |
+
"--output_questions", qa_output,
|
| 140 |
+
"--model", "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
| 141 |
+
])
|
| 142 |
+
|
| 143 |
+
return {
|
| 144 |
+
"hyde": hyde_output,
|
| 145 |
+
"retrieved": retrieval_output,
|
| 146 |
+
"reranked": rerank_output,
|
| 147 |
+
"qa_pairs": qa_output
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
|
system/html2lines.py
ADDED
|
@@ -0,0 +1,105 @@
| 1 |
+
import sys
|
| 2 |
+
from time import sleep
|
| 3 |
+
import trafilatura
|
| 4 |
+
from trafilatura.meta import reset_caches
|
| 5 |
+
from trafilatura.settings import DEFAULT_CONFIG
|
| 6 |
+
import spacy
|
| 7 |
+
from lxml.etree import tostring
|
| 8 |
+
import lxml.etree
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
import spacy
|
| 12 |
+
import subprocess
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
nlp = spacy.load("en_core_web_lg")
|
| 16 |
+
except OSError:
|
| 17 |
+
print("π Downloading spaCy model 'en_core_web_lg' ...")
|
| 18 |
+
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
|
| 19 |
+
nlp = spacy.load("en_core_web_lg")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
|
| 23 |
+
MIN_CHAR = 50
|
| 24 |
+
MAX_CHAR = 5000
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_page(url):
|
| 28 |
+
page = None
|
| 29 |
+
for _ in range(3):
|
| 30 |
+
try:
|
| 31 |
+
page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
|
| 32 |
+
assert page is not None
|
| 33 |
+
print("Fetched " + url, file=sys.stderr)
|
| 34 |
+
break
|
| 35 |
+
except:
|
| 36 |
+
sleep(3)
|
| 37 |
+
return page
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def url2lines(url):
|
| 41 |
+
page = get_page(url)
|
| 42 |
+
|
| 43 |
+
if page is None:
|
| 44 |
+
return []
|
| 45 |
+
|
| 46 |
+
lines = html2lines(page)
|
| 47 |
+
return lines
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def line_correction(lines, max_size=100):
|
| 51 |
+
out_lines = []
|
| 52 |
+
for line in lines:
|
| 53 |
+
if len(line) < MIN_CHAR:
|
| 54 |
+
continue
|
| 55 |
+
|
| 56 |
+
if len(line) > max_size:
|
| 57 |
+
doc = nlp(
|
| 58 |
+
line[:MAX_CHAR]
|
| 59 |
+
) # We split lines into sentences, but for performance we take only the first 5k characters per line
|
| 60 |
+
stack = ""
|
| 61 |
+
for sent in doc.sents:
|
| 62 |
+
if len(stack) > 0:
|
| 63 |
+
stack += " "
|
| 64 |
+
stack += str(sent).strip()
|
| 65 |
+
if len(stack) > max_size:
|
| 66 |
+
out_lines.append(stack)
|
| 67 |
+
stack = ""
|
| 68 |
+
|
| 69 |
+
if (
|
| 70 |
+
len(stack) > MIN_CHAR
|
| 71 |
+
): # Ensure every lines in the out_lines suffice the MIN_CHAR restriction
|
| 72 |
+
out_lines.append(stack)
|
| 73 |
+
else:
|
| 74 |
+
out_lines.append(line)
|
| 75 |
+
|
| 76 |
+
return out_lines
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def html2lines(page):
|
| 80 |
+
out_lines = []
|
| 81 |
+
|
| 82 |
+
if page is None or len(page.strip()) == 0:  # check for None before calling .strip()
|
| 83 |
+
return out_lines
|
| 84 |
+
|
| 85 |
+
text = trafilatura.extract(page, config=DEFAULT_CONFIG)
|
| 86 |
+
reset_caches()
|
| 87 |
+
|
| 88 |
+
if text is None:
|
| 89 |
+
return out_lines
|
| 90 |
+
|
| 91 |
+
return text.split(
|
| 92 |
+
"\n"
|
| 93 |
+
) # We just spit out the entire page, so need to reformat later.
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def html2metadata(url):
|
| 97 |
+
page = get_page(url)
|
| 98 |
+
metadata = trafilatura.extract_metadata(page)
|
| 99 |
+
return metadata.as_dict()
|
| 100 |
+
|
| 101 |
+
if __name__ == "__main__":
|
| 102 |
+
url = "https://www.bbc.co.uk/news/61407508"
|
| 103 |
+
metadata = html2metadata(url)
|
| 104 |
+
text = " ".join(html2lines(page))
|
| 105 |
+
print(metadata)
|
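A small usage sketch (assumed, not from the commit) tying these helpers together the way scraper.py and generate_output.py do: fetch a page, normalise its text into lines, and read the title/date metadata. The URL is the same one used in the module's own __main__ block:

from system.html2lines import url2lines, line_correction, html2metadata

url = "https://www.bbc.co.uk/news/61407508"
lines = line_correction(url2lines(url))  # sentence-merged lines, each at least MIN_CHAR long
meta = html2metadata(url)                # dict of trafilatura metadata
print(meta.get("title"), meta.get("date"), len(lines))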
system/initial_searching.py
ADDED
|
@@ -0,0 +1,100 @@
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import requests
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import spacy
|
| 9 |
+
import subprocess
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
nlp = spacy.load("en_core_web_sm")
|
| 13 |
+
except OSError:
|
| 14 |
+
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
|
| 15 |
+
nlp = spacy.load("en_core_web_sm")
|
| 16 |
+
|
| 17 |
+
def clean_keywords(text):
|
| 18 |
+
doc = nlp(text)
|
| 19 |
+
keywords = []
|
| 20 |
+
for chunk in doc.noun_chunks:
|
| 21 |
+
words = [token.text for token in chunk if not token.is_stop and token.is_alpha]
|
| 22 |
+
if words:
|
| 23 |
+
cleaned_phrase = " ".join(words)
|
| 24 |
+
if len(cleaned_phrase) > 2:
|
| 25 |
+
keywords.append(cleaned_phrase)
|
| 26 |
+
return list(set(keywords))
|
| 27 |
+
|
| 28 |
+
def google_search(query, api_key, search_engine_id, start_date, end_date):
|
| 29 |
+
print(f"[SYSTEM] Calling Google Search API for: {query}")
|
| 30 |
+
sort = f"date:r:{start_date}:{end_date}"
|
| 31 |
+
url = "https://www.googleapis.com/customsearch/v1"
|
| 32 |
+
params = {
|
| 33 |
+
"q": query,
|
| 34 |
+
"key": api_key,
|
| 35 |
+
"cx": search_engine_id,
|
| 36 |
+
"num": 10,
|
| 37 |
+
"sort": sort,
|
| 38 |
+
"cr": "countryUK",
|
| 39 |
+
"gl": "uk"
|
| 40 |
+
}
|
| 41 |
+
try:
|
| 42 |
+
response = requests.get(url, params=params)
|
| 43 |
+
response.raise_for_status()
|
| 44 |
+
return response.json().get("items", [])
|
| 45 |
+
except Exception as e:
|
| 46 |
+
print(f"[ERROR] Google Search Failed: {e}")
|
| 47 |
+
return []
|
| 48 |
+
|
| 49 |
+
def save_tsv(file_path, claim_id, claim_text, url_list):
|
| 50 |
+
df = pd.DataFrame({
|
| 51 |
+
'ID': [claim_id] * len(url_list),
|
| 52 |
+
'String': ["claim"] * len(url_list),
|
| 53 |
+
'ListValue': url_list,
|
| 54 |
+
'query': [claim_text] * len(url_list)
|
| 55 |
+
})
|
| 56 |
+
df.to_csv(file_path, sep='\t', index=False, header=False)
|
| 57 |
+
|
| 58 |
+
def ensure_directory_exists(path):
|
| 59 |
+
dir_path = Path(path).expanduser().resolve().parent
|
| 60 |
+
if not str(dir_path).startswith("/home") and not str(dir_path).startswith("/data") and not str(dir_path).startswith("outputs"):
|
| 61 |
+
raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
|
| 62 |
+
dir_path.mkdir(parents=True, exist_ok=True)
|
| 63 |
+
|
| 64 |
+
def run_initial_searching(claim_text, pipeline_base_dir, start_date, end_date, user_id, claim_id):
|
| 65 |
+
api_key = os.environ.get("GOOGLE_API_KEY")
|
| 66 |
+
search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
|
| 67 |
+
if not api_key or not search_engine_id:
|
| 68 |
+
raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")
|
| 69 |
+
|
| 70 |
+
base_dir = pipeline_base_dir
|
| 71 |
+
manifesto_json_file = os.path.join(base_dir,"claim.json")
|
| 72 |
+
tsv_file_path = os.path.join(base_dir,"initial_search_results.tsv")
|
| 73 |
+
|
| 74 |
+
ensure_directory_exists(tsv_file_path)
|
| 75 |
+
|
| 76 |
+
claim_record = {"claim_id": claim_id, "claim": claim_text}
|
| 77 |
+
# if manifesto_json_file.exists():
|
| 78 |
+
# with open(manifesto_json_file, "r") as f:
|
| 79 |
+
# records = json.load(f)
|
| 80 |
+
# else:
|
| 81 |
+
records = []
|
| 82 |
+
records.append(claim_record)
|
| 83 |
+
with open(manifesto_json_file, "w") as f:
|
| 84 |
+
json.dump(records, f, indent=1)
|
| 85 |
+
|
| 86 |
+
urls = []
|
| 87 |
+
results = google_search(f"{claim_text}", api_key, search_engine_id, start_date, end_date)
|
| 88 |
+
urls += [r["link"] for r in results if "link" in r]
|
| 89 |
+
keywords = clean_keywords(claim_text)
|
| 90 |
+
keyword_text = " ".join(keywords)
|
| 91 |
+
# for kw in keywords:
|
| 92 |
+
# results = google_search(kw, api_key, search_engine_id, start_date, end_date)
|
| 93 |
+
# urls += [r["link"] for r in results if "link" in r]
|
| 94 |
+
results = google_search(keyword_text, api_key, search_engine_id, start_date, end_date)
|
| 95 |
+
urls += [r["link"] for r in results if "link" in r]
|
| 96 |
+
urls = list(dict.fromkeys(urls))
|
| 97 |
+
|
| 98 |
+
save_tsv(str(tsv_file_path), claim_id, claim_text, urls)
|
| 99 |
+
print(f"[SYSTEM] Saved {len(urls)} URLs for claim {claim_id} to {tsv_file_path}")
|
| 100 |
+
return str(tsv_file_path), str(manifesto_json_file)
|
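A hedged sketch (not in the commit) of calling run_initial_searching directly; GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in the environment, and the claim text, dates, and IDs below are placeholders:

from system.initial_searching import run_initial_searching

tsv_path, claim_json = run_initial_searching(
    claim_text="<speaker> : <pledge text> (<pledge date>)",  # mirrors the format built in run_pipeline
    pipeline_base_dir="outputs/example_run",  # must sit under outputs/ to pass ensure_directory_exists
    start_date="20250504",
    end_date="",
    user_id="demo",
    claim_id=0,
)
print(tsv_path, claim_json)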
system/pledge_tracking.py
ADDED
|
@@ -0,0 +1,247 @@
| 1 |
+
from huggingface_hub import login
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
import os, time
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
from system.initial_searching import run_initial_searching
|
| 7 |
+
from system.scraper import run_scraper
|
| 8 |
+
from system.hero_pipeline import run_hero_pipeline, run_hero_reranking
|
| 9 |
+
from system.augmented_searching import run_augmented_searching
|
| 10 |
+
from system.generate_output import process_manifesto_data_with_metadata
|
| 11 |
+
from system.ee import run_gpt4_event_extraction
|
| 12 |
+
from system.process_time import extract_and_sort_events
|
| 13 |
+
import spacy
|
| 14 |
+
import subprocess
|
| 15 |
+
from huggingface_hub import hf_hub_download
|
| 16 |
+
import json
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
spacy.load("en_core_web_sm")
|
| 20 |
+
except OSError:
|
| 21 |
+
print("π Downloading en_core_web_sm model ...")
|
| 22 |
+
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
|
| 23 |
+
nlp = spacy.load("en_core_web_sm")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def count_total_events(output_path):
|
| 27 |
+
with open(output_path, "r", encoding="utf-8") as f:
|
| 28 |
+
results = json.load(f)
|
| 29 |
+
|
| 30 |
+
total_events = 0
|
| 31 |
+
for result in results:
|
| 32 |
+
total_events+= len(result["output"]["events"])
|
| 33 |
+
|
| 34 |
+
print(f"{total_events} events in total")
|
| 35 |
+
return total_events
|
| 36 |
+
|
| 37 |
+
def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id, update_fn=None, suggestion_meta=None):
|
| 38 |
+
pipeline_base_dir = f"outputs/{timestamp}_{user_id}"
|
| 39 |
+
os.makedirs(pipeline_base_dir, exist_ok=True)
|
| 40 |
+
|
| 41 |
+
step_id=1
|
| 42 |
+
|
| 43 |
+
# Step 1: Google search
|
| 44 |
+
if suggestion_meta==None:
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
print("π Step 1: Initial searching ...")
|
| 48 |
+
initial_tsv_file, claim_json_path = run_initial_searching(
|
| 49 |
+
claim_text=f"{pledge_author} : {claim} ({pledge_date})",
|
| 50 |
+
# pledge_author=pledge_author,
|
| 51 |
+
pipeline_base_dir=pipeline_base_dir,
|
| 52 |
+
start_date=start_date,
|
| 53 |
+
end_date="",
|
| 54 |
+
user_id=user_id,
|
| 55 |
+
claim_id=0,
|
| 56 |
+
)
|
| 57 |
+
with open(initial_tsv_file, "r", encoding="utf-8") as f:
|
| 58 |
+
line_count = sum(1 for line in f)
|
| 59 |
+
if update_fn:
|
| 60 |
+
update_fn(step_id, f"{line_count} URLs are retrieved")
|
| 61 |
+
step_id+=1
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
print("π Step 2: Scraping URLs ...")
|
| 65 |
+
initial_data_store_dir = os.path.join(pipeline_base_dir, "initial_data_store")
|
| 66 |
+
os.makedirs(initial_data_store_dir, exist_ok=True)
|
| 67 |
+
initial_scraped_output_path = os.path.join(initial_data_store_dir, "0.jsonl")
|
| 68 |
+
run_scraper(initial_tsv_file, initial_scraped_output_path)
|
| 69 |
+
|
| 70 |
+
with open(initial_scraped_output_path, "r", encoding="utf-8") as f:
|
| 71 |
+
line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
|
| 72 |
+
if update_fn:
|
| 73 |
+
update_fn(step_id, f"{line_count} URL pages have been successefully scraped")
|
| 74 |
+
step_id+=1
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
print("π§ Step 3: HerO processing ...")
|
| 78 |
+
hero_output_dir = os.path.join(pipeline_base_dir, "hero")
|
| 79 |
+
os.makedirs(hero_output_dir, exist_ok=True)
|
| 80 |
+
run_hero_pipeline(pipeline_base_dir)
|
| 81 |
+
|
| 82 |
+
qa_file_path = os.path.join(hero_output_dir, "manifesto_icl_top_k_qa.json")
|
| 83 |
+
|
| 84 |
+
with open(qa_file_path, "r", encoding="utf-8") as f:
|
| 85 |
+
questions = {line["question"] for line in json.load(f)["evidence"]}
|
| 86 |
+
questions = list(questions)
|
| 87 |
+
line_count = len(questions)
|
| 88 |
+
if update_fn:
|
| 89 |
+
update_fn(step_id, f"{line_count} relevant queries are generated, for example:\n"
|
| 90 |
+
f" 1. {questions[0]}\n"
|
| 91 |
+
f" 2. {questions[1]}\n"
|
| 92 |
+
f" 3. {questions[2]}\n"
|
| 93 |
+
f" 4. {questions[3]}\n"
|
| 94 |
+
f" 5. {questions[4]}")
|
| 95 |
+
step_id+=1
|
| 96 |
+
|
| 97 |
+
else:
|
| 98 |
+
claim_json_path = None
|
| 99 |
+
initial_scraped_output_path = None
|
| 100 |
+
initial_tsv_file = None
|
| 101 |
+
hero_output_dir = None
|
| 102 |
+
qa_file_path = hf_hub_download(
|
| 103 |
+
repo_id="PledgeTracker/demo_feedback",
|
| 104 |
+
filename="manifesto_with_QA_icl_top_k_qa.json",
|
| 105 |
+
repo_type="dataset",
|
| 106 |
+
token=os.environ["HF_TOKEN"]
|
| 107 |
+
)
|
| 108 |
+
idx = suggestion_meta["index"]
|
| 109 |
+
qa_lines = open(f"{qa_file_path}","r").readlines()[idx]
|
| 110 |
+
questions = {line["question"] for line in json.loads(qa_lines)["evidence"]}
|
| 111 |
+
questions = list(questions)
|
| 112 |
+
line_count = len(questions)
|
| 113 |
+
if update_fn:
|
| 114 |
+
update_fn(step_id, f"relevant queries are generated, for example:\n"
|
| 115 |
+
f" 1. {questions[0]}\n"
|
| 116 |
+
f" 2. {questions[1]}\n"
|
| 117 |
+
f" 3. {questions[2]}\n"
|
| 118 |
+
f" 4. {questions[3]}\n"
|
| 119 |
+
f" 5. {questions[4]}")
|
| 120 |
+
step_id+=1
|
| 121 |
+
|
| 122 |
+
try:
|
| 123 |
+
augmented_tsv_file = run_augmented_searching(
|
| 124 |
+
qa_file=qa_file_path,
|
| 125 |
+
pledge_author=pledge_author,
|
| 126 |
+
pledge_date=pledge_date,
|
| 127 |
+
pipeline_base_dir=pipeline_base_dir,
|
| 128 |
+
start_date=start_date,
|
| 129 |
+
suggestion_meta=suggestion_meta,
|
| 130 |
+
end_date="",
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
with open(augmented_tsv_file, "r", encoding="utf-8") as f:
|
| 136 |
+
line_count = sum(1 for line in f)
|
| 137 |
+
if update_fn:
|
| 138 |
+
update_fn(step_id, f"{line_count} URLs are retrieved")
|
| 139 |
+
step_id+=1
|
| 140 |
+
except Exception as e:
|
| 141 |
+
if update_fn:
|
| 142 |
+
update_fn(step_id, f"β run_augmented_searching failed: {e}")
|
| 143 |
+
raise
|
| 144 |
+
|
| 145 |
+
augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
|
| 146 |
+
os.makedirs(augmented_data_store_dir, exist_ok=True)
|
| 147 |
+
|
| 148 |
+
try:
|
| 149 |
+
augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
|
| 150 |
+
run_scraper(augmented_tsv_file, augmented_scraped_output_path)
|
| 151 |
+
|
| 152 |
+
with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
|
| 153 |
+
line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
|
| 154 |
+
if update_fn:
|
| 155 |
+
update_fn(step_id, f"{line_count} URL pages have been successefully scraped")
|
| 156 |
+
step_id+=1
|
| 157 |
+
except Exception as e:
|
| 158 |
+
if update_fn:
|
| 159 |
+
update_fn(step_id, f"β run_scraper failed: {e}")
|
| 160 |
+
raise
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
try:
|
| 164 |
+
run_hero_reranking(pipeline_base_dir, suggestion_meta)
|
| 165 |
+
|
| 166 |
+
meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
|
| 167 |
+
all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
|
| 168 |
+
unique_urls = set()
|
| 169 |
+
with open(all_info_path, "r", encoding="utf-8") as f:
|
| 170 |
+
for line in f:
|
| 171 |
+
data = json.loads(line)
|
| 172 |
+
docs = data.get("evidence", [])
|
| 173 |
+
for doc in docs:
|
| 174 |
+
if "url" in doc:
|
| 175 |
+
unique_urls.add(doc["url"])
|
| 176 |
+
if update_fn:
|
| 177 |
+
update_fn(step_id, f"{len(unique_urls)} documents are selected")
|
| 178 |
+
step_id+=1
|
| 179 |
+
except Exception as e:
|
| 180 |
+
if update_fn:
|
| 181 |
+
update_fn(step_id, f"β run_hero_reranking failed: {e}")
|
| 182 |
+
raise
|
| 183 |
+
|
| 184 |
+
try:
|
| 185 |
+
extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)
|
| 186 |
+
|
| 187 |
+
events_num = count_total_events(extracted_event_path)
|
| 188 |
+
|
| 189 |
+
if update_fn:
|
| 190 |
+
update_fn(step_id, f"{events_num} events are extracted from those documents.")
|
| 191 |
+
step_id+=1
|
| 192 |
+
except Exception as e:
|
| 193 |
+
if update_fn:
|
| 194 |
+
update_fn(step_id, f"β Event extraction failed: {e}")
|
| 195 |
+
raise
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
print("π
Sorting events temporally ...")
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
sorted_events = extract_and_sort_events(
|
| 202 |
+
data_dir=pipeline_base_dir,
|
| 203 |
+
pledge_date=pledge_date,
|
| 204 |
+
pledge_author=pledge_author,
|
| 205 |
+
claim=claim,
|
| 206 |
+
suggestion_meta=suggestion_meta
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
df = pd.DataFrame(sorted_events)
|
| 210 |
+
sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
|
| 211 |
+
df.to_excel(sorted_event_path, index=False)
|
| 212 |
+
print(sorted_event_path)
|
| 213 |
+
|
| 214 |
+
if update_fn:
|
| 215 |
+
update_fn(step_id, "All done!")
|
| 216 |
+
step_id += 1
|
| 217 |
+
|
| 218 |
+
return {
|
| 219 |
+
"claim_json": claim_json_path,
|
| 220 |
+
"initial_scraped_jsonl": initial_scraped_output_path,
|
| 221 |
+
"initial_tsv_file": initial_tsv_file,
|
| 222 |
+
"hero_dir": hero_output_dir,
|
| 223 |
+
"augmented_scraped_jsonl": augmented_scraped_output_path,
|
| 224 |
+
"augmented_tsv_file": augmented_tsv_file,
|
| 225 |
+
"meta_data_dir": meta_data_dir,
|
| 226 |
+
"unsorted_events": extracted_event_path,
|
| 227 |
+
"sorted_events": sorted_event_path,
|
| 228 |
+
"step_id": step_id
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
if __name__ == "__main__":
|
| 233 |
+
start = time.time()
|
| 234 |
+
|
| 235 |
+
if os.environ.get("HF_TOKEN"):
|
| 236 |
+
login(token=os.environ["HF_TOKEN"])
|
| 237 |
+
else:
|
| 238 |
+
print("No Hugging Face token found in environment variable HF_TOKEN.")
|
| 239 |
+
|
| 240 |
+
claim = "βWe will support families with children by introducing free breakfast clubs in every primary schoolβ"
|
| 241 |
+
start_date = "20250504"
|
| 242 |
+
timestamp = "xxxxx"
|
| 243 |
+
user_id = "xxx"
|
| 244 |
+
|
| 245 |
+
outputs = run_pipeline(claim, pledge_date="xxxx-xx-xx", pledge_author="xxx", start_date=start_date, timestamp=timestamp, user_id=user_id)  # placeholder pledge metadata for a local smoke test
|
| 246 |
+
print("π― Pipeline finished. Outputs:", outputs)
|
| 247 |
+
print(f"β±οΈ Total time: {time.time() - start:.2f} seconds")
|
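The update_fn callback threaded through run_pipeline only ever receives a step index and a human-readable status message (see the update_fn(step_id, ...) calls above), so a console-only progress reporter can be as small as this sketch:

def print_progress(step_id, message):
    # Matches the (step_id, message) signature used by run_pipeline's update_fn calls.
    print(f"[step {step_id}] {message}")

# e.g. run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id, update_fn=print_progress)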
system/process_time.py
ADDED
|
@@ -0,0 +1,267 @@
| 1 |
+
import json
|
| 2 |
+
import datetime
|
| 3 |
+
import re
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import os, argparse
|
| 6 |
+
import random
|
| 7 |
+
import csv
|
| 8 |
+
from openai import OpenAI
|
| 9 |
+
from huggingface_hub import hf_hub_download
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def gpt_4o_useful(input):
|
| 16 |
+
client=OpenAI(api_key=os.environ.get("OAI"))
|
| 17 |
+
response = client.chat.completions.create(
|
| 18 |
+
model="gpt-4o",
|
| 19 |
+
messages=[
|
| 20 |
+
{
|
| 21 |
+
"role": "user",
|
| 22 |
+
"content": [
|
| 23 |
+
{
|
| 24 |
+
"type": "text",
|
| 25 |
+
"text": input
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
}
|
| 29 |
+
],
|
| 30 |
+
response_format={"type": "text"},
|
| 31 |
+
temperature=0.0000000001,
|
| 32 |
+
max_tokens=4096,
|
| 33 |
+
top_p=0,
|
| 34 |
+
frequency_penalty=0,
|
| 35 |
+
presence_penalty=0,
|
| 36 |
+
logprobs=True
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
text = response.choices[0].message.content
|
| 40 |
+
|
| 41 |
+
if response.choices[0].logprobs and response.choices[0].logprobs.content:
|
| 42 |
+
first_token_logprob = response.choices[0].logprobs.content[0]
|
| 43 |
+
token = first_token_logprob.token
|
| 44 |
+
logprob = first_token_logprob.logprob
|
| 45 |
+
else:
|
| 46 |
+
token = None
|
| 47 |
+
logprob = None
|
| 48 |
+
|
| 49 |
+
return text, token, logprob
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def get_ICL(data, top_k=None):
|
| 54 |
+
|
| 55 |
+
ICL =""
|
| 56 |
+
if top_k == None:
|
| 57 |
+
data = data
|
| 58 |
+
else:
|
| 59 |
+
# print(data)
|
| 60 |
+
data = data[:top_k]
|
| 61 |
+
for line in data:
|
| 62 |
+
# line = json.loads(line)
|
| 63 |
+
pledge = line["pledge"]
|
| 64 |
+
event = line["event_description"]
|
| 65 |
+
time = line["event_date"]
|
| 66 |
+
input=f"Pledge: {pledge}\nEvent Summary: {event} (Event Date: {time})\nIs this event summary useful to track the fulfilment of this pledge"
|
| 67 |
+
input = input.strip()
|
| 68 |
+
output = line["label"].strip()
|
| 69 |
+
ICL = f"{ICL}Input: {input}\nOutput: {output}\n\n"
|
| 70 |
+
return ICL
|
| 71 |
+
|
| 72 |
+
def load_json(file_path):
|
| 73 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 74 |
+
data = json.load(f)
|
| 75 |
+
return data
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=None):
|
| 79 |
+
|
| 80 |
+
if suggestion_meta:
|
| 81 |
+
# print(ICL_id)
|
| 82 |
+
|
| 83 |
+
train_data = [line for line in train_data if str(line.get("pledge_id")) == str(ICL_id)]
|
| 84 |
+
|
| 85 |
+
else:
|
| 86 |
+
random.seed(42)
|
| 87 |
+
random.shuffle(train_data)
|
| 88 |
+
|
| 89 |
+
ICL = get_ICL(train_data, top_k=50)
|
| 90 |
+
# print(ICL)
|
| 91 |
+
input = f"{instruction}\nBelow are examples:\n\n{ICL}Now, please assign a label for the below instance.\nInput: {test_instance}\nOutput:"
|
| 92 |
+
|
| 93 |
+
try:
|
| 94 |
+
text, tokens, logprobs = gpt_4o_useful(input)
|
| 95 |
+
except Exception as e:
|
| 96 |
+
print(e)
|
| 97 |
+
tokens = None
|
| 98 |
+
logprobs = None
|
| 99 |
+
|
| 100 |
+
return tokens, logprobs
|
| 101 |
+
|
| 102 |
+
def extract_columns_to_dict(file_path, delimiter='\t'):
|
| 103 |
+
|
| 104 |
+
data_dict = {}
|
| 105 |
+
|
| 106 |
+
with open(file_path, mode='r', encoding='utf-8') as file:
|
| 107 |
+
reader = csv.reader(file, delimiter=delimiter)
|
| 108 |
+
for row in reader:
|
| 109 |
+
if len(row) >= 4:
|
| 110 |
+
key = row[2]
|
| 111 |
+
value = row[3]
|
| 112 |
+
data_dict[key] = value
|
| 113 |
+
|
| 114 |
+
return data_dict
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
import datetime
|
| 118 |
+
import re
|
| 119 |
+
|
| 120 |
+
def parse_date(date_str):
|
| 121 |
+
if not date_str:
|
| 122 |
+
return None, date_str
|
| 123 |
+
date_str = date_str.strip()
|
| 124 |
+
|
| 125 |
+
# Case 1: YYYY-MM-DD
|
| 126 |
+
try:
|
| 127 |
+
return datetime.datetime.strptime(date_str, "%Y-%m-%d"), date_str
|
| 128 |
+
except ValueError:
|
| 129 |
+
pass
|
| 130 |
+
|
| 131 |
+
# Case 2: Relative date
|
| 132 |
+
match = re.search(r'(.*) \(relative to (\d{4}-\d{2}-\d{2})\)', date_str)
|
| 133 |
+
if match:
|
| 134 |
+
reference = datetime.datetime.strptime(match.group(2), "%Y-%m-%d")
|
| 135 |
+
relative_term = match.group(1).strip().lower()
|
| 136 |
+
if relative_term == "last month":
|
| 137 |
+
target_date = reference - datetime.timedelta(days=30)
|
| 138 |
+
elif relative_term == "yesterday":
|
| 139 |
+
target_date = reference - datetime.timedelta(days=1)
|
| 140 |
+
elif relative_term == "last week":
|
| 141 |
+
target_date = reference - datetime.timedelta(days=7)
|
| 142 |
+
elif relative_term == "this week":
|
| 143 |
+
target_date = reference
|
| 144 |
+
else:
|
| 145 |
+
return None, date_str
|
| 146 |
+
return target_date, date_str
|
| 147 |
+
|
| 148 |
+
# Case 3: YYYY
|
| 149 |
+
match = re.fullmatch(r'(\d{4})', date_str)
|
| 150 |
+
if match:
|
| 151 |
+
year = int(match.group(1))
|
| 152 |
+
return datetime.datetime(year, 1, 1), date_str
|
| 153 |
+
|
| 154 |
+
# Case 4: Month YYYY
|
| 155 |
+
match = re.fullmatch(r'(\w+) (\d{4})', date_str)
|
| 156 |
+
if match:
|
| 157 |
+
try:
|
| 158 |
+
target_date = datetime.datetime.strptime(date_str, "%B %Y")
|
| 159 |
+
return target_date, date_str
|
| 160 |
+
except ValueError:
|
| 161 |
+
return None, date_str
|
| 162 |
+
|
| 163 |
+
# Case 5: YYYY-QX
|
| 164 |
+
match = re.fullmatch(r'(\d{4})-Q(\d)', date_str)
|
| 165 |
+
if match:
|
| 166 |
+
year, quarter = int(match.group(1)), int(match.group(2))
|
| 167 |
+
month = (quarter - 1) * 3 + 1
|
| 168 |
+
return datetime.datetime(year, month, 1), date_str
|
| 169 |
+
|
| 170 |
+
# Case 6: YYYY Season
|
| 171 |
+
match = re.fullmatch(r'(\d{4}) (Spring|Summer|Autumn|Fall|Winter)', date_str, re.IGNORECASE)
|
| 172 |
+
if match:
|
| 173 |
+
year = int(match.group(1))
|
| 174 |
+
season_map = {"spring": 3, "summer": 6, "autumn": 9, "fall": 9, "winter": 12}
|
| 175 |
+
month = season_map[match.group(2).lower()]
|
| 176 |
+
return datetime.datetime(year, month, 1), date_str
|
| 177 |
+
|
| 178 |
+
return None, date_str
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+

def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggestion_meta):

    events = []

    # url_path = os.path.join(data_dir, "augmented_search_results.tsv")
    # url_query_dict = extract_columns_to_dict(file_path=url_path, delimiter='\t')

    pledge = claim.strip()

    file_path = os.path.join(data_dir, "gpt4_event_extraction", "gpt4o_results_0_claim.json")
    gpt4_results_json = load_json(file_path)

    # print(gpt4_results_json)
    train_file_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="train_useful.json",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"]
    )

    with open(train_file_path, "r", encoding="utf-8") as f:
        train_data = json.load(f)
    # print(train_data[0])

    instruction_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="instruction.txt",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"]
    )

    instruction = open(instruction_path, "r").read()

    map_file_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="mapping.txt",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"]
    )
    mapping_f = open(map_file_path, "r").readlines()
    mapping = {}

    for map_id, line in enumerate(mapping_f):
        mapping[map_id] = int(line.strip())

    # Map the UI suggestion index to its pledge id, used to pick an in-context example for gpt_eval.
    ICL_id = None
    if suggestion_meta:
        try:
            idx = int(suggestion_meta["index"])
            ICL_id = mapping.get(idx)
            print(f"[Suggestion] index: {idx} → pledge_id: {ICL_id}")
        except Exception as e:
            print(f"[Mapping error]: {e}")

    # Walk every extracted event, attach a parsed date and a GPT usefulness label.
    for doc in gpt4_results_json:
        mete_date = doc["date"]
        for event in doc.get("output", {}).get("events", []):
            parsed_date, original_date = parse_date(event["date"])

            if parsed_date:
                parsed_date_str = parsed_date.strftime("%Y-%m-%d")
                if parsed_date_str != mete_date:
                    event_date_and_pub_date = f"{parsed_date_str} ({mete_date})"
                else:
                    event_date_and_pub_date = parsed_date_str

                test_instance = f"Pledge: {pledge} (Speaker: {pledge_author}; Pledge Date: {pledge_date})\nEvent Summary: {event['event']} (Event Date: {original_date})\nIs this event summary useful to track the fulfilment of this pledge"

                label, score = gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=ICL_id)

                URL = doc["url"]
                events.append({
                    "date": original_date,
                    "event date (publication date if different)": event_date_and_pub_date,
                    "event": event["event"],
                    "url": URL,
                    "label": label,
                    "confident": score,
                })

    # Newest events first.
    events.sort(key=lambda x: parse_date(x["date"])[0], reverse=True)
    return events

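For orientation (again an editorial aside, not part of the commit): extract_and_sort_events returns one dict per extracted event, sorted newest-first. A hypothetical record, with every value invented; only the field names come from the code, and "label"/"confident" are produced by gpt_eval at runtime:

# Hypothetical example of one record in the list returned by extract_and_sort_events.
example_event = {
    "date": "March 2025",
    "event date (publication date if different)": "2025-03-01 (2025-03-04)",
    "event": "Example event summary extracted from an article.",
    "url": "https://example.org/article",
    "label": "useful",
    "confident": 0.9,
}
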
system/scraper.py
ADDED
@@ -0,0 +1,95 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import csv
import json
import fitz
import time
import requests
import pandas as pd
from time import sleep
from pathlib import Path
from system.html2lines import url2lines, line_correction, html2metadata

MAX_RETRIES = 3
TIMEOUT = 5  # seconds


def scrape_text_from_url(url, temp_name):
    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            break
        except requests.RequestException:
            if attempt < MAX_RETRIES - 1:
                sleep(3)

    if response is None or response.status_code == 503:
        return []

    if url.endswith(".pdf"):
        pdf_dir = Path("/tmp/pdf_dir")
        pdf_dir.mkdir(parents=True, exist_ok=True)
        pdf_path = pdf_dir / f"{temp_name}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        extracted_text = ""
        doc = fitz.open(str(pdf_path))
        for page in doc:
            extracted_text += page.get_text() or ""

        return line_correction(extracted_text.split("\n"))

    return line_correction(url2lines(url))

def process_row(row, claim_id):
    try:
        url = row[2]
        json_data = {
            "claim_id": claim_id,
            "type": row[1],
            "query": row[3],
            "url": url,
            "url2text": scrape_text_from_url(url, claim_id),
            "metadata": {}
        }
        meta = html2metadata(url)
        json_data["metadata"] = {
            "title": meta.get("title"),
            "date": meta.get("date")
        }
        return json_data
    except Exception as e:
        print(f"[WARN] Failed to scrape {row[2]}: {e}")
        return None

def run_scraper(tsv_file_path: str, output_jsonl_path: str, max_workers: int = 10):
    claim_id = Path(tsv_file_path).stem
    output_jsonl_path = Path(output_jsonl_path)
    output_jsonl_path.parent.mkdir(parents=True, exist_ok=True)

    if output_jsonl_path.exists():
        print(f"[INFO] Skipping processing as output file already exists: {output_jsonl_path}")
        return str(output_jsonl_path)

    try:
        df = pd.read_csv(tsv_file_path, sep="\t", header=None)
        print("[INFO] Data loaded successfully with Pandas.")
    except Exception as e:
        raise RuntimeError(f"[ERROR] Failed to load TSV: {e}")

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_row, row, claim_id) for _, row in df.iterrows()]
        for future in as_completed(futures):
            result = future.result()
            if result:
                results.append(result)

    with open(output_jsonl_path, "w", encoding="utf-8") as json_file:
        for item in results:
            json_file.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"[SYSTEM] Output saved to {output_jsonl_path}")
    return str(output_jsonl_path)
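A usage sketch for the scraper (editorial aside, not part of the commit). run_scraper reads a headerless TSV whose rows are indexed positionally by process_row (row[1] = search type, row[2] = URL, row[3] = query), scrapes each URL in a thread pool, and writes one JSON object per line; an existing output file is not re-scraped. The paths below are invented:

from system.scraper import run_scraper

# Hypothetical paths for illustration only.
out_path = run_scraper(
    tsv_file_path="/tmp/search_results/0.tsv",    # claim_id is taken from the file stem ("0")
    output_jsonl_path="/tmp/scraped/0.jsonl",     # one scraped document per line
    max_workers=10,
)
print(out_path)
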
test.html
ADDED
@@ -0,0 +1,553 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Pledge Tracker – Demo</title>
  <script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-50 text-gray-800">
  <header class="bg-white shadow py-4 sticky top-0 z-10">
    <div class="container mx-auto flex items-center justify-between px-4">
      <div class="flex items-center gap-2">
        <span class="text-2xl font-bold text-purple-600">🤖</span>
        <span class="font-semibold text-lg">Pledge Tracking</span>
      </div>
      <nav class="hidden md:flex gap-6 font-medium">
        <a class="hover:text-purple-600" href="#eval-response">Track Your Pledge</a>
        <a class="hover:text-purple-600" href="#about">About</a>
      </nav>
    </div>
  </header>

  <section class="py-16 bg-gradient-to-r from-purple-50 to-purple-50 text-center">
    <div class="container mx-auto px-4 max-w-3xl">
      <h1 class="text-3xl md:text-4xl font-extrabold mb-6">
        <span style="font-variant: small-caps; font-weight: bold;">PledgeTracker</span>: A System for Monitoring the Fulfilment of Pledges
      </h1>
      <div class="text-lg text-gray-600 leading-relaxed space-y-4 text-justify">
        <p>
          <span style="font-variant: small-caps;">PledgeTracker</span> is a system to monitor the fulfilment of political pledges. As part of this study, we will collect your inputs to help evaluate and improve the system. We may also collect your feedback if you submit it via the feedback form. No personal information will be collected, and all data will be anonymised and stored securely. By using the system, you agree to participate in this study under these conditions.
        </p>
        <p class="text-center">
          Please contact
          <a href="mailto:[email protected]" class="text-purple-600 underline">Andreas Vlachos</a>
          and
          <a href="mailto:[email protected]" class="text-purple-600 underline">Yulong Chen</a>
          if you have any concerns.
        </p>
      </div>
    </div>
  </section>

  <section id="eval-response" class="py-12">
    <div class="container mx-auto px-4 max-w-4xl">
      <!-- <h2 class="text-2xl font-bold mb-6">Track Manifesto Pledge</h2> -->
      <label for="claim" class="block text-sm font-medium mb-2">
        Please enter the pledge:
      </label>
      <textarea
        id="claim"
        class="w-full border rounded-lg p-3 h-40 focus:outline-none focus:ring-2 focus:ring-purple-500"
        placeholder="For example: 'We will support families with children by introducing free breakfast clubs in every primary school...'"
      ></textarea>

      <div id="similar-suggestions" class="mt-3 text-sm text-gray-600 hidden"></div>

      <div class="mt-4">
        <label for="pledge-date" class="block text-sm font-medium mb-2">
          When was this pledge made?
        </label>
        <div class="grid grid-cols-[1fr_auto] items-center gap-2">
          <input
            type="date"
            id="pledge-date"
            class="w-full border rounded-lg p-2"
          />
          <button
            onclick="setDefaultDate()"
            type="button"
            class="px-2 py-1 text-sm bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
          >
            Use default: 4th Jul 2024
          </button>
        </div>
        <div id="date-warning" class="text-sm text-red-600 mt-1 hidden">
          Please select a date or click the button to use the default.
        </div>
      </div>

      <div class="mt-4">
        <label for="pledge-author" class="block text-sm font-medium mb-2">
          Who made this pledge?
        </label>
        <div class="grid grid-cols-[1fr_auto] items-center gap-2">
          <input
            type="text"
            id="pledge-author"
            class="w-full border rounded-lg p-2"
            placeholder="Enter the name of the party or person"
          />
          <button
            onclick="setDefaultAuthor()"
            type="button"
            class="px-2 py-1 text-sm bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
          >
            Use default: Labour
          </button>
        </div>
        <div id="author-warning" class="text-sm text-red-600 mt-1 hidden">
          Please enter a speaker or click the button to use the default.
        </div>
      </div>

      <label for="time-range" class="block text-sm font-medium mt-4 mb-2">
        Please select a time range:
      </label>
      <select id="time-range" class="w-full border rounded-lg p-2">
        <option value="week">Past one week</option>
        <option value="month">Past one month</option>
        <!-- <option value="year">From when the pledge was made</option> -->
        <option value="since_pledge_date">From when the pledge was made</option>
      </select>

      <button
        id="check"
        class="mt-4 px-6 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
      >
        Let's track!
      </button>

      <div id="progress" class="mt-6 hidden border p-4 rounded-lg bg-white shadow">
        <h3 class="font-semibold mb-2">System Progress</h3>
        <div id="status" class="text-sm text-gray-800 font-normal leading-relaxed"></div>
      </div>

      <div id="result" class="mt-6 hidden border p-4 rounded-lg bg-white shadow">
        <h3 class="font-semibold mb-2">Result</h3>
        <p class="text-gray-700"></p>
      </div>
    </div>
  </section>

  <section id="about" class="py-12">
    <div class="container mx-auto px-4 max-w-4xl">
      <h2 class="text-2xl font-bold mb-6">About</h2>
      <p class="text-gray-700 leading-relaxed">
        <span style="font-variant: small-caps;">PledgeTracker</span> is a research prototype developed to support the monitoring of political pledge fulfilment.
        This demo is developed by researchers at the University of Cambridge, Queen Mary University London, and Full Fact.
      </p>
    </div>
  </section>

  <script>
    let suggestedPledge = null;
    let currentAbortController = null;
    const feedbackData = {};
    let lastUsedFile = null;
    let lastUserId = null;
    let lastTimestamp = null;
    const checkBtn = document.getElementById("check");

    const stepListStandard = {
      1: "Retrieving evidence related to the pledge",
      2: "Scraping documents from URLs",
      3: "Generating more queries based on the retrieved evidence",
      4: "Searching more articles",
      5: "Scraping documents from URLs",
      6: "Finding the most relevant documents",
      7: "Extracting events from top documents",
      8: "Sorting events temporally"
    };

    const stepListSuggestion = {
      1: "Generating queries to retrieve evidence",
      2: "Searching more articles",
      3: "Scraping documents from URLs",
      4: "Finding the most relevant documents",
      5: "Extracting events from top documents",
      6: "Sorting events temporally"
    };

    let stepList = stepListStandard;

    function renderStatus(statusDict) {
      let html = "<ul class='list-disc ml-6 space-y-1 text-sm'>";
      for (let step in stepList) {
        const raw = statusDict?.[step] || stepList[step];
        const content = raw.replace(/\n/g, "<br>");
        const prefix = statusDict?.[step] ? "✅" : "⏳";
        html += `<li>${prefix} Step ${step}: ${content}</li>`;
      }
      html += "</ul>";
      return html;
    }

    function setDefaultDate() {
      const input = document.getElementById("pledge-date");
      input.value = "2024-07-04";
      document.getElementById("date-warning").classList.add("hidden");
    }

    function setDefaultAuthor() {
      const input = document.getElementById("pledge-author");
      input.value = "Labour";
      document.getElementById("author-warning").classList.add("hidden");
    }

    // function setFeedback(index, answer) {
    //   feedbackData[index] = answer;
    //   const message = document.getElementById(`msg-${index}`);
    //   message.textContent = `✔ Selected: ${answer ? 'Yes' : 'No'}`;
    //   message.className = answer
    //     ? "text-sm text-green-600 mt-1"
    //     : "text-sm text-red-600 mt-1";
    // }
    function setFeedback(index, answer) {
      feedbackData[index] = answer;
      const message = document.getElementById(`msg-${index}`);

      let displayText = "";
      let colorClass = "";

      switch (answer) {
        case "not_relevant":
          displayText = "Not relevant";
          colorClass = "text-red-300";
          break;
        case "relevant_seen":
          displayText = "Relevant but already seen";
          colorClass = "text-gray-400";
          break;
        case "relevant_updated":
          displayText = "Relevant and up-to-date";
          colorClass = "text-blue-400";
          break;
      }

      message.textContent = `✔ Selected: ${displayText}`;
      message.className = `text-sm ${colorClass} mt-1`;
    }

    function pollStatus(userId, timestamp, statusElement) {
      if (window.pollIntervalId) {
        clearInterval(window.pollIntervalId);
      }

      window.pollIntervalId = setInterval(async () => {
        try {
          const res = await fetch(`/api/status?user_id=${userId}&timestamp=${timestamp}&_=${Date.now()}`);
          const data = await res.json();

          if (data.status) {
            statusElement.innerHTML = renderStatus(data.status);
          }

          const values = Object.values(data.status || {});
          const finalText = values.join(" ").toLowerCase();

          if (finalText.includes("done") || finalText.includes("finished")) {
            clearInterval(window.pollIntervalId);
            window.pollIntervalId = null;
            statusElement.innerHTML += `<div class="mt-2 text-green-600 font-semibold">✅ All done.</div>`;
            checkBtn.disabled = false;
            checkBtn.classList.remove("opacity-50", "cursor-not-allowed");

            suggestedPledge = null;

            const waitForFile = setInterval(() => {
              if (lastUsedFile) {
                clearInterval(waitForFile);
                loadEvents(lastUsedFile);
              }
            }, 200);
          } else if (Object.values(data.status || {}).some(v => v.startsWith("❌"))) {
            clearInterval(window.pollIntervalId);
            window.pollIntervalId = null;
            statusElement.innerHTML += `<div class="mt-2 text-red-600 font-semibold">❌ The process failed.</div>`;
            checkBtn.disabled = false;
            checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
          }
        } catch (err) {
          clearInterval(window.pollIntervalId);
          window.pollIntervalId = null;
          statusElement.innerHTML = `<div class="text-red-600">❌ Failed to check status: ${err.message}</div>`;
        }
      }, 2000);
    }


    async function submitAllFeedback() {
      const entries = Object.entries(feedbackData);
      if (entries.length === 0) {
        alert("No feedback to submit!");
        return;
      }
      const confirmed = confirm("By submitting feedback, you agree that your feedback may be collected for our analysis. Your data will be anonymised and stored securely. No personal information will be recorded. If you do not wish to take part, please cancel this submission.");
      if (!confirmed) return;

      const pledgeText = document.getElementById("claim").value.trim();

      const res = await fetch('/api/feedback', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          pledge: pledgeText,
          file: lastUsedFile,
          user_id: lastUserId,
          timestamp: lastTimestamp,
          feedback: entries.map(([index, answer]) => ({
            eventIndex: index,
            answer: answer
          }))
        })
      });

      alert(res.ok ? "✅ Feedback submitted successfully!" : "❌ Submission failed.");
    }

    async function loadEvents(file) {
      const resultBox = document.getElementById("result");
      const p = resultBox.querySelector("p");
      resultBox.classList.remove("hidden");

      try {
        const fileParam = encodeURIComponent(file);
        const eventsRes = await fetch(`/api/events?file=${fileParam}`);
        if (!eventsRes.ok) throw new Error("❌ Event file not found or malformed");
        const data = await eventsRes.json();
        if (!Array.isArray(data)) throw new Error("❌ Unexpected data format");

        if (data.length === 0) {
          p.innerHTML = `<div class="text-gray-500 italic">Sorry, we did not find any progress for this pledge.</div>`;
          return;
        }
        // p.innerHTML = `<strong>We have found ${data.length} events for this pledge.</strong><br><br>` +
        //   data.map((e, index) => `
        p.innerHTML =
          data.map((e, index) => `
            <div class="mb-6 border-b pb-4">
              🗓️ <b>${e.date}</b>: ${e.event}<br>
              🔗 <a href="${e.url}" target="_blank" class="text-purple-400 underline">Source</a>

              <div class="mt-3">
                <label class="block text-sm font-medium mb-2">How relevant is this event?</label>
                <div class="flex flex-wrap gap-2">
                  <button onclick="setFeedback(${index}, 'not_relevant')"
                    class="px-3 py-1.5 bg-gray-100 hover:bg-gray-200 border border-gray-300 rounded-lg text-gray-700">
                    Not relevant
                  </button>
                  <button onclick="setFeedback(${index}, 'relevant_seen')"
                    class="px-3 py-1.5 bg-blue-100 hover:bg-blue-200 border border-blue-300 rounded-lg text-blue-700">
                    Relevant but seen
                  </button>
                  <button onclick="setFeedback(${index}, 'relevant_updated')"
                    class="px-3 py-1.5 bg-green-100 hover:bg-green-200 border border-green-300 rounded-lg text-green-700">
                    Relevant & up-to-date
                  </button>
                </div>
                <div id="msg-${index}" class="text-sm mt-1"></div>
              </div>
            </div>
          `).join('') +
          `<button onclick="submitAllFeedback()" class="mt-6 px-4 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700">
            📤 Submit All Feedback
          </button>
          <button onclick="window.location.href='/download?file=${fileParam}'" class="mt-4 ml-4 px-4 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700">
            📅 Download Excel
          </button>`;
      } catch (err) {
        p.textContent = `❌ Failed to load timeline: ${err.message}`;
      }
    }

    let suggestTimer = null;
    document.getElementById("claim").addEventListener("input", () => {
      suggestedPledge = null;
      clearTimeout(suggestTimer);
      suggestTimer = setTimeout(fetchSuggestions, 300); // 300ms delay to avoid flooding
    });

    async function fetchSuggestions() {
      const claimText = document.getElementById("claim").value.trim();
      const suggestionBox = document.getElementById("similar-suggestions");

      if (!claimText) {
        suggestionBox.classList.add("hidden");
        return;
      }

      const res = await fetch("/api/similar-pledges", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ claim: claimText })
      });
      const data = await res.json();
      const suggestions = data.suggestions || [];

      if (suggestions.length === 0) {
        suggestionBox.classList.add("hidden");
      } else {
        const author = "Labour";
        const date = "2024-07-04";
        suggestionBox.innerHTML =
          "<div class='font-semibold mb-1'>💡 Are you fact-checking this pledge? </div>" +
          "<ul class='list-disc ml-6 mt-1'>" +
          suggestions.map(s => `
            <li class="mb-2">
              ${author}: ${s.text} (${date})
              <button
                onclick="useSuggestedPledge('${s.text.replace(/'/g, "\\'")}', ${s.index})"
                class="ml-2 px-2 py-1 text-xs bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500">
                Fact-check this pledge
              </button>
            </li>
          `).join("") +
          "</ul>";
        suggestionBox.classList.remove("hidden");
      }
    }


    checkBtn.addEventListener("click", async () => {
      const claim = document.getElementById("claim").value.trim();
      const pledgeDate = document.getElementById("pledge-date").value.trim();
      const pledgeAuthor = document.getElementById("pledge-author").value.trim();
      const statusElement = document.getElementById("status");
      const resultBox = document.getElementById("result");
      // resultBox.classList.remove("hidden");
      const p = resultBox.querySelector("p");

      let valid = true;
      if (!claim) {
        alert("Please enter the pledge text.");
        valid = false;
      }
      if (!pledgeDate) {
        document.getElementById("date-warning").classList.remove("hidden");
        valid = false;
      }
      if (!pledgeAuthor) {
        document.getElementById("author-warning").classList.remove("hidden");
        valid = false;
      }

      if (!valid) return;

      checkBtn.disabled = true;
      checkBtn.classList.add("opacity-50", "cursor-not-allowed");

      // document.getElementById("status").classList.remove("hidden");
      // statusElement.innerHTML = renderStatus({});
      // document.getElementById("result").classList.remove("hidden");
      // document.getElementById("progress").classList.remove("hidden");

      document.getElementById("status").innerHTML = "";
      document.getElementById("result").classList.add("hidden");
      document.getElementById("progress").classList.add("hidden");
      document.getElementById("result").querySelector("p").innerHTML = "";
      if (window.pollIntervalId) {
        clearInterval(window.pollIntervalId);
        window.pollIntervalId = null;
      }
      Object.keys(feedbackData).forEach(key => delete feedbackData[key]);
      lastUsedFile = null;
      lastUserId = null;
      lastTimestamp = null;

      // Show a placeholder message up front.
      document.getElementById("result").querySelector("p").textContent = "⏳ Please wait, checking...";
      document.getElementById("progress").classList.remove("hidden");
      document.getElementById("result").classList.remove("hidden");

      try {
        const timeRange = document.getElementById("time-range").value;
        // const pledgeDate = document.getElementById("pledge-date").value;
        // const pledgeAuthor = document.getElementById("pledge-author").value;
        if (currentAbortController) currentAbortController.abort();
        currentAbortController = new AbortController();
        const signal = currentAbortController.signal;
        let valid = true;

        stepList = (suggestedPledge !== null) ? stepListSuggestion : stepListStandard;

        if (!pledgeDate) {
          document.getElementById("date-warning").classList.remove("hidden");
          valid = false;
        }
        if (!pledgeAuthor) {
          document.getElementById("author-warning").classList.remove("hidden");
          valid = false;
        }
        if (!valid) return;

        const userId = Math.random().toString(36).substring(2, 10);
        const now = new Date();
        const timestamp = now.toISOString().replace(/[:.]/g, "-").slice(0, 19);
        statusElement.textContent = "";
        // pollStatus(userId, timestamp, p);
        pollStatus(userId, timestamp, document.getElementById("status"));

        const runRes = await fetch("/api/run-model", {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          signal: signal, // pass the abort signal to fetch itself rather than inside the JSON body
          body: JSON.stringify({
            claim,
            time_range: timeRange,
            pledge_date: pledgeDate,
            pledge_author: pledgeAuthor,
            user_id: userId,
            timestamp: timestamp,
            suggestion_meta: suggestedPledge
          })
        });

        const runData = await runRes.json();

        lastUsedFile = runData.file;
        lastUserId = runData.user_id;
        lastTimestamp = runData.timestamp;
      } catch (err) {
        if (err.name === "AbortError") {
          console.log("Previous request aborted.");
          checkBtn.disabled = false;
          checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
          return;
        }
        p.textContent = `❌ Failed to load timeline: ${err.message}`;
      }

    });


    async function useSuggestedPledge(text, index) {
      document.getElementById("claim").value = text;
      document.getElementById("pledge-author").value = "Labour";
      document.getElementById("pledge-date").value = "2024-07-04";
      suggestedPledge = { text, index };
      alert("✅ This pledge has been filled in. You can now click 'Let's track!'");
      await fetch("/api/log-similar-selection", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          selected_text: text,
          index: index
        })
      });
    }

  </script>
</body>
</html>
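Finally, a sketch of the backend responses this page assumes (an editorial aside inferred from pollStatus, loadEvents and the click handler above, not from app.py; the keys shown are the ones the front end reads, and all example values are invented):

# /api/status returns {"status": {...}} mapping step numbers to progress messages; a message
# containing "done"/"finished" stops polling, and one starting with "❌" marks failure.
example_status_response = {"status": {"1": "Retrieving evidence related to the pledge", "2": "..."}}

# /api/events?file=... returns a JSON array of event records; loadEvents reads date, event and url.
example_events_response = [
    {"date": "March 2025", "event": "An example event summary.", "url": "https://example.org/article"},
]

# /api/run-model returns at least {"file": ..., "user_id": ..., "timestamp": ...}, which the click
# handler stores for feedback submission and the Excel download link.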