yulongchen commited on
Commit
ad56f36
Β·
1 Parent(s): 7e3f286

Add system

Browse files
Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal runtime image for the Flask pledge-tracker demo (port 7860 = HF Spaces default).
FROM python:3.10-slim

WORKDIR /app
COPY . /app

# Install from requirements.txt: the previous hand-picked list (flask,
# flask-cors, pandas, openpyxl) was missing packages app.py imports at
# startup (huggingface_hub, scikit-learn, spacy, nltk, ...), so the
# container crashed on import.
RUN pip install --no-cache-dir -r requirements.txt

# app.py calls spacy.load("en_core_web_sm") at import time; fetch the model
# at build time so startup does not fail.
RUN python -m spacy download en_core_web_sm

EXPOSE 7860
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
  title: Pledge Tracker
3
- emoji: 🐨
4
- colorFrom: blue
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 5.44.1
8
  app_file: app.py
9
  pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Pledge Tracker
3
+ colorFrom: purple
4
+ colorTo: indigo
 
5
  sdk: gradio
6
+ sdk_version: 5.34.0
7
  app_file: app.py
8
  pinned: false
9
+ license: cc-by-nc-4.0
10
+ short_description: Track and fact-check pledges with supporting evidence.
11
+ ---
app.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, jsonify, send_file, request, send_from_directory
2
+ from flask_cors import CORS
3
+ import os, json, uuid, time
4
+ import pandas as pd
5
+ from datetime import datetime, timedelta
6
+ from huggingface_hub import HfApi
7
+ import sys
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from system.pledge_tracking import run_pipeline
11
+ from huggingface_hub import hf_hub_download
12
+ import spacy
13
+ import traceback
14
+ import threading
15
+
16
# Small English spaCy model, loaded once at import time (used by lemmatize()).
nlp = spacy.load("en_core_web_sm")

app = Flask(__name__, static_folder='.')
CORS(app)

# Hugging Face dataset used both to persist feedback logs and to fetch the
# list of reference pledges shown as suggestions.
HF_DATASET_REPO = "PledgeTracker/demo_feedback"
HF_TOKEN = os.environ.get("HF_TOKEN")
TMP_DIR = "tmp"
FEEDBACK_DIR = "feedback_logs"
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(FEEDBACK_DIR, exist_ok=True)

REFERENCE_PLEDGES = []

# Fix: use the already-read HF_TOKEN (and the HF_DATASET_REPO constant)
# instead of os.environ["HF_TOKEN"] / a duplicated repo string — the
# bracketed lookup raised KeyError when the variable was unset even though
# the rest of the app treats the token as optional via os.environ.get.
REFERENCE_PLEDGE_PATH = hf_hub_download(
    repo_id=HF_DATASET_REPO,
    filename="existing_pledges.txt",
    repo_type="dataset",
    token=HF_TOKEN
)

if os.path.exists(REFERENCE_PLEDGE_PATH):
    with open(REFERENCE_PLEDGE_PATH, "r") as f:
        # One pledge per line; skip blanks.
        REFERENCE_PLEDGES = [line.strip() for line in f if line.strip()]
else:
    print(f"Missing reference pledge file: {REFERENCE_PLEDGE_PATH}")
42
+
43
+
44
def lemmatize(text):
    """Return *text* as space-joined lemmas, dropping punctuation and whitespace tokens."""
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_punct or tok.is_space)
    ]
    return " ".join(lemmas)
47
+
48
+
49
@app.route("/api/similar-pledges", methods=["POST"])
def similar_pledges():
    """Suggest up to five reference pledges similar to the submitted claim.

    Similarity is cosine similarity over TF-IDF vectors of lemmatized
    text; only matches scoring above 0.3 are returned, best first.
    """
    payload = request.get_json()
    claim = payload.get("claim", "").strip()
    if not claim or not REFERENCE_PLEDGES:
        return jsonify({"suggestions": []})

    # Vectorize the claim together with the reference corpus so they share
    # one vocabulary; row 0 is the claim.
    corpus = [lemmatize(text) for text in [claim] + REFERENCE_PLEDGES]
    matrix = TfidfVectorizer().fit_transform(corpus)
    scores = cosine_similarity(matrix[0:1], matrix[1:]).flatten()

    candidates = [(idx, score) for idx, score in enumerate(scores) if score > 0.3]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    suggestions = [
        {"text": REFERENCE_PLEDGES[idx], "index": int(idx)}
        for idx, _ in candidates[:5]
    ]
    return jsonify({"suggestions": suggestions})
70
+
71
+
72
def calculate_time_range(option: str, pledge_date: str = None):
    """Compute the evidence-search window as (start, end) YYYYMMDD strings.

    option: "week" or "month" — a trailing window, bounded below by the
        pledge date so we never search before the pledge was made — or
        "since_pledge_date", which starts exactly at the pledge date.
    pledge_date: "YYYY-MM-DD" string or datetime (required).

    Raises ValueError for a missing/invalid pledge_date or unknown option.
    """
    today = datetime.today()

    # Fail early with a clear message: an empty string previously fell
    # through to strptime and raised a cryptic parse error, and the old
    # "since_pledge_date" missing-date check sat *after* the conversion,
    # so it was unreachable.
    if not pledge_date:
        raise ValueError("pledge_date is required")
    if isinstance(pledge_date, str):
        pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
    elif not isinstance(pledge_date, datetime):
        raise ValueError("pledge_date must be a str or datetime")

    if option == "week":
        start = max(today - timedelta(days=7), pledge_date)
    elif option == "month":
        start = max(today - timedelta(days=30), pledge_date)
    elif option == "since_pledge_date":
        start = pledge_date
    else:
        raise ValueError("Invalid time range option")
    return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")
95
+
96
@app.route("/")
def serve_html():
    """Serve the single-page front end (test.html) from the working directory."""
    return send_from_directory('.', 'test.html')
99
+
100
@app.route("/api/status")
def check_status():
    """Return the pipeline's step-status map for a given run.

    Query params user_id and timestamp together identify the run's status
    log under TMP_DIR. Returns {"status": {...}}; an empty map when the
    log does not exist yet or cannot be parsed mid-write.
    """
    user_id = request.args.get("user_id", "")
    timestamp = request.args.get("timestamp", "")
    log_name = f"{timestamp}_{user_id}_status.log"
    # Both values come straight from the query string; reject anything with
    # path separators so the join below cannot escape TMP_DIR.
    if os.path.basename(log_name) != log_name:
        return jsonify({"error": "invalid identifiers"}), 400
    log_file_path = os.path.join(TMP_DIR, log_name)
    if not os.path.exists(log_file_path):
        return jsonify({"status": {}}), 200
    try:
        with open(log_file_path, "r") as f:
            status = json.load(f)
    except Exception:
        # A partially-written file during an update is expected; report empty.
        status = {}

    return jsonify({"status": status})
114
+
115
+
116
@app.route("/api/run-model", methods=["POST"])
def run_model():
    """Run the full pledge-tracking pipeline for a submitted claim.

    Expects a JSON body with: claim, time_range ("week" / "month" /
    "since_pledge_date"), pledge_date, pledge_author, and optional
    suggestion_meta / timestamp / user_id (generated when absent).
    Writes sorted events to tmp/<timestamp>_<user_id>.json, logs the run
    to FEEDBACK_DIR, best-effort uploads the log + evidence TSV to the HF
    dataset, and returns the JSON file name plus the identifiers the
    client needs for follow-up /api/status, /api/events and /api/feedback
    calls. Any pipeline failure yields {"status": "error"} with HTTP 500.
    """
    data = request.get_json()
    claim = data.get("claim", "no input")
    time_range_option = data.get("time_range", "month")
    system_start_time = datetime.now()

    suggestion_meta = data.get("suggestion_meta")
    pledge_date = data.get("pledge_date", "")
    pledge_author = data.get("pledge_author", "")
    # Reuse client-supplied identifiers so a retry maps onto the same run.
    timestamp = data.get("timestamp") or time.strftime("%Y-%m-%d_%H-%M-%S")
    user_id = data.get("user_id") or str(uuid.uuid4())[:8]

    # Status file polled by /api/status while the pipeline runs.
    log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")

    # Serializes read-modify-write of the status file if the pipeline
    # reports progress from multiple threads.
    status_lock = threading.Lock()

    def update_status(step_id, msg):
        # Progress callback handed to run_pipeline: merge this step's
        # message into the JSON status map on disk.
        print(f"[STATUS] Step {step_id}: {msg}")
        with status_lock:
            if os.path.exists(log_file_path):
                try:
                    with open(log_file_path, "r") as f:
                        current = json.load(f)
                except Exception:
                    current = {}
            else:
                current = {}
            current[str(step_id)] = f"{msg}"
            with open(log_file_path, "w") as f:
                json.dump(current, f, indent=2)

    try:
        time_start, time_end = calculate_time_range(time_range_option, pledge_date=pledge_date)
        print(f"[DEMO] Received claim: {claim}")
        print(f"[DEMO] Time range: {time_start} ~ {time_end}")
        print(f"[DEMO] Pledge date range: {pledge_date}")

        update_status(0, "πŸ“Œ Starting the system ...")
        print(suggestion_meta)

        outputs = run_pipeline(
            claim, pledge_date, pledge_author, time_start, timestamp, user_id,
            update_fn=update_status, suggestion_meta=suggestion_meta
        )

        # The pipeline emits an Excel file of sorted events; convert it to
        # JSON so the front end can fetch it via /api/events.
        df = pd.read_excel(outputs["sorted_events"])
        json_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}.json")
        df.to_json(json_path, orient="records", indent=2)

        system_end_time = datetime.now()
        runtime = system_end_time - system_start_time

        events = df.to_dict(orient="records")
        log_entry = {
            "requested_time": timestamp,
            "user_id": user_id,
            "pledge": claim,
            "suggestion_meta": suggestion_meta,
            "time_start": time_start,
            "time_end": time_end,
            "runtime": runtime.total_seconds(),
            "pledge_author": pledge_author,
            "pledge_date": pledge_date,
            "events": events
        }
        default_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"

        with open(default_log_path, "w") as f:
            f.write(json.dumps(log_entry, indent=1))

        tsv_path = outputs["augmented_tsv_file"]

        # Best-effort upload of the run log and evidence TSV; a failure is
        # logged but never fails the user's request.
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=default_log_path,
                path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
            api.upload_file(
                path_or_fileobj=tsv_path,
                path_in_repo=f"logs/augmented_{timestamp}_{user_id}.tsv",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
        except Exception as e:
            traceback.print_exc()
            print(f"[Default Feedback Upload Error] {e}")

        return jsonify({
            "status": "success",
            "file": f"{timestamp}_{user_id}.json",
            "user_id": user_id,
            "timestamp": timestamp
        })
    except Exception as e:
        traceback.print_exc()
        return jsonify({"status": "error", "detail": str(e)}), 500
225
+
226
@app.route("/api/events")
def get_events():
    """Return the extracted-events JSON produced by a previous /api/run-model call.

    Query param "file" names a JSON file under TMP_DIR.
    """
    filename = request.args.get("file")
    if not filename:
        # os.path.join(TMP_DIR, None) previously raised TypeError (HTTP 500).
        return jsonify({"error": "Missing file param"}), 400
    if os.path.basename(filename) != filename:
        # The value is attacker-controlled; keep reads inside TMP_DIR.
        return jsonify({"error": "Invalid file name"}), 400
    file_path = os.path.join(TMP_DIR, filename)

    if not os.path.exists(file_path):
        return jsonify({"error": "File not found"}), 404

    with open(file_path, "r") as f:
        events = json.load(f)

    return jsonify(events)
238
+
239
+
240
@app.route("/api/feedback", methods=["POST"])
def receive_feedback():
    """Attach per-event user feedback to a stored run log and re-upload it.

    Expects JSON with: pledge, feedback (list of {eventIndex, answer}),
    file (events JSON filename under TMP_DIR), and the timestamp/user_id
    identifying the original run. Returns partial_success with HTTP 500
    when the local log is written but the HF upload fails.
    """
    data = request.get_json()
    pledge = data.get("pledge", "no_pledge_text")
    feedback_list = data.get("feedback", [])
    filename = data.get("file")

    timestamp = data.get("timestamp")
    user_id = data.get("user_id")

    if not user_id or not timestamp:
        return jsonify({'status': 'error', 'detail': 'Missing user_id or timestamp'}), 400

    # Missing name previously crashed os.path.join with TypeError; the
    # basename check also blocks path traversal out of TMP_DIR.
    if not filename or os.path.basename(filename) != filename:
        return jsonify({"error": "Event file not found"}), 400

    file_path = os.path.join(TMP_DIR, filename)
    if not os.path.exists(file_path):
        return jsonify({"error": "Event file not found"}), 400

    with open(file_path, "r") as f:
        events = json.load(f)

    # Carry run metadata over from the log written by /api/run-model.
    # Bug fix: every field read inside the try block now has a default, so
    # a missing or unreadable previous log no longer leaves pledge_author/
    # pledge_date/runtime unbound (NameError when building log_entry below —
    # the old code only pre-initialized the first three).
    suggestion_meta = None
    time_start = None
    time_end = None
    pledge_author = None
    pledge_date = None
    runtime = None
    try:
        prev_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
        with open(prev_log_path, "r") as f:
            previous_log = json.load(f)
        suggestion_meta = previous_log.get("suggestion_meta")
        time_start = previous_log.get("time_start")
        time_end = previous_log.get("time_end")
        pledge_author = previous_log.get("pledge_author")
        pledge_date = previous_log.get("pledge_date")
        runtime = previous_log.get("runtime")
    except Exception:
        # Best-effort: feedback is still recorded without run metadata.
        pass

    # Map event index -> answer and annotate each event (None when the
    # user gave no feedback for that event).
    feedback_dict = {int(item['eventIndex']): item['answer'] for item in feedback_list}
    for idx, event in enumerate(events):
        event["user_feedback"] = feedback_dict.get(idx)

    log_entry = {
        "requested_time": timestamp,
        "user_id": user_id,
        "pledge": pledge,
        "suggestion_meta": suggestion_meta,
        "time_start": time_start,
        "time_end": time_end,
        "runtime": runtime,
        "pledge_author": pledge_author,
        "pledge_date": pledge_date,
        "events": events
    }

    local_filename = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
    with open(local_filename, "w") as f:
        f.write(json.dumps(log_entry, indent=1))

    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=local_filename,
            path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )
    except Exception as e:
        return jsonify({'status': 'partial_success', 'error': str(e)}), 500

    return jsonify({'status': 'success'})
310
+
311
+
312
@app.route("/download-feedback/<filename>")
def download_feedback_file(filename):
    """Send a stored feedback log as an attachment.

    send_from_directory itself refuses paths that escape FEEDBACK_DIR, so
    the user-supplied filename is safe to pass through.
    """
    return send_from_directory(FEEDBACK_DIR, filename, as_attachment=True)
315
+
316
@app.route("/feedback-files")
def list_feedback_files():
    """Return the stored feedback log filenames, sorted alphabetically."""
    return jsonify(sorted(os.listdir(FEEDBACK_DIR)))
320
+
321
@app.route("/download")
def download_excel():
    """Convert a stored events JSON file to .xlsx and send it as an attachment.

    Query param "file" names a JSON file under TMP_DIR; the spreadsheet is
    written next to it with the extension swapped.
    """
    file = request.args.get("file")
    if not file:
        return "Missing file param", 400
    if os.path.basename(file) != file:
        # Query value is attacker-controlled; prevent ../ traversal out of TMP_DIR.
        return "Invalid file param", 400

    json_path = os.path.join(TMP_DIR, file)
    if not os.path.exists(json_path):
        return "Event file not found", 404

    with open(json_path, "r") as f:
        data = json.load(f)

    df = pd.DataFrame(data)
    xlsx_path = os.path.join(TMP_DIR, file.replace(".json", ".xlsx"))
    df.to_excel(xlsx_path, index=False)

    return send_file(xlsx_path, as_attachment=True)
339
+
340
+
341
if __name__ == '__main__':
    # Bind to all interfaces on 7860 — the port exposed by the Docker image.
    app.run(host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flask
2
+ flask_cors
3
+ pandas
4
+ openpyxl
5
+ huggingface_hub
6
+ PyMuPDF==1.23.25
7
+ huggingface_hub==0.30.2
8
+ lxml==5.3.1
9
+ nltk==3.9.1
10
+ numpy==2.2.6
11
+ openai==1.84.0
12
+ pandas==2.3.0
13
+ rank_bm25==0.2.2
14
+ Requests==2.32.3
15
+ scikit_learn==1.7.0
16
+ sentence_transformers==3.3.1
17
+ spacy==3.8.2
18
+ tiktoken==0.7.0
19
+ torch==2.6.0
20
+ tqdm
21
+ trafilatura==2.0.0
22
+ transformers==4.51.3
23
+ vllm==0.8.4
24
+ accelerate
25
+
system/.DS_Store ADDED
Binary file (8.2 kB). View file
 
system/__init__.py ADDED
File without changes
system/__pycache__/augmented_searching.cpython-312.pyc ADDED
Binary file (4.73 kB). View file
 
system/__pycache__/ee.cpython-312.pyc ADDED
Binary file (4.71 kB). View file
 
system/__pycache__/generate_output.cpython-312.pyc ADDED
Binary file (3.47 kB). View file
 
system/__pycache__/hero_pipeline.cpython-312.pyc ADDED
Binary file (6.22 kB). View file
 
system/__pycache__/html2lines.cpython-312.pyc ADDED
Binary file (3.15 kB). View file
 
system/__pycache__/initial_searching.cpython-312.pyc ADDED
Binary file (5.25 kB). View file
 
system/__pycache__/process_time.cpython-312.pyc ADDED
Binary file (8.95 kB). View file
 
system/__pycache__/scraper.cpython-312.pyc ADDED
Binary file (4.76 kB). View file
 
system/augmented_searching.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import requests
4
+ import pandas as pd
5
+ from pathlib import Path
6
+ from .date_verifier import is_after_start
7
+
8
def google_search(query, api_key, search_engine_id, start_date, end_date):
    """Run a Google Custom Search restricted to UK results within a date range.

    start_date / end_date are YYYYMMDD strings. Returns the raw result
    items (possibly empty); any request failure is logged and yields [].
    """
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,
        # e.g. date:r:20241230:20250130 limits results to the window.
        "sort": f"date:r:{start_date}:{end_date}",
        "cr": "countryUK",
        "gl": "uk",
    }
    try:
        response = requests.get("https://www.googleapis.com/customsearch/v1", params=params)
        response.raise_for_status()
        return response.json().get("items", [])
    except Exception as e:
        print(f"[ERROR] Google Search Failed: {e}")
        return []
28
+
29
def save_tsv(file_name, id_value, string_value, value_list, query):
    """Write four parallel columns (ID, String, ListValue, query) as a header-less TSV."""
    columns = {
        'ID': id_value,
        'String': string_value,
        'ListValue': value_list,
        'query': query,
    }
    pd.DataFrame(columns).to_csv(file_name, sep='\t', index=False, header=False)
39
+
40
def ensure_directory_exists(path):
    """Create the parent directory of *path*, restricted to known-safe roots.

    Allowed roots: /home, /data, or a project-relative "outputs" directory.
    Raises ValueError otherwise so the pipeline cannot create directories
    at arbitrary filesystem locations.
    """
    parent = Path(path).expanduser()
    resolved = parent.resolve().parent
    # Bug fix: the original applied startswith("outputs") to the *resolved*
    # (absolute) parent, which can never begin with a relative "outputs" —
    # so every relative output path was rejected regardless of location.
    # Test the unresolved parent for the relative case instead.
    allowed = (
        str(resolved).startswith("/home")
        or str(resolved).startswith("/data")
        or str(parent.parent).startswith("outputs")
    )
    if not allowed:
        raise ValueError(f"[ERROR] Unsafe path: {resolved}")
    resolved.mkdir(parents=True, exist_ok=True)
45
+
46
def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, pledge_date, start_date, end_date):
    """Collect candidate evidence URLs for a pledge via Google Custom Search.

    Queries once with the claim text and once per generated question
    (up to 10), filtering out duplicate links, the Full Fact government
    tracker, and pages published before start_date; writes the URLs as a
    header-less TSV (ID / source-type / URL / query) under
    pipeline_base_dir and returns its path.

    suggestion_meta: None for a free-text claim (qa_file is then a single
        JSON object), or a dict with "index" selecting one line of a
        JSONL qa_file for a pre-existing reference pledge.
    start_date / end_date: YYYYMMDD strings bounding the search window.
    Raises EnvironmentError when the Google API credentials are unset.
    """
    if suggestion_meta==None:
        # Free-text claim: qa_file holds one JSON object.
        qa_lines = open(f"{qa_file}","r").read()
        qa_lines = json.loads(qa_lines)
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        idx=0
    else:
        # Reference pledge: pick the matching line of the JSONL file.
        idx = suggestion_meta["index"]
        qa_lines = open(f"{qa_file}","r").readlines()[idx]
        qa_lines = json.loads(qa_lines)
        claim_text = f"{qa_lines['claim']}"

    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")

    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)

    urls = []
    string_values = []
    queries = []
    questions = []
    # NOTE(review): `questions` is still empty while the comprehension runs,
    # so the "not in questions" test never filters anything — duplicate
    # questions survive into the search loop. Confirm whether de-dup was intended.
    questions = [evidence["question"] for evidence in qa_lines["evidence"] if evidence["question"] not in questions]
    questions = questions[:10]

    # Search on the claim itself first; tag these URLs as "claim".
    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(f"{pledge_author}: {claim_text}")

    # Then one search per generated question; tag these URLs as "question".
    for question in questions:
        results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
        for result in results:
            if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
                string_values.append("question")
                urls.append(result["link"])
                queries.append(f"{question}")

    # Defensive order-preserving de-dup (the membership checks above should
    # already guarantee uniqueness).
    urls = list(dict.fromkeys(urls))

    save_tsv(str(tsv_file_path), [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
    return str(tsv_file_path)
system/baseline/hyde_fc_generation_optimized.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from vllm import LLM, SamplingParams
2
+ import json
3
+ import torch
4
+ import time
5
+ from datetime import datetime, timedelta
6
+ import argparse
7
+ from tqdm import tqdm
8
+ from typing import List, Dict, Any
9
+ import concurrent.futures
10
+
11
class VLLMGenerator:
    """Batched text generator around a vLLM model for HyDE-style fact-checking passages."""

    def __init__(self, model_name: str, n: int = 8, max_tokens: int = 512,
                 temperature: float = 0.7, top_p: float = 1.0,
                 frequency_penalty: float = 0.0, presence_penalty: float = 0.0,
                 stop: List[str] = ['\n\n\n'], batch_size: int = 32):
        # NOTE(review): the mutable list default for `stop` is shared across
        # instances; harmless as long as it is never mutated — confirm.
        self.device_count = torch.cuda.device_count()
        print(f"Initializing with {self.device_count} GPUs")
        self.llm = LLM(
            model=model_name,
            tensor_parallel_size=self.device_count,
            max_model_len=4096,
            gpu_memory_utilization=0.95,
            enforce_eager=True,
            trust_remote_code=True,
            # quantization="bitsandbytes",
            # dtype="half",
            # load_format="bitsandbytes",
            max_num_batched_tokens=4096,
            max_num_seqs=batch_size
        )
        self.sampling_params = SamplingParams(
            n=n,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            stop=stop,
            logprobs=1  # required so parse_response can rank samples by logprob
        )
        self.batch_size = batch_size
        self.tokenizer = self.llm.get_tokenizer()
        print(f"Initialization complete. Batch size: {batch_size}")

    def parse_response(self, responses):
        """Return, per response, its sampled texts sorted by total logprob (best first)."""
        all_outputs = []
        for response in responses:
            to_return = []
            for output in response.outputs:
                text = output.text.strip()
                try:
                    # Score a sample by the sum of its token logprobs.
                    logprob = sum(logprob_obj.logprob for item in output.logprobs for logprob_obj in item.values())
                except:
                    logprob = 0  # Fallback if logprobs aren't available
                to_return.append((text, logprob))
            texts = [r[0] for r in sorted(to_return, key=lambda tup: tup[1], reverse=True)]
            all_outputs.append(texts)
        return all_outputs

    def prepare_prompt(self, claim: str, model_name: str) -> str:
        """Build the generation prompt; chat-templated for all models except OLMo."""
        base_prompt = f"Please write a fact-checking article passage to support, refute, indicate not enough evidence, or present conflicting evidence regarding the claim.\nClaim: {claim}"

        if "OLMo" in model_name:
            return base_prompt
        else:
            messages = [{"role": "user", "content": base_prompt}]
            return self.tokenizer.apply_chat_template(messages, tokenize=False) + "<|start_header_id|>assistant<|end_header_id|>\n\nPassage: "

    def process_batch(self, batch: List[Dict[str, Any]], model_name: str) -> tuple[List[Dict[str, Any]], float]:
        """Generate passages for one batch, storing them under each example's 'hypo_fc_docs'.

        Returns (batch, elapsed_seconds); on failure the batch is returned
        unmodified and the error is printed.
        """
        start_time = time.time()
        prompts = [self.prepare_prompt(example["claim"], model_name) for example in batch]

        try:
            results = self.llm.generate(prompts, sampling_params=self.sampling_params)
            outputs = self.parse_response(results)

            for example, output in zip(batch, outputs):
                example['hypo_fc_docs'] = output

            batch_time = time.time() - start_time
            return batch, batch_time
        except Exception as e:
            print(f"Error processing batch: {str(e)}")
            return batch, time.time() - start_time
85
+
86
+ # def format_time(seconds: float) -> str:
87
+ # return str(timedelta(seconds=int(seconds)))
88
+
89
+ # def estimate_completion_time(start_time: float, processed_examples: int, total_examples: int) -> str:
90
+ # elapsed_time = time.time() - start_time
91
+ # examples_per_second = processed_examples / elapsed_time
92
+ # remaining_examples = total_examples - processed_examples
93
+ # estimated_remaining_seconds = remaining_examples / examples_per_second
94
+ # completion_time = datetime.now() + timedelta(seconds=int(estimated_remaining_seconds))
95
+ # return completion_time.strftime("%Y-%m-%d %H:%M:%S")
96
+
97
def main(args):
    """Generate hypothetical fact-checking passages for every claim in a dataset.

    Loads args.target_data (JSON list of examples with a "claim" field),
    runs VLLMGenerator over them in batches — each example gains a
    'hypo_fc_docs' list — and writes the augmented examples to
    args.json_output.
    """
    total_start_time = time.time()
    print(f"Script started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    print("Loading data...")
    with open(args.target_data, 'r', encoding='utf-8') as json_file:
        examples = json.load(json_file)
    print(f"Loaded {len(examples)} examples")

    print("Initializing generator...")
    generator = VLLMGenerator(
        model_name=args.model,
        batch_size=32
    )

    processed_data = []
    batches = [examples[i:i + generator.batch_size] for i in range(0, len(examples), generator.batch_size)]

    print(f"\nProcessing {len(batches)} batches...")
    with tqdm(total=len(examples), desc="Processing examples") as pbar:
        for batch_idx, batch in enumerate(batches, 1):
            processed_batch, _batch_time = generator.process_batch(batch, args.model)
            processed_data.extend(processed_batch)
            # Fix: the progress-bar update had been commented out together
            # with the timing code, so the bar never advanced.
            pbar.update(len(batch))

    with open(args.json_output, "w", encoding="utf-8") as output_json:
        json.dump(processed_data, output_json, ensure_ascii=False, indent=4)
156
+
157
if __name__ == "__main__":
    # CLI: input claims JSON, output augmented JSON, and the HF model id.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--target_data', default='data_store/averitec/dev.json')
    parser.add_argument('-o', '--json_output', default='data_store/hyde_fc.json')
    parser.add_argument('-m', '--model', default="meta-llama/Llama-3.1-8B-Instruct")
    args = parser.parse_args()
    main(args)
system/baseline/question_generation_optimized.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import time
4
+ import json
5
+ import nltk
6
+ from rank_bm25 import BM25Okapi
7
+ import numpy as np
8
+ import torch
9
+ from vllm import LLM, SamplingParams
10
+ from datetime import datetime, timedelta
11
+ from itertools import islice
12
+
13
+
14
def download_nltk_data(package_name, download_dir='nltk_data'):
    """Ensure an NLTK tokenizer package is available under *download_dir*.

    Registers download_dir on nltk's search path, then downloads the
    package only if a lookup under tokenizers/ fails.
    """
    os.makedirs(download_dir, exist_ok=True)
    nltk.data.path.append(download_dir)

    try:
        nltk.data.find(f'tokenizers/{package_name}')
    except LookupError:
        # Not present locally — fetch it.
        print(f"Downloading {package_name}...")
        nltk.download(package_name, download_dir=download_dir)
        print(f"Successfully downloaded {package_name}")
    else:
        print(f"Package '{package_name}' is already downloaded")
30
+
31
+ # def format_time(seconds):
32
+ # """Format time duration nicely."""
33
+ # return str(timedelta(seconds=round(seconds)))
34
+
35
def claim2prompts(example):
    """Yield (lookup_string, few-shot prompt) pairs for one annotated claim.

    For each question/answer pair, yields the normalized answer text
    (used as a BM25 lookup key) together with an
    "Example [NUMBER]: / Claim / Evidence / Question" prompt truncated to
    1500 characters. Questions with empty text are skipped.
    """
    header = "Example [NUMBER]:||Claim: " + example["claim"] + "||Evidence: "

    for question in example["questions"]:
        q_text = question["question"].strip()
        if not q_text:
            continue
        if not q_text.endswith("?"):
            q_text += "?"

        # Collect answer strings; boolean answers get their explanation appended.
        answer_strings = []
        for ans in question["answers"]:
            if ans["answer_type"] in ["Extractive", "Abstractive"]:
                answer_strings.append(ans["answer"])
            if ans["answer_type"] == "Boolean":
                answer_strings.append(
                    ans["answer"] + ", because " + ans["boolean_explanation"].lower().strip()
                )

        for answer_text in answer_strings:
            if answer_text[-1] not in [".", "!", ":", "?"]:
                answer_text += "."
            prompt = header + answer_text.strip() + "||Question: " + q_text
            # '||' acts as a newline placeholder so embedded newlines can be
            # flattened first.
            prompt = prompt.replace("\n", " ").replace("||", "\n")
            yield (answer_text, prompt[:1500])
+
63
+ def main(args):
64
+ # script_start = time.time()
65
+ # start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
66
+ # print(f"Script started at: {start_time}")
67
+ # print(f"Loading model: {args.model}")
68
+
69
+
70
+ download_nltk_data('punkt')
71
+ download_nltk_data('punkt_tab')
72
+
73
+ # Load and prepare reference corpus
74
+ # corpus_start = time.time()
75
+ with open(args.reference_corpus, "r", encoding="utf-8") as json_file:
76
+ train_examples = json.load(json_file)
77
+
78
+ prompt_corpus, tokenized_corpus = [], []
79
+ for example in train_examples:
80
+ for lookup_str, prompt in claim2prompts(example):
81
+ entry = nltk.word_tokenize(lookup_str)
82
+ tokenized_corpus.append(entry)
83
+ prompt_corpus.append(prompt)
84
+
85
+ prompt_bm25 = BM25Okapi(tokenized_corpus)
86
+ # print(f"Reference corpus processed in: {format_time(time.time() - corpus_start)}")
87
+
88
+ # Initialize vLLM with optimized settings
89
+ gpu_count = torch.cuda.device_count()
90
+ print(f"Using {gpu_count} GPU{'s' if gpu_count > 1 else ''}")
91
+
92
+ # model_start = time.time()
93
+ llm = LLM(
94
+ model=args.model,
95
+ tensor_parallel_size=gpu_count,
96
+ max_model_len=4096,
97
+ gpu_memory_utilization=0.95,
98
+ enforce_eager=True,
99
+ trust_remote_code=True,
100
+ # dtype="half",
101
+ )
102
+ llm.get_tokenizer().pad_token = "<|end_of_text|>"
103
+ # print(f"Model loaded in: {format_time(time.time() - model_start)}")
104
+
105
+ sampling_params = SamplingParams(
106
+ temperature=0.6,
107
+ top_p=0.9,
108
+ top_k=1,
109
+ skip_special_tokens=False,
110
+ max_tokens=512,
111
+ stop=['<|end_of_text|>', '</s>', '<|im_end|>', '[INST]', '[/INST]','<|eot_id|>','<|end|>','<|endoftext|>']
112
+ )
113
+
114
+ # processing_start = time.time()
115
+
116
+ # Load target data
117
+ target_examples = []
118
+ with open(args.top_k_target_knowledge, "r", encoding="utf-8") as json_file:
119
+ for line in json_file:
120
+ target_examples.append(json.loads(line))
121
+
122
+ if args.end == -1:
123
+ args.end = len(target_examples)
124
+ print(f"Processing {args.end} examples")
125
+
126
+ # Process in batches
127
+ with torch.no_grad():
128
+ with open(args.output_questions, "w", encoding="utf-8") as output_file:
129
+ for idx in range(0, args.end, args.batch_size):
130
+ batch_end = min(idx + args.batch_size, args.end)
131
+ current_batch = target_examples[idx:batch_end]
132
+ print(f"\nProcessing batch {idx}-{batch_end}...")
133
+
134
+ for example in current_batch:
135
+ # batch_start = time.time()
136
+ claim = example["claim"]
137
+ claim_id = example["claim_id"]
138
+ top_k_sentences_urls = example[f"top_{args.top_k}"]
139
+
140
+ batch_prompts = []
141
+ batch_metadata = []
142
+
143
+ # Prepare all prompts for current example
144
+ for sentences_urls in top_k_sentences_urls:
145
+ prompt_lookup_str = sentences_urls["sentence"]
146
+ url = sentences_urls["url"]
147
+
148
+ prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
149
+ prompt_n = 10
150
+ prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
151
+ prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
152
+
153
+ temp_prompt = "\n\n".join(prompt_docs)
154
+ for k in range(1, temp_prompt.count("[NUMBER]")+1):
155
+ temp_prompt = temp_prompt.replace("[NUMBER]", f"{k}", 1)
156
+
157
+ claim_prompt = "Your task is to generate a question based on the given claim and evidence. The question should clarify the relationship between the evidence and the claim\n\n"
158
+ evidence = prompt_lookup_str.replace("\n", " ")
159
+ full_prompt = claim_prompt + temp_prompt + "\n\nNow, generate a question that links the following claim and evidence:" + f"\n\nClaim: {claim}" + f"\nEvidence: {evidence}"
160
+
161
+ if "OLMo" in args.model:
162
+ inputs = [full_prompt]
163
+ else:
164
+ messages = [{"role":"user", "content":full_prompt}]
165
+ inputs = llm.get_tokenizer().apply_chat_template(messages, tokenize=False)
166
+ inputs += "<|start_header_id|>assistant<|end_header_id|>\n\nQuestion: "
167
+
168
+ batch_prompts.append(inputs)
169
+ batch_metadata.append((url, prompt_lookup_str))
170
+
171
+ # Process batch
172
+ outputs = llm.generate(batch_prompts, sampling_params)
173
+
174
+ # Process outputs
175
+ evidence = []
176
+ for output, (url, sent) in zip(outputs, batch_metadata):
177
+ question = output.outputs[0].text.strip().split("?")[0].replace("\n", " ") + "?"
178
+ evidence.append({
179
+ "question": question,
180
+ "answer": sent,
181
+ "url": url
182
+ })
183
+
184
+ # Write results
185
+ json_data = {
186
+ "claim_id": claim_id,
187
+ "claim": claim,
188
+ "evidence": evidence
189
+ }
190
+ output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
191
+ output_file.flush()
192
+
193
+ # batch_time = time.time() - batch_start
194
+ # print(f"Processed example {claim_id}. Time elapsed: {batch_time:.2f}s")
195
+
196
+ # Calculate and display timing information
197
+ # total_time = time.time() - script_start
198
+ # processing_time = time.time() - processing_start
199
+ # end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
200
+
201
+ # print("\nTiming Summary:")
202
+ # print(f"Start time: {start_time}")
203
+ # print(f"End time: {end_time}")
204
+ # print(f"Total runtime: {format_time(total_time)}")
205
+ # print(f"Setup time: {format_time(processing_start - script_start)}")
206
+ # print(f"Processing time: {format_time(processing_time)}")
207
+ # print(f"Results written to: {args.output_questions}")
208
+
209
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Use a prompt to generate questions that could be answered by top-k retrieved evidence. Output generated questions.")
    parser.add_argument("--model", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")
    parser.add_argument("--reference_corpus", default="baseline/train.json")
    parser.add_argument(
        "-i",
        "--top_k_target_knowledge",
        default="data_store/dev_reranking_top_k.json",
        help="Directory where the sentences for the scraped data is saved.",
    )
    # NOTE(review): the help text below duplicates the input option's wording;
    # this is actually the output JSONL path — confirm and reword.
    parser.add_argument(
        "-o",
        "--output_questions",
        default="data_store/dev_top_k_qa.json",
        help="Directory where the sentences for the scraped data is saved.",
    )
    parser.add_argument(
        "--top_k",
        default=10,
        type=int
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=4,
        help="Number of examples to process in each batch"
    )
    # -e/--end: stop after this many examples (-1 = all).
    parser.add_argument(
        "-e",
        "--end",
        type=int,
        default=-1
    )

    args = parser.parse_args()
    main(args)
system/baseline/reranking_optimized.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import gc
4
+ from transformers import AutoModel, AutoTokenizer
5
+ from sentence_transformers import SentenceTransformer
6
+ import numpy as np
7
+ import json
8
+ import argparse
9
+ import time
10
+ from datetime import datetime, timedelta
11
+ import re
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+
15
+ def encode_text(model, tokenizer, texts, batch_size=8, max_length=512):
16
+ """Encode texts to embeddings using AutoModel"""
17
+ all_embeddings = []
18
+
19
+ for i in range(0, len(texts), batch_size):
20
+ batch = texts[i:i + batch_size]
21
+
22
+ # Tokenize
23
+ encoded_input = tokenizer(
24
+ batch,
25
+ padding=True,
26
+ truncation=True,
27
+ max_length=max_length,
28
+ return_tensors='pt'
29
+ ).to(model.device)
30
+
31
+ # Compute token embeddings
32
+ with torch.no_grad():
33
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
34
+ model_output = model(**encoded_input)
35
+ # Use mean pooling
36
+ attention_mask = encoded_input['attention_mask']
37
+ token_embeddings = model_output[0] # First element contains token embeddings
38
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
39
+ embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
40
+ all_embeddings.append(embeddings.cpu().numpy())
41
+
42
+ # Clear some memory
43
+ if i % (batch_size * 4) == 0:
44
+ torch.cuda.empty_cache()
45
+ gc.collect()
46
+
47
+ return np.vstack(all_embeddings)
48
+
49
+ def compute_similarity(emb1, emb2):
50
+ """Compute cosine similarity between embeddings"""
51
+ return np.dot(emb1, emb2.T) / (
52
+ np.linalg.norm(emb1, axis=1).reshape(-1, 1) *
53
+ np.linalg.norm(emb2, axis=1).reshape(1, -1)
54
+ )
55
+
56
+ def get_detailed_instruct(task_description: str, query: str) -> str:
57
+ return f'Instruct: {task_description}\nQuery: {query}'
58
+
59
+ def preprocess_sentences(sentence1, sentence2):
60
+ vectorizer = TfidfVectorizer().fit_transform([sentence1, sentence2])
61
+ vectors = vectorizer.toarray()
62
+
63
+ cosine_sim = cosine_similarity(vectors)
64
+ similarity_score = cosine_sim[0][1]
65
+ return similarity_score
66
+
67
+ def remove_trailing_special_chars(text):
68
+ return re.sub(r'[\W_]+$', '', text)
69
+
70
+ def remove_special_chars_except_spaces(text):
71
+ return re.sub(r'[^\w\s]+', '', text)
72
+
73
+ def select_top_k(claim, results, top_k):
74
+ '''
75
+ remove sentence of similarity claim
76
+ '''
77
+ dup_check = set()
78
+ top_k_sentences_urls = []
79
+
80
+ i = 0
81
+ # print(results)
82
+ claim = remove_special_chars_except_spaces(claim).lower()
83
+ while len(top_k_sentences_urls) < top_k and i < len(results):
84
+ # print(i)
85
+ sentence = remove_special_chars_except_spaces(results[i]['sentence']).lower()
86
+
87
+ if sentence not in dup_check:
88
+ if preprocess_sentences(claim, sentence) > 0.97:
89
+ dup_check.add(sentence)
90
+ continue
91
+
92
+ if claim in sentence:
93
+ if len(claim) / len(sentence) > 0.92:
94
+ dup_check.add(sentence)
95
+ continue
96
+
97
+ top_k_sentences_urls.append({
98
+ 'sentence': results[i]['sentence'],
99
+ 'url': results[i]['url']}
100
+ )
101
+ i += 1
102
+
103
+ return top_k_sentences_urls
104
+
105
+ # def format_time(seconds):
106
+ # """Format time duration nicely."""
107
+ # return str(timedelta(seconds=round(seconds)))
108
+
109
+
110
+ def compute_embeddings_batched(model, texts, batch_size=8):
111
+ """Compute embeddings in smaller batches to manage memory"""
112
+ all_embeddings = []
113
+ for i in range(0, len(texts), batch_size):
114
+ batch = texts[i:i + batch_size]
115
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16): # Use bfloat16
116
+ emb = model.encode(batch, batch_size=len(batch), show_progress_bar=False)
117
+ all_embeddings.append(emb)
118
+
119
+ # Clear some memory
120
+ if i % (batch_size * 4) == 0:
121
+ torch.cuda.empty_cache()
122
+ gc.collect()
123
+
124
+ return np.vstack(all_embeddings)
125
+
126
+ def main(args):
127
+
128
+
129
+ device = "cuda" if torch.cuda.is_available() else 'cpu'
130
+ print(f"Using device: {device}")
131
+
132
+ # Load model and tokenizer
133
+ model = AutoModel.from_pretrained(
134
+ "Salesforce/SFR-Embedding-2_R",
135
+ torch_dtype=torch.bfloat16,
136
+ low_cpu_mem_usage=True,
137
+ device_map="auto"
138
+ )
139
+ tokenizer = AutoTokenizer.from_pretrained("Salesforce/SFR-Embedding-2_R")
140
+
141
+ # Load target examples
142
+ target_examples = []
143
+ with open(args.target_data, "r", encoding="utf-8") as json_file:
144
+ for i, line in enumerate(json_file):
145
+ try:
146
+ example = json.loads(r"{}".format(line))
147
+ target_examples.append(example)
148
+ except:
149
+ print(f"CURRENT LINE broken {i}")
150
+
151
+ if args.end == -1:
152
+ args.end = len(target_examples)
153
+
154
+ files_to_process = list(range(args.start, args.end))
155
+ total = len(files_to_process)
156
+
157
+ task = 'Given a web search query, retrieve relevant passages that answer the query'
158
+
159
+ with open(args.json_output, "w", encoding="utf-8") as output_json:
160
+ done = 0
161
+ for idx, example in enumerate(target_examples):
162
+ if idx in files_to_process:
163
+ print(f"Processing claim {example['claim_id']}... Progress: {done + 1} / {total}")
164
+
165
+ claim = example['claim']
166
+ query = [get_detailed_instruct(task, claim)] + [
167
+ get_detailed_instruct(task, le)
168
+ for le in example['hypo_fc_docs']
169
+ if len(le.strip()) > 0
170
+ ]
171
+ query_length = len(query)
172
+ sentences = [sent['sentence'] for sent in example[f'top_{5000}']][:args.retrieved_top_k]
173
+
174
+ # st = time.time()
175
+ try:
176
+ # Process query embeddings
177
+ query_embeddings = encode_text(model, tokenizer, query, batch_size=4)
178
+ avg_emb_q = np.mean(query_embeddings, axis=0)
179
+ hyde_vector = avg_emb_q.reshape((1, -1))
180
+
181
+ # Process sentence embeddings in smaller chunks
182
+ sentence_embeddings = encode_text(
183
+ model,
184
+ tokenizer,
185
+ sentences,
186
+ batch_size=args.batch_size
187
+ )
188
+
189
+ # Compute similarities in chunks to save memory
190
+ chunk_size = 1000
191
+ all_scores = []
192
+ for i in range(0, len(sentence_embeddings), chunk_size):
193
+ chunk = sentence_embeddings[i:i + chunk_size]
194
+ chunk_scores = compute_similarity(hyde_vector, chunk)[0]
195
+ all_scores.extend(chunk_scores)
196
+
197
+ scores = np.array(all_scores)
198
+ top_k_idx = np.argsort(scores)[::-1]
199
+ results = [example['top_5000'][i] for i in top_k_idx]
200
+ top_k_sentences_urls = select_top_k(claim, results, args.top_k)
201
+
202
+ # print(f"Top {args.top_k} retrieved. Time elapsed: {time.time() - st:.2f}s")
203
+
204
+ json_data = {
205
+ "claim_id": example['claim_id'],
206
+ "claim": claim,
207
+ f"top_{args.top_k}": top_k_sentences_urls
208
+ }
209
+ output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
210
+ output_json.flush()
211
+
212
+ except RuntimeError as e:
213
+ print(f"Error processing claim {example['claim_id']}: {e}")
214
+ continue
215
+
216
+ done += 1
217
+
218
+
219
+ if __name__ == "__main__":
220
+ parser = argparse.ArgumentParser()
221
+ parser.add_argument("--target_data", default="data_store/dev_retrieval_top_k.json")
222
+ parser.add_argument("--retrieved_top_k", type=int, default=5000)
223
+ parser.add_argument("--top_k", type=int, default=10)
224
+ parser.add_argument("-o", "--json_output", type=str, default="data_store/dev_reranking_top_k.json")
225
+ parser.add_argument("--batch_size", type=int, default=32)
226
+ parser.add_argument("-s", "--start", type=int, default=0)
227
+ parser.add_argument("-e", "--end", type=int, default=-1)
228
+ args = parser.parse_args()
229
+
230
+ main(args)
system/baseline/retrieval_optimized.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import time
5
+ import numpy as np
6
+ import pandas as pd
7
+ import nltk
8
+ from rank_bm25 import BM25Okapi
9
+ from multiprocessing import Pool, cpu_count, Manager, Lock
10
+ from functools import partial
11
+ import heapq
12
+ from threading import Thread, Event
13
+ import queue
14
+ from datetime import datetime, timedelta
15
+
16
+
17
+ def download_nltk_data(package_name, download_dir='nltk_data'):
18
+ # Ensure the download directory exists
19
+ os.makedirs(download_dir, exist_ok=True)
20
+
21
+ # Set NLTK data path
22
+ nltk.data.path.append(download_dir)
23
+
24
+ try:
25
+ # Try to find the resource
26
+ nltk.data.find(f'tokenizers/{package_name}')
27
+ print(f"Package '{package_name}' is already downloaded")
28
+ except LookupError:
29
+ # If resource isn't found, download it
30
+ print(f"Downloading {package_name}...")
31
+ nltk.download(package_name, download_dir=download_dir)
32
+ print(f"Successfully downloaded {package_name}")
33
+
34
+
35
+ def combine_all_sentences(knowledge_file):
36
+ sentences, urls = [], []
37
+
38
+ with open(knowledge_file, "r", encoding="utf-8") as json_file:
39
+ for i, line in enumerate(json_file):
40
+ data = json.loads(line)
41
+ sentences.extend(data["url2text"])
42
+ urls.extend([data["url"] for _ in range(len(data["url2text"]))])
43
+ return sentences, urls, i + 1 # NOTE(review): `i` is only bound inside the loop — an empty .jsonl file raises UnboundLocalError here; confirm inputs are never empty
44
+
45
+ def remove_duplicates(sentences, urls):
46
+ df = pd.DataFrame({"document_in_sentences":sentences, "sentence_urls":urls})
47
+ df['sentences'] = df['document_in_sentences'].str.strip().str.lower()
48
+ df = df.drop_duplicates(subset="sentences").reset_index()
49
+ return df['document_in_sentences'].tolist(), df['sentence_urls'].tolist()
50
+
51
+ def retrieve_top_k_sentences(query, document, urls, top_k):
52
+ tokenized_docs = [nltk.word_tokenize(doc) for doc in document[:top_k]] # NOTE(review): slicing to top_k BEFORE scoring limits BM25 to the first top_k sentences, not the best top_k — confirm this truncation is intended
53
+ bm25 = BM25Okapi(tokenized_docs)
54
+
55
+ scores = bm25.get_scores(nltk.word_tokenize(query))
56
+ top_k_idx = np.argsort(scores)[::-1][:top_k]
57
+
58
+ return [document[i] for i in top_k_idx], [urls[i] for i in top_k_idx]
59
+
60
+ def process_single_example(idx, example, args, result_queue, counter, lock):
61
+ try:
62
+ with lock:
63
+ current_count = counter.value + 1
64
+ counter.value = current_count
65
+ print(f"\nProcessing claim {idx}... Progress: {current_count} / {args.total_examples}")
66
+
67
+ # start_time = time.time()
68
+
69
+ document_in_sentences, sentence_urls, num_urls_this_claim = combine_all_sentences(
70
+ os.path.join(args.knowledge_store_dir, f"{idx}.jsonl")
71
+ )
72
+
73
+ print(f"Obtained {len(document_in_sentences)} sentences from {num_urls_this_claim} urls.")
74
+
75
+ document_in_sentences, sentence_urls = remove_duplicates(document_in_sentences, sentence_urls)
76
+
77
+ query = example["claim"] + " " + " ".join(example['hypo_fc_docs'])
78
+ top_k_sentences, top_k_urls = retrieve_top_k_sentences(
79
+ query, document_in_sentences, sentence_urls, args.top_k
80
+ )
81
+
82
+
83
+ result = {
84
+ "claim_id": idx,
85
+ "claim": example["claim"],
86
+ f"top_{args.top_k}": [
87
+ {"sentence": sent, "url": url}
88
+ for sent, url in zip(top_k_sentences, top_k_urls)
89
+ ],
90
+ "hypo_fc_docs": example['hypo_fc_docs']
91
+ }
92
+
93
+ result_queue.put((idx, result))
94
+ return True
95
+ except Exception as e:
96
+ print(f"Error processing example {idx}: {str(e)}")
97
+ result_queue.put((idx, None))
98
+ return False
99
+
100
+ def writer_thread(output_file, result_queue, total_examples, stop_event):
101
+ next_index = 0
102
+ pending_results = []
103
+
104
+ with open(output_file, "w", encoding="utf-8") as f:
105
+ while not (stop_event.is_set() and result_queue.empty()):
106
+ try:
107
+ idx, result = result_queue.get(timeout=1)
108
+
109
+ if result is not None:
110
+ heapq.heappush(pending_results, (idx, result))
111
+
112
+ while pending_results and pending_results[0][0] == next_index:
113
+ _, result_to_write = heapq.heappop(pending_results)
114
+ f.write(json.dumps(result_to_write, ensure_ascii=False) + "\n")
115
+ f.flush()
116
+ next_index += 1
117
+
118
+ except queue.Empty:
119
+ continue
120
+
121
+ # def format_time(seconds):
122
+ # """Format time duration nicely."""
123
+ # return str(timedelta(seconds=round(seconds)))
124
+
125
+ def main(args):
126
+
127
+
128
+
129
+ download_nltk_data('punkt')
130
+ download_nltk_data('punkt_tab')
131
+
132
+ with open(args.target_data, "r", encoding="utf-8") as json_file:
133
+ target_examples = json.load(json_file)
134
+
135
+ if args.end == -1:
136
+ args.end = len(target_examples)
137
+
138
+ print(f"Total examples to process: {args.end - args.start}")
139
+
140
+ files_to_process = list(range(args.start, args.end))
141
+ examples_to_process = [(idx, target_examples[idx]) for idx in files_to_process]
142
+
143
+ num_workers = min(args.workers if args.workers > 0 else cpu_count(), len(files_to_process))
144
+ print(f"Using {num_workers} workers to process {len(files_to_process)} examples")
145
+
146
+ with Manager() as manager:
147
+ counter = manager.Value('i', 0)
148
+ lock = manager.Lock()
149
+ args.total_examples = len(files_to_process)
150
+
151
+ result_queue = manager.Queue()
152
+
153
+ stop_event = Event()
154
+ writer = Thread(
155
+ target=writer_thread,
156
+ args=(args.json_output, result_queue, len(files_to_process), stop_event)
157
+ )
158
+ writer.start()
159
+
160
+ process_func = partial(
161
+ process_single_example,
162
+ args=args,
163
+ result_queue=result_queue,
164
+ counter=counter,
165
+ lock=lock
166
+ )
167
+
168
+ with Pool(num_workers) as pool:
169
+ results = pool.starmap(process_func, examples_to_process)
170
+
171
+ stop_event.set()
172
+ writer.join()
173
+
174
+ # successful = sum(1 for r in results if r)
175
+ # print(f"\nSuccessfully processed {successful} out of {len(files_to_process)} examples")
176
+ # print(f"Results written to {args.json_output}")
177
+
178
+ # # Calculate and display timing information
179
+ # total_time = time.time() - script_start
180
+ # avg_time = total_time / len(files_to_process)
181
+ # end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
182
+
183
+ # print("\nTiming Summary:")
184
+ # print(f"Start time: {start_time}")
185
+ # print(f"End time: {end_time}")
186
+ # print(f"Total runtime: {format_time(total_time)} (HH:MM:SS)")
187
+ # print(f"Average time per example: {avg_time:.2f} seconds")
188
+ # if successful > 0:
189
+ # print(f"Processing speed: {successful / total_time:.2f} examples per second")
190
+
191
+ if __name__ == "__main__":
192
+ parser = argparse.ArgumentParser(
193
+ description="Get top 10000 sentences with BM25 in the knowledge store using parallel processing."
194
+ )
195
+ parser.add_argument(
196
+ "-k",
197
+ "--knowledge_store_dir",
198
+ type=str,
199
+ default="data_store/knowledge_store",
200
+ help="The path of the knowledge_store_dir containing json files with all the retrieved sentences.",
201
+ )
202
+ parser.add_argument(
203
+ "--target_data",
204
+ type=str,
205
+ default="data_store/hyde_fc.json",
206
+ help="The path of the file that stores the claim.",
207
+ )
208
+ parser.add_argument(
209
+ "-o",
210
+ "--json_output",
211
+ type=str,
212
+ default="data_store/dev_retrieval_top_k.json",
213
+ help="The output dir for JSON files to save the top 100 sentences for each claim.",
214
+ )
215
+ parser.add_argument(
216
+ "--top_k",
217
+ default=5000,
218
+ type=int,
219
+ help="How many documents should we pick out with BM25.",
220
+ )
221
+ parser.add_argument(
222
+ "-s",
223
+ "--start",
224
+ type=int,
225
+ default=0,
226
+ help="Starting index of the files to process.",
227
+ )
228
+ parser.add_argument(
229
+ "-e",
230
+ "--end",
231
+ type=int,
232
+ default=-1,
233
+ help="End index of the files to process.",
234
+ )
235
+ parser.add_argument(
236
+ "-w",
237
+ "--workers",
238
+ type=int,
239
+ default=0,
240
+ help="Number of worker processes (default: number of CPU cores)",
241
+ )
242
+
243
+ args = parser.parse_args()
244
+ main(args)
system/baseline/train.json ADDED
The diff for this file is too large to render. See raw diff
 
system/date_verifier.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import re, trafilatura
3
+ from trafilatura.settings import DEFAULT_CONFIG
4
+
5
+ DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
6
+
7
+ _URL_DATE_PATS = [
8
+ re.compile(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})"), # 2025-07-03
9
+ re.compile(r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})"), # 2025/07/03
10
+ re.compile(r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})"), # 20250703
11
+ ]
12
+
13
+ def _meta_date(url: str):
14
+
15
+ page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
16
+ if not page:
17
+ return None
18
+ meta = trafilatura.extract_metadata(page)
19
+ if not meta or not meta.date:
20
+ return None
21
+ try:
22
+ return datetime.fromisoformat(meta.date)
23
+ except ValueError:
24
+
25
+ try:
26
+ return datetime.fromisoformat(meta.date.split("T")[0])
27
+ except Exception:
28
+ return None
29
+
30
+ def _regex_date(url: str):
31
+
32
+ for pat in _URL_DATE_PATS:
33
+ m = pat.search(url)
34
+ if m:
35
+ try:
36
+ return datetime(
37
+ int(m.group("y")), int(m.group("m")), int(m.group("d"))
38
+ )
39
+ except ValueError:
40
+ pass
41
+ return None
42
+
43
+
44
+ def is_after_start(url: str, start_ymd: str) -> bool:
45
+ """
46
+ - start_ymd: 'YYYYMMDD'
47
+ """
48
+ t0 = datetime.strptime(start_ymd, "%Y%m%d")
49
+
50
+ pub_dt = _meta_date(url)
51
+
52
+ if pub_dt is None:
53
+ pub_dt = _regex_date(url)
54
+
55
+ if pub_dt is None:
56
+ return True
57
+
58
+ return pub_dt >= t0
system/ee.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import argparse
4
+ from tqdm import tqdm
5
+ import tiktoken
6
+ from openai import OpenAI
7
+ from huggingface_hub import hf_hub_download
8
+
9
+ def gpt_4o(input_text):
10
+ client=OpenAI(api_key=os.environ.get("OAI"))
11
+ response = client.chat.completions.create(
12
+ model="gpt-4o",
13
+ messages=[
14
+ {"role": "user", "content": [{"type": "text", "text": input_text}]}
15
+ ],
16
+ response_format={"type": "json_object"},
17
+ temperature=0,
18
+ max_tokens=4096,
19
+ top_p=0,
20
+ frequency_penalty=0,
21
+ presence_penalty=0
22
+ )
23
+ return response.choices[0].message.content
24
+
25
+ def run_gpt4_event_extraction(data_dir, max_tokens=100000):
26
+
27
+ all_info_path = os.path.join(data_dir, "all_info_with_txt.json")
28
+ output_dir = os.path.join(data_dir, "gpt4_event_extraction")
29
+ os.makedirs(output_dir, exist_ok=True)
30
+ icl_path = hf_hub_download(
31
+ repo_id="PledgeTracker/demo_feedback",
32
+ filename="icl.txt",
33
+ repo_type="dataset",
34
+ token=os.environ["HF_TOKEN"]
35
+ )
36
+ ICL = open(icl_path, "r").read()
37
+ all_info = open(all_info_path, "r").readlines()
38
+
39
+ enc = tiktoken.encoding_for_model("gpt-4o")
40
+
41
+ for i, line in enumerate(all_info):
42
+ ID = i
43
+ urls = []
44
+ results = []
45
+
46
+ data = json.loads(line)
47
+ docs = data["evidence"]
48
+ claim = data["claim"]
49
+
50
+ output_path = os.path.join(output_dir, f"gpt4o_results_{ID}_claim.json")
51
+ if os.path.exists(output_path):
52
+ print(f"Already exist: {output_path}")
53
+
54
+ else:
55
+
56
+ for doc in tqdm(docs):
57
+ if doc["url"] in urls:
58
+ continue
59
+
60
+ text = " ".join(doc["text"])
61
+ input_text = (
62
+ f"{ICL}\nNow please only summarize events that are useful for verifying the pledge '{claim}', and their dates in the JSON format.\n\nInput:\n\nTitle: {doc['metadata']['title']}\n"
63
+ f"Date: {doc['metadata']['date']}\nArticle: {text}\nPledge: {claim}\n\n"
64
+ f"Output:\n"
65
+ )
66
+
67
+ urls.append(doc["url"])
68
+ text_tokens = enc.encode(input_text)
69
+ if len(text_tokens) > max_tokens:
70
+ input_text = enc.decode(text_tokens[:max_tokens])
71
+
72
+ try:
73
+ output = gpt_4o(input_text)
74
+ # print(f"GPT-4o Response: {output}")
75
+ results.append({
76
+ "url": doc["url"],
77
+ "title": doc["metadata"]["title"],
78
+ "date": doc["metadata"]["date"],
79
+ "article": text,
80
+ "output": json.loads(output)
81
+ })
82
+ except Exception as e:
83
+ print(f"Error processing doc: {e}")
84
+ continue
85
+
86
+
87
+ with open(output_path, "w", encoding="utf-8") as f:
88
+ json.dump(results, f, ensure_ascii=False, indent=4)
89
+
90
+ return output_path
91
+
92
+ if __name__ == "__main__":
93
+ parser = argparse.ArgumentParser(description="Run GPT-4o event extraction")
94
+ parser.add_argument("--data_dir", type=str, required=True, help="Root data directory")
95
+ parser.add_argument("--icl_path", type=str, required=True, help="Path to ICL prompt file")
96
+ parser.add_argument("--max_tokens", type=int, default=100000, help="Maximum token limit for input")
97
+
98
+ args = parser.parse_args()
99
+
100
+ run_gpt4_event_extraction(
101
+ data_dir=args.data_dir,
102
+ # NOTE: icl_path is not a parameter of run_gpt4_event_extraction — the ICL prompt is fetched from the Hub inside it
103
+ max_tokens=args.max_tokens
104
+ )
system/generate_output.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import argparse
4
+ from system.html2lines import html2metadata
5
+ from lxml.etree import tostring
6
+ import lxml.etree
7
+
8
+ def process_manifesto_data_with_metadata(input_base_dir: str):
9
+
10
+ input_file_path = os.path.join(input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json")
11
+ output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")
12
+
13
+ url2text_dir = os.path.join(input_base_dir, "augmented_data_store")
14
+
15
+ with open(input_file_path, "r", encoding="utf-8") as f:
16
+ input_file = f.readlines()
17
+
18
+ out_file = open(output_file_path, "w", encoding="utf-8")
19
+
20
+
21
+ i = 0
22
+
23
+ for id, line in enumerate(input_file):
24
+ line = json.loads(line)
25
+ claim = line["claim"]
26
+ QAs = line["top_50"]
27
+ new_line = {"claim": claim, "evidence": []}
28
+
29
+ json_path = os.path.join(url2text_dir, f"{id}.jsonl")
30
+ if not os.path.exists(json_path):
31
+ print(f"Warning: {json_path} not found")
32
+ continue
33
+
34
+ with open(json_path, "r", encoding="utf-8") as f:
35
+ try:
36
+ data_store = json.load(f)
37
+ except json.JSONDecodeError:
38
+ f.seek(0)
39
+ data_store = [json.loads(line) for line in f]
40
+
41
+ url_txt = {data["url"]: data["url2text"] for data in data_store}
42
+
43
+ meta_cache = {} # url -> metadata, so each URL is fetched at most once and repeated URLs reuse their own metadata
44
+ for j, QA in enumerate(QAs):
45
+ newQA = QA.copy()
46
+ URL = QA["url"]
47
+ newQA["text"] = url_txt.get(URL, "")
48
+
49
+ if URL not in meta_cache:
50
+ try:
51
+ meta = html2metadata(URL)
52
+ if isinstance(meta, lxml.etree._Element):
53
+ meta = tostring(meta, encoding="unicode", pretty_print=True)
54
+ meta_save = {
55
+ "title": meta["title"],
56
+ "date": meta["date"]
57
+ }
58
+ except Exception as e:
59
+ print(f"Metadata extraction failed for URL: {URL}, error: {e}")
60
+ meta_save = {
61
+ "title": "",
62
+ "date": ""
63
+ }
64
+ meta_cache[URL] = meta_save
65
+
66
+ newQA["metadata"] = meta_cache[URL]
67
+ new_line["evidence"].append(newQA)
68
+
69
+ out_file.write(json.dumps(new_line) + "\n")
70
+
71
+ out_file.close()
72
+ return output_file_path
73
+
74
+
75
+
system/hero_pipeline.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ import subprocess
4
+ from huggingface_hub import hf_hub_download
5
+ import json
6
+
7
+ def run_hero_reranking(pipeline_base_dir, suggestion_meta):
8
+ base_dir = f"{pipeline_base_dir}"
9
+ hero_dir = os.path.join(base_dir, "hero")
10
+ os.makedirs(hero_dir, exist_ok=True)
11
+
12
+ if suggestion_meta:
13
+ hyde_path = hf_hub_download(
14
+ repo_id="PledgeTracker/demo_feedback",
15
+ filename="manifesto_icl_hyde_fc.json",
16
+ repo_type="dataset",
17
+ token=os.environ["HF_TOKEN"]
18
+ )
19
+ with open(hyde_path, "r", encoding="utf-8") as f:
20
+ all_hyde_data = json.load(f)
21
+
22
+ idx = suggestion_meta["index"]
23
+ single_hyde = [all_hyde_data[idx]]
24
+ save_path = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
25
+ with open(save_path, "w", encoding="utf-8") as f:
26
+ json.dump(single_hyde, f, indent=2)
27
+
28
+ hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
29
+
30
+ def safe_run(cmd, timeout=600):
31
+ try:
32
+ print(f"πŸ‘‰ Running: {' '.join(str(x) for x in cmd)}")
33
+ subprocess.run(cmd, check=True, timeout=timeout)
34
+ except subprocess.CalledProcessError as e:
35
+ print(f"[❌ ERROR] Subprocess failed: {e}")
36
+ if e.stderr:
37
+ print("[stderr]:", e.stderr.decode())
38
+ raise
39
+ except subprocess.TimeoutExpired:
40
+ print(f"[❌ TIMEOUT] Command timed out: {' '.join(cmd)}")
41
+ raise
42
+
43
+ # Step 3.2: retrieval
44
+ print("πŸ” Step 3.2: Retrieval from knowledge store ...")
45
+ knowledge_store_dir = os.path.join(base_dir, "augmented_data_store")
46
+ retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k_QA.json")
47
+
48
+ if not os.path.exists(retrieval_output):
49
+ safe_run([
50
+ "python", "system/baseline/retrieval_optimized.py",
51
+ "--knowledge_store_dir", knowledge_store_dir,
52
+ "--target_data", hyde_output,
53
+ "--json_output", retrieval_output,
54
+ ])
55
+
56
+ # Step 3.3: reranking
57
+ print("🏷️ Step 3.3: Reranking retrieved evidence ...")
58
+ rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k_QA.json")
59
+
60
+ if not os.path.exists(rerank_output):
61
+ safe_run([
62
+ "python", "system/baseline/reranking_optimized.py",
63
+ "--target_data", retrieval_output,
64
+ "--json_output", rerank_output,
65
+ "--top_k", str(50),
66
+ ])
67
+
68
+ return {
69
+ "hyde": hyde_output,
70
+ "retrieved": retrieval_output,
71
+ "reranked": rerank_output,
72
+ }
73
+
74
+
75
+ def run_hero_pipeline(pipeline_base_dir):
76
+ base_dir = f"{pipeline_base_dir}"
77
+ hero_dir = os.path.join(base_dir, "hero")
78
+ os.makedirs(hero_dir, exist_ok=True)
79
+
80
+ target_data = os.path.join(base_dir, "claim.json")
81
+ hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
82
+
83
+ def safe_run(cmd, timeout=600):
84
+ try:
85
+ print(f"πŸ‘‰ Running: {' '.join(cmd)}")
86
+ subprocess.run(cmd, check=True, timeout=timeout)
87
+ except subprocess.CalledProcessError as e:
88
+ print(f"[❌ ERROR] Subprocess failed: {e}")
89
+ if e.stderr:
90
+ print("[stderr]:", e.stderr.decode())
91
+ raise
92
+ except subprocess.TimeoutExpired:
93
+ print(f"[❌ TIMEOUT] Command timed out: {' '.join(cmd)}")
94
+ raise
95
+
96
+ # Step 3.1: hyde_fc_generation
97
+ if not os.path.exists(hyde_output):
98
+ print("🧠 Step 3.1: HyDE ICL generation ...")
99
+ safe_run([
100
+ "python", "system/baseline/hyde_fc_generation_optimized.py",
101
+ "--target_data", target_data,
102
+ "--json_output", hyde_output
103
+ ])
104
+
105
+ # Step 3.2: retrieval
106
+ print("πŸ” Step 3.2: Retrieval from knowledge store ...")
107
+ knowledge_store_dir = os.path.join(base_dir, "initial_data_store")
108
+ retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k.json")
109
+
110
+ if not os.path.exists(retrieval_output):
111
+ safe_run([
112
+ "python", "system/baseline/retrieval_optimized.py",
113
+ "--knowledge_store_dir", knowledge_store_dir,
114
+ "--target_data", hyde_output,
115
+ "--json_output", retrieval_output
116
+ ])
117
+
118
+ # Step 3.3: reranking
119
+ print("🏷️ Step 3.3: Reranking retrieved evidence ...")
120
+ rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k.json")
121
+
122
+ if not os.path.exists(rerank_output):
123
+ safe_run([
124
+ "python", "system/baseline/reranking_optimized.py",
125
+ "--target_data", retrieval_output,
126
+ "--json_output", rerank_output
127
+ ])
128
+
129
+ # Step 3.4: question generation
130
+ print("❓ Step 3.4: Generating QA pairs ...")
131
+ reference_corpus = "system/baseline/train.json"
132
+ qa_output = os.path.join(hero_dir, "manifesto_icl_top_k_qa.json")
133
+
134
+ if not os.path.exists(qa_output):
135
+ safe_run([
136
+ "python", "system/baseline/question_generation_optimized.py",
137
+ "--reference_corpus", reference_corpus,
138
+ "--top_k_target_knowledge", rerank_output,
139
+ "--output_questions", qa_output,
140
+ "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct"
141
+ ])
142
+
143
+ return {
144
+ "hyde": hyde_output,
145
+ "retrieved": retrieval_output,
146
+ "reranked": rerank_output,
147
+ "qa_pairs": qa_output
148
+ }
149
+
150
+
system/html2lines.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from time import sleep
3
+ import trafilatura
4
+ from trafilatura.meta import reset_caches
5
+ from trafilatura.settings import DEFAULT_CONFIG
6
+ import spacy
7
+ from lxml.etree import tostring
8
+ import lxml.etree
9
+
10
+
11
+ import spacy
12
+ import subprocess
13
+
14
+ try:
15
+ nlp = spacy.load("en_core_web_lg")
16
+ except OSError:
17
+ print("πŸ” Downloading spaCy model 'en_core_web_lg' ...")
18
+ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
19
+ nlp = spacy.load("en_core_web_lg")
20
+
21
+
22
+ DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
23
+ MIN_CHAR = 50
24
+ MAX_CHAR = 5000
25
+
26
+
27
+ def get_page(url):
28
+ page = None
29
+ for _ in range(3):
30
+ try:
31
+ page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
32
+ assert page is not None
33
+ print("Fetched " + url, file=sys.stderr)
34
+ break
35
+ except:
36
+ sleep(3)
37
+ return page
38
+
39
+
40
+ def url2lines(url):
41
+ page = get_page(url)
42
+
43
+ if page is None:
44
+ return []
45
+
46
+ lines = html2lines(page)
47
+ return lines
48
+
49
+
50
+ def line_correction(lines, max_size=100):
51
+ out_lines = []
52
+ for line in lines:
53
+ if len(line) < MIN_CHAR:
54
+ continue
55
+
56
+ if len(line) > max_size:
57
+ doc = nlp(
58
+ line[:MAX_CHAR]
59
+ ) # We split lines into sentences, but for performance we take only the first 5k characters per line
60
+ stack = ""
61
+ for sent in doc.sents:
62
+ if len(stack) > 0:
63
+ stack += " "
64
+ stack += str(sent).strip()
65
+ if len(stack) > max_size:
66
+ out_lines.append(stack)
67
+ stack = ""
68
+
69
+ if (
70
+ len(stack) > MIN_CHAR
71
+ ): # Ensure every lines in the out_lines suffice the MIN_CHAR restriction
72
+ out_lines.append(stack)
73
+ else:
74
+ out_lines.append(line)
75
+
76
+ return out_lines
77
+
78
+
79
+ def html2lines(page):
80
+ out_lines = []
81
+
82
+ if page is None or len(page.strip()) == 0:
83
+ return out_lines
84
+
85
+ text = trafilatura.extract(page, config=DEFAULT_CONFIG)
86
+ reset_caches()
87
+
88
+ if text is None:
89
+ return out_lines
90
+
91
+ return text.split(
92
+ "\n"
93
+ ) # We just spit out the entire page, so need to reformat later.
94
+
95
+
96
+ def html2metadata(url):
97
+ page = get_page(url)
98
+ metadata = trafilatura.extract_metadata(page)
99
+ return metadata.as_dict()
100
+
101
+ if __name__ == "__main__":
102
+ url = "https://www.bbc.co.uk/news/61407508"
103
+ metadata = html2metadata(url)
104
+ text = " ".join(url2lines(url))
105
+ print(metadata)
system/initial_searching.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import time
4
+ import requests
5
+ import pandas as pd
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ import spacy
9
+ import subprocess
10
+
11
+ try:
12
+ nlp = spacy.load("en_core_web_sm")
13
+ except OSError:
14
+ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
15
+ nlp = spacy.load("en_core_web_sm")
16
+
17
+ def clean_keywords(text):
18
+ doc = nlp(text)
19
+ keywords = []
20
+ for chunk in doc.noun_chunks:
21
+ words = [token.text for token in chunk if not token.is_stop and token.is_alpha]
22
+ if words:
23
+ cleaned_phrase = " ".join(words)
24
+ if len(cleaned_phrase) > 2:
25
+ keywords.append(cleaned_phrase)
26
+ return list(set(keywords))
27
+
28
+ def google_search(query, api_key, search_engine_id, start_date, end_date):
29
+ print(f"[SYSTEM] Calling Google Search API for: {query}")
30
+ sort = f"date:r:{start_date}:{end_date}"
31
+ url = "https://www.googleapis.com/customsearch/v1"
32
+ params = {
33
+ "q": query,
34
+ "key": api_key,
35
+ "cx": search_engine_id,
36
+ "num": 10,
37
+ "sort": sort,
38
+ "cr": "countryUK",
39
+ "gl": "uk"
40
+ }
41
+ try:
42
+ response = requests.get(url, params=params)
43
+ response.raise_for_status()
44
+ return response.json().get("items", [])
45
+ except Exception as e:
46
+ print(f"[ERROR] Google Search Failed: {e}")
47
+ return []
48
+
49
+ def save_tsv(file_path, claim_id, claim_text, url_list):
50
+ df = pd.DataFrame({
51
+ 'ID': [claim_id] * len(url_list),
52
+ 'String': ["claim"] * len(url_list),
53
+ 'ListValue': url_list,
54
+ 'query': [claim_text] * len(url_list)
55
+ })
56
+ df.to_csv(file_path, sep='\t', index=False, header=False)
57
+
58
+ def ensure_directory_exists(path):
59
+ dir_path = Path(path).expanduser().resolve().parent
60
+ if not str(dir_path).startswith("/home") and not str(dir_path).startswith("/data") and not str(dir_path).startswith("outputs"):
61
+ raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
62
+ dir_path.mkdir(parents=True, exist_ok=True)
63
+
64
+ def run_initial_searching(claim_text, pipeline_base_dir, start_date, end_date, user_id, claim_id):
65
+ api_key = os.environ.get("GOOGLE_API_KEY")
66
+ search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
67
+ if not api_key or not search_engine_id:
68
+ raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")
69
+
70
+ base_dir = pipeline_base_dir
71
+ manifesto_json_file = os.path.join(base_dir,"claim.json")
72
+ tsv_file_path = os.path.join(base_dir,"initial_search_results.tsv")
73
+
74
+ ensure_directory_exists(tsv_file_path)
75
+
76
+ claim_record = {"claim_id": claim_id, "claim": claim_text}
77
+ # if manifesto_json_file.exists():
78
+ # with open(manifesto_json_file, "r") as f:
79
+ # records = json.load(f)
80
+ # else:
81
+ records = []
82
+ records.append(claim_record)
83
+ with open(manifesto_json_file, "w") as f:
84
+ json.dump(records, f, indent=1)
85
+
86
+ urls = []
87
+ results = google_search(f"{claim_text}", api_key, search_engine_id, start_date, end_date)
88
+ urls += [r["link"] for r in results if "link" in r]
89
+ keywords = clean_keywords(claim_text)
90
+ keyword_text = " ".join(keywords)
91
+ # for kw in keywords:
92
+ # results = google_search(kw, api_key, search_engine_id, start_date, end_date)
93
+ # urls += [r["link"] for r in results if "link" in r]
94
+ results = google_search(keyword_text, api_key, search_engine_id, start_date, end_date)
95
+ urls += [r["link"] for r in results if "link" in r]
96
+ urls = list(dict.fromkeys(urls))
97
+
98
+ save_tsv(str(tsv_file_path), claim_id, claim_text, urls)
99
+ print(f"[SYSTEM] Saved {len(urls)} URLs for claim {claim_id} to {tsv_file_path}")
100
+ return str(tsv_file_path), str(manifesto_json_file)
system/pledge_tracking.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import login
2
+ from datetime import datetime
3
+ import os, time
4
+ import pandas as pd
5
+
6
+ from system.initial_searching import run_initial_searching
7
+ from system.scraper import run_scraper
8
+ from system.hero_pipeline import run_hero_pipeline, run_hero_reranking
9
+ from system.augmented_searching import run_augmented_searching
10
+ from system.generate_output import process_manifesto_data_with_metadata
11
+ from system.ee import run_gpt4_event_extraction
12
+ from system.process_time import extract_and_sort_events
13
+ import spacy
14
+ import subprocess
15
+ from huggingface_hub import hf_hub_download
16
+ import json
17
+
18
+ try:
19
+ spacy.load("en_core_web_sm")
20
+ except OSError:
21
+ print("πŸ” Downloading en_core_web_sm model ...")
22
+ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
23
+ nlp = spacy.load("en_core_web_sm")
24
+
25
+
26
+ def count_total_events(output_path):
27
+ with open(output_path, "r", encoding="utf-8") as f:
28
+ results = json.load(f)
29
+
30
+ total_events = 0
31
+ for result in results:
32
+ total_events+= len(result["output"]["events"])
33
+
34
+ print(f"{total_events} events in total")
35
+ return total_events
36
+
37
+ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id, update_fn=None, suggestion_meta=None):
38
+ pipeline_base_dir = f"outputs/{timestamp}_{user_id}"
39
+ os.makedirs(pipeline_base_dir, exist_ok=True)
40
+
41
+ step_id=1
42
+
43
+ # Step 1: Google 搜紒
44
+ if suggestion_meta==None:
45
+
46
+
47
+ print("πŸ” Step 1: Initial searching ...")
48
+ initial_tsv_file, claim_json_path = run_initial_searching(
49
+ claim_text=f"{pledge_author} : {claim} ({pledge_date})",
50
+ # pledge_author=pledge_author,
51
+ pipeline_base_dir=pipeline_base_dir,
52
+ start_date=start_date,
53
+ end_date="",
54
+ user_id=user_id,
55
+ claim_id=0,
56
+ )
57
+ with open(initial_tsv_file, "r", encoding="utf-8") as f:
58
+ line_count = sum(1 for line in f)
59
+ if update_fn:
60
+ update_fn(step_id, f"{line_count} URLs are retrieved")
61
+ step_id+=1
62
+
63
+
64
+ print("🌐 Step 2: Scraping URLs ...")
65
+ initial_data_store_dir = os.path.join(pipeline_base_dir, "initial_data_store")
66
+ os.makedirs(initial_data_store_dir, exist_ok=True)
67
+ initial_scraped_output_path = os.path.join(initial_data_store_dir, "0.jsonl")
68
+ run_scraper(initial_tsv_file, initial_scraped_output_path)
69
+
70
+ with open(initial_scraped_output_path, "r", encoding="utf-8") as f:
71
+ line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
72
+ if update_fn:
73
+ update_fn(step_id, f"{line_count} URL pages have been successefully scraped")
74
+ step_id+=1
75
+
76
+
77
+ print("🧠 Step 3: HerO processing ...")
78
+ hero_output_dir = os.path.join(pipeline_base_dir, "hero")
79
+ os.makedirs(hero_output_dir, exist_ok=True)
80
+ run_hero_pipeline(pipeline_base_dir)
81
+
82
+ qa_file_path = os.path.join(hero_output_dir, "manifesto_icl_top_k_qa.json")
83
+
84
+ with open(qa_file_path, "r", encoding="utf-8") as f:
85
+ questions = {line["question"] for line in json.load(f)["evidence"]}
86
+ questions = list(questions)
87
+ line_count = len(questions)
88
+ if update_fn:
89
+ update_fn(step_id, f"{line_count} relevant queries are generated, for example:\n"
90
+ f"&nbsp;&nbsp;&nbsp;&nbsp;1. {questions[0]}\n"
91
+ f"&nbsp;&nbsp;&nbsp;&nbsp;2. {questions[1]}\n"
92
+ f"&nbsp;&nbsp;&nbsp;&nbsp;3. {questions[2]}\n"
93
+ f"&nbsp;&nbsp;&nbsp;&nbsp;4. {questions[3]}\n"
94
+ f"&nbsp;&nbsp;&nbsp;&nbsp;5. {questions[4]}")
95
+ step_id+=1
96
+
97
+ else:
98
+ claim_json_path = None
99
+ initial_scraped_output_path = None
100
+ initial_tsv_file = None
101
+ hero_output_dir = None
102
+ qa_file_path = hf_hub_download(
103
+ repo_id="PledgeTracker/demo_feedback",
104
+ filename="manifesto_with_QA_icl_top_k_qa.json",
105
+ repo_type="dataset",
106
+ token=os.environ["HF_TOKEN"]
107
+ )
108
+ idx = suggestion_meta["index"]
109
+ qa_lines = open(f"{qa_file_path}","r").readlines()[idx]
110
+ questions = {line["question"] for line in json.loads(qa_lines)["evidence"]}
111
+ questions = list(questions)
112
+ line_count = len(questions)
113
+ if update_fn:
114
+ update_fn(step_id, f"relevant queries are generated, for example:\n"
115
+ f"&nbsp;&nbsp;&nbsp;&nbsp;1. {questions[0]}\n"
116
+ f"&nbsp;&nbsp;&nbsp;&nbsp;2. {questions[1]}\n"
117
+ f"&nbsp;&nbsp;&nbsp;&nbsp;3. {questions[2]}\n"
118
+ f"&nbsp;&nbsp;&nbsp;&nbsp;4. {questions[3]}\n"
119
+ f"&nbsp;&nbsp;&nbsp;&nbsp;5. {questions[4]}")
120
+ step_id+=1
121
+
122
+ try:
123
+ augmented_tsv_file = run_augmented_searching(
124
+ qa_file=qa_file_path,
125
+ pledge_author=pledge_author,
126
+ pledge_date=pledge_date,
127
+ pipeline_base_dir=pipeline_base_dir,
128
+ start_date=start_date,
129
+ suggestion_meta=suggestion_meta,
130
+ end_date="",
131
+ )
132
+
133
+
134
+
135
+ with open(augmented_tsv_file, "r", encoding="utf-8") as f:
136
+ line_count = sum(1 for line in f)
137
+ if update_fn:
138
+ update_fn(step_id, f"{line_count} URLs are retrieved")
139
+ step_id+=1
140
+ except Exception as e:
141
+ if update_fn:
142
+ update_fn(step_id, f"❌ run_augmented_searching failed: {e}")
143
+ raise
144
+
145
+ augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
146
+ os.makedirs(augmented_data_store_dir, exist_ok=True)
147
+
148
+ try:
149
+ augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
150
+ run_scraper(augmented_tsv_file, augmented_scraped_output_path)
151
+
152
+ with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
153
+ line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
154
+ if update_fn:
155
+ update_fn(step_id, f"{line_count} URL pages have been successefully scraped")
156
+ step_id+=1
157
+ except Exception as e:
158
+ if update_fn:
159
+ update_fn(step_id, f"❌ run_scraper failed: {e}")
160
+ raise
161
+
162
+
163
+ try:
164
+ run_hero_reranking(pipeline_base_dir, suggestion_meta)
165
+
166
+ meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
167
+ all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
168
+ unique_urls = set()
169
+ with open(all_info_path, "r", encoding="utf-8") as f:
170
+ for line in f:
171
+ data = json.loads(line)
172
+ docs = data.get("evidence", [])
173
+ for doc in docs:
174
+ if "url" in doc:
175
+ unique_urls.add(doc["url"])
176
+ if update_fn:
177
+ update_fn(step_id, f"{len(unique_urls)} documents are selected")
178
+ step_id+=1
179
+ except Exception as e:
180
+ if update_fn:
181
+ update_fn(step_id, f"❌ run_hero_reranking failed: {e}")
182
+ raise
183
+
184
+ try:
185
+ extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)
186
+
187
+ events_num = count_total_events(extracted_event_path)
188
+
189
+ if update_fn:
190
+ update_fn(step_id, f"{events_num} events are extracted from those documents.")
191
+ step_id+=1
192
+ except Exception as e:
193
+ if update_fn:
194
+ update_fn(step_id, f"❌ Event extraction failed: {e}")
195
+ raise
196
+
197
+
198
+ print("πŸ“… Sorting events temporally ...")
199
+
200
+
201
+ sorted_events = extract_and_sort_events(
202
+ data_dir=pipeline_base_dir,
203
+ pledge_date=pledge_date,
204
+ pledge_author=pledge_author,
205
+ claim=claim,
206
+ suggestion_meta=suggestion_meta
207
+ )
208
+
209
+ df = pd.DataFrame(sorted_events)
210
+ sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
211
+ df.to_excel(sorted_event_path, index=False)
212
+ print(sorted_event_path)
213
+
214
+ if update_fn:
215
+ update_fn(step_id, "All done!")
216
+ step_id += 1
217
+
218
+ return {
219
+ "claim_json": claim_json_path,
220
+ "initial_scraped_jsonl": initial_scraped_output_path,
221
+ "initial_tsv_file": initial_tsv_file,
222
+ "hero_dir": hero_output_dir,
223
+ "augmented_scraped_jsonl": augmented_scraped_output_path,
224
+ "augmented_tsv_file": augmented_tsv_file,
225
+ "meta_data_dir": meta_data_dir,
226
+ "unsorted_events": extracted_event_path,
227
+ "sorted_events": sorted_event_path,
228
+ "step_id": step_id
229
+ }
230
+
231
+
232
+ if __name__ == "__main__":
233
+ start = time.time()
234
+
235
+ if os.environ.get("HF_TOKEN"):
236
+ login(token=os.environ["HF_TOKEN"])
237
+ else:
238
+ print("No Hugging Face token found in environment variable HF_TOKEN.")
239
+
240
+ claim = "β€œWe will support families with children by introducing free breakfast clubs in every primary school”"
241
+ start_date = "20250504"
242
+ timestamp = "xxxxx"
243
+ user_id = "xxx"
244
+
245
+ outputs = run_pipeline(claim, time_start, timestamp, user_id)
246
+ print("🎯 Pipeline finished. Outputs:", outputs)
247
+ print(f"⏱️ Total time: {time.time() - start:.2f} seconds")
system/process_time.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import datetime
3
+ import re
4
+ import pandas as pd
5
+ import os, argparse
6
+ import random
7
+ import csv
8
+ from openai import OpenAI
9
+ from huggingface_hub import hf_hub_download
10
+ import json
11
+ import os
12
+
13
+
14
+
15
+ def gpt_4o_useful(input):
16
+ client=OpenAI(api_key=os.environ.get("OAI"))
17
+ response = client.chat.completions.create(
18
+ model="gpt-4o",
19
+ messages=[
20
+ {
21
+ "role": "user",
22
+ "content": [
23
+ {
24
+ "type": "text",
25
+ "text": input
26
+ }
27
+ ]
28
+ }
29
+ ],
30
+ response_format={"type": "text"},
31
+ temperature=0.0000000001,
32
+ max_tokens=4096,
33
+ top_p=0,
34
+ frequency_penalty=0,
35
+ presence_penalty=0,
36
+ logprobs=True
37
+ )
38
+
39
+ text = response.choices[0].message.content
40
+
41
+ if response.choices[0].logprobs and response.choices[0].logprobs.content:
42
+ first_token_logprob = response.choices[0].logprobs.content[0]
43
+ token = first_token_logprob.token
44
+ logprob = first_token_logprob.logprob
45
+ else:
46
+ token = None
47
+ logprob = None
48
+
49
+ return text, token, logprob
50
+
51
+
52
+
53
+ def get_ICL(data, top_k=None):
54
+
55
+ ICL =""
56
+ if top_k == None:
57
+ data = data
58
+ else:
59
+ # print(data)
60
+ data = data[:top_k]
61
+ for line in data:
62
+ # line = json.loads(line)
63
+ pledge = line["pledge"]
64
+ event = line["event_description"]
65
+ time = line["event_date"]
66
+ input=f"Pledge: {pledge}\nEvent Summary: {event} (Event Date: {time})\nIs this event summary useful to track the fulfilment of this pledge"
67
+ input = input.strip()
68
+ output = line["label"].strip()
69
+ ICL = f"{ICL}Input: {input}\nOutput: {output}\n\n"
70
+ return ICL
71
+
72
+ def load_json(file_path):
73
+ with open(file_path, 'r', encoding='utf-8') as f:
74
+ data = json.load(f)
75
+ return data
76
+
77
+
78
+ def gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=None):
79
+
80
+ if suggestion_meta:
81
+ # print(ICL_id)
82
+
83
+ train_data = [line for line in train_data if str(line.get("pledge_id")) == str(ICL_id)]
84
+
85
+ else:
86
+ random.seed(42)
87
+ random.shuffle(train_data)
88
+
89
+ ICL = get_ICL(train_data, top_k=50)
90
+ # print(ICL)
91
+ input = f"{instruction}\nBelow are examples:\n\n{ICL}Now, please assign a label for the below instance.\nInput: {test_instance}\nOutput:"
92
+
93
+ try:
94
+ text, tokens, logprobs = gpt_4o_useful(input)
95
+ except Exception as e:
96
+ print(e)
97
+ tokens = None
98
+ logprobs = None
99
+
100
+ return tokens, logprobs
101
+
102
+ def extract_columns_to_dict(file_path, delimiter='\t'):
103
+
104
+ data_dict = {}
105
+
106
+ with open(file_path, mode='r', encoding='utf-8') as file:
107
+ reader = csv.reader(file, delimiter=delimiter)
108
+ for row in reader:
109
+ if len(row) >= 4:
110
+ key = row[2]
111
+ value = row[3]
112
+ data_dict[key] = value
113
+
114
+ return data_dict
115
+
116
+
117
+ import datetime
118
+ import re
119
+
120
+ def parse_date(date_str):
121
+ if not date_str:
122
+ return None, date_str
123
+ date_str = date_str.strip()
124
+
125
+ # Case 1: YYYY-MM-DD
126
+ try:
127
+ return datetime.datetime.strptime(date_str, "%Y-%m-%d"), date_str
128
+ except ValueError:
129
+ pass
130
+
131
+ # Case 2: Relative date
132
+ match = re.search(r'(.*) \(relative to (\d{4}-\d{2}-\d{2})\)', date_str)
133
+ if match:
134
+ reference = datetime.datetime.strptime(match.group(2), "%Y-%m-%d")
135
+ relative_term = match.group(1).strip().lower()
136
+ if relative_term == "last month":
137
+ target_date = reference - datetime.timedelta(days=30)
138
+ elif relative_term == "yesterday":
139
+ target_date = reference - datetime.timedelta(days=1)
140
+ elif relative_term == "last week":
141
+ target_date = reference - datetime.timedelta(days=7)
142
+ elif relative_term == "this week":
143
+ target_date = reference
144
+ else:
145
+ return None, date_str
146
+ return target_date, date_str
147
+
148
+ # Case 3: YYYY
149
+ match = re.fullmatch(r'(\d{4})', date_str)
150
+ if match:
151
+ year = int(match.group(1))
152
+ return datetime.datetime(year, 1, 1), date_str
153
+
154
+ # Case 4: Month YYYY
155
+ match = re.fullmatch(r'(\w+) (\d{4})', date_str)
156
+ if match:
157
+ try:
158
+ target_date = datetime.datetime.strptime(date_str, "%B %Y")
159
+ return target_date, date_str
160
+ except ValueError:
161
+ return None, date_str
162
+
163
+ # Case 5: YYYY-QX
164
+ match = re.fullmatch(r'(\d{4})-Q(\d)', date_str)
165
+ if match:
166
+ year, quarter = int(match.group(1)), int(match.group(2))
167
+ month = (quarter - 1) * 3 + 1
168
+ return datetime.datetime(year, month, 1), date_str
169
+
170
+ # Case 6: YYYY Season
171
+ match = re.fullmatch(r'(\d{4}) (Spring|Summer|Autumn|Fall|Winter)', date_str, re.IGNORECASE)
172
+ if match:
173
+ year = int(match.group(1))
174
+ season_map = {"spring": 3, "summer": 6, "autumn": 9, "fall": 9, "winter": 12}
175
+ month = season_map[match.group(2).lower()]
176
+ return datetime.datetime(year, month, 1), date_str
177
+
178
+ return None, date_str
179
+
180
+
181
+ def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggestion_meta):
182
+
183
+ events = []
184
+
185
+ # url_path = os.path.join(data_dir, "augmented_search_results.tsv")
186
+ # url_query_dict = extract_columns_to_dict(file_path=url_path, delimiter='\t')
187
+
188
+ pledge = claim.strip()
189
+
190
+ file_path = os.path.join(data_dir, "gpt4_event_extraction", "gpt4o_results_0_claim.json")
191
+ gpt4_results_json = load_json(file_path)
192
+
193
+ # print(gpt4_results_json)
194
+ train_file_path = hf_hub_download(
195
+ repo_id="PledgeTracker/demo_feedback",
196
+ filename="train_useful.json",
197
+ repo_type="dataset",
198
+ token=os.environ["HF_TOKEN"]
199
+ )
200
+
201
+ with open(train_file_path, "r", encoding="utf-8") as f:
202
+ train_data = json.load(f)
203
+ # print(train_data[0])
204
+
205
+
206
+
207
+ instruction_path = hf_hub_download(
208
+ repo_id="PledgeTracker/demo_feedback",
209
+ filename="instruction.txt",
210
+ repo_type="dataset",
211
+ token=os.environ["HF_TOKEN"]
212
+ )
213
+
214
+ instruction = open(instruction_path, "r").read()
215
+
216
+ map_file_path = hf_hub_download(
217
+ repo_id="PledgeTracker/demo_feedback",
218
+ filename="mapping.txt",
219
+ repo_type="dataset",
220
+ token=os.environ["HF_TOKEN"]
221
+ )
222
+ mapping_f = open(map_file_path, "r").readlines()
223
+ mapping = {}
224
+
225
+ for map_id, line in enumerate(mapping_f):
226
+ mapping[map_id] = int(line.strip())
227
+
228
+ ICL_id = None
229
+ if suggestion_meta:
230
+ try:
231
+ idx = int(suggestion_meta["index"])
232
+ ICL_id = mapping.get(idx)
233
+ print(f"[Suggestion] index: {idx} β†’ pledge_id: {ICL_id}")
234
+ except Exception as e:
235
+ print(f"[Mapping error]: {e}")
236
+
237
+ for doc in gpt4_results_json:
238
+ mete_date = doc["date"]
239
+ for event in doc.get("output", {}).get("events", []):
240
+ parsed_date, original_date = parse_date(event["date"])
241
+
242
+ if parsed_date:
243
+ parsed_date_str = parsed_date.strftime("%Y-%m-%d")
244
+ if parsed_date_str != mete_date:
245
+ event_date_and_pub_date = f"{parsed_date_str} ({mete_date})"
246
+ else:
247
+ event_date_and_pub_date = parsed_date_str
248
+
249
+ test_instance = f"Pledge: {pledge} (Speaker: {pledge_author}; Pledge Date: {pledge_date})\nEvent Summary: {event['event']} (Event Date: {original_date})\nIs this event summary useful to track the fulfilment of this pledge"
250
+
251
+ label, score = gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=ICL_id)
252
+
253
+ URL = doc["url"]
254
+ events.append({
255
+ "date": original_date,
256
+ "event date (publication date if different)": event_date_and_pub_date,
257
+ "event": event["event"],
258
+ "url": URL,
259
+ "label": label,
260
+ "confident": score,
261
+ })
262
+
263
+ events.sort(key=lambda x: parse_date(x["date"])[0], reverse=True)
264
+ return events
265
+
266
+
267
+
system/scraper.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from concurrent.futures import ThreadPoolExecutor, as_completed
2
+ import os
3
+ import csv
4
+ import json
5
+ import fitz
6
+ import time
7
+ import requests
8
+ import pandas as pd
9
+ from time import sleep
10
+ from pathlib import Path
11
+ from system.html2lines import url2lines, line_correction, html2metadata
12
+
13
+ MAX_RETRIES = 3
14
+ TIMEOUT = 5 # seconds
15
+
16
+
17
+ def scrape_text_from_url(url, temp_name):
18
+ response = None
19
+ for attempt in range(MAX_RETRIES):
20
+ try:
21
+ response = requests.get(url, timeout=TIMEOUT)
22
+ break
23
+ except requests.RequestException:
24
+ if attempt < MAX_RETRIES - 1:
25
+ sleep(3)
26
+
27
+ if response is None or response.status_code == 503:
28
+ return []
29
+
30
+ if url.endswith(".pdf"):
31
+ pdf_dir = Path("/tmp/pdf_dir")
32
+ pdf_dir.mkdir(parents=True, exist_ok=True)
33
+ pdf_path = pdf_dir / f"{temp_name}.pdf"
34
+ with open(pdf_path, "wb") as f:
35
+ f.write(response.content)
36
+
37
+ extracted_text = ""
38
+ doc = fitz.open(str(pdf_path))
39
+ for page in doc:
40
+ extracted_text += page.get_text() or ""
41
+
42
+ return line_correction(extracted_text.split("\n"))
43
+
44
+ return line_correction(url2lines(url))
45
+
46
+ def process_row(row, claim_id):
47
+ try:
48
+ url = row[2]
49
+ json_data = {
50
+ "claim_id": claim_id,
51
+ "type": row[1],
52
+ "query": row[3],
53
+ "url": url,
54
+ "url2text": scrape_text_from_url(url, claim_id),
55
+ "metadata": {}
56
+ }
57
+ meta = html2metadata(url)
58
+ json_data["metadata"] = {
59
+ "title": meta.get("title"),
60
+ "date": meta.get("date")
61
+ }
62
+ return json_data
63
+ except Exception as e:
64
+ print(f"[WARN] Failed to scrape {row[2]}: {e}")
65
+ return None
66
+
67
+ def run_scraper(tsv_file_path: str, output_jsonl_path: str, max_workers: int = 10):
68
+ claim_id = Path(tsv_file_path).stem
69
+ output_jsonl_path = Path(output_jsonl_path)
70
+ output_jsonl_path.parent.mkdir(parents=True, exist_ok=True)
71
+
72
+ if output_jsonl_path.exists():
73
+ print(f"[INFO] Skipping processing as output file already exists: {output_jsonl_path}")
74
+ return str(output_jsonl_path)
75
+
76
+ try:
77
+ df = pd.read_csv(tsv_file_path, sep="\t", header=None)
78
+ print("[INFO] Data loaded successfully with Pandas.")
79
+ except Exception as e:
80
+ raise RuntimeError(f"[ERROR] Failed to load TSV: {e}")
81
+
82
+ results = []
83
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
84
+ futures = [executor.submit(process_row, row, claim_id) for _, row in df.iterrows()]
85
+ for future in as_completed(futures):
86
+ result = future.result()
87
+ if result:
88
+ results.append(result)
89
+
90
+ with open(output_jsonl_path, "w", encoding="utf-8") as json_file:
91
+ for item in results:
92
+ json_file.write(json.dumps(item, ensure_ascii=False) + "\n")
93
+
94
+ print(f"[SYSTEM] Output saved to {output_jsonl_path}")
95
+ return str(output_jsonl_path)
test.html ADDED
@@ -0,0 +1,553 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
6
+ <title>Pledge Tracker – Demo</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ </head>
9
+ <body class="bg-gray-50 text-gray-800">
10
+ <header class="bg-white shadow py-4 sticky top-0 z-10">
11
+ <div class="container mx-auto flex items-center justify-between px-4">
12
+ <div class="flex items-center gap-2">
13
+ <span class="text-2xl font-bold text-purple-600">πŸ€—</span>
14
+ <span class="font-semibold text-lg">Pledge Tracking</span>
15
+ </div>
16
+ <nav class="hidden md:flex gap-6 font-medium">
17
+ <a class="hover:text-purple-600" href="#eval-response">Track Your Pledge</a>
18
+ <a class="hover:text-purple-600" href="#about">About</a>
19
+ </nav>
20
+ </div>
21
+ </header>
22
+
23
+ <section class="py-16 bg-gradient-to-r from-purple-50 to-purple-50 text-center">
24
+ <div class="container mx-auto px-4 max-w-3xl">
25
+ <h1 class="text-3xl md:text-4xl font-extrabold mb-6">
26
+ <span style="font-variant: small-caps; font-weight: bold;">PledgeTracker</span>: A System for Monitoring the Fulfilment of Pledges
27
+ </h1>
28
+ <div class="text-lg text-gray-600 leading-relaxed space-y-4 text-justify">
29
+ <p>
30
+ <span style="font-variant: small-caps;">PledgeTracker</span> is a system to monitor the fulfilment of political pledges. As part of this study, we will collect your inputs to help evaluate and improve the system. We may also collect your feedback if you submit it via the feedback form. No personal information will be collected, and all data will be anonymised and stored securely. By using the system, you agree to participate in this study under these conditions.
31
+ </p>
32
+ <p class="text-center">
33
+ Please contact
34
+ <a href="mailto:[email protected]" class="text-purple-600 underline">Andreas Vlachos</a>
35
+ and
36
+ <a href="mailto:[email protected]" class="text-purple-600 underline">Yulong Chen</a>
37
+ if you have any concerns.
38
+ </p>
39
+ </div>
40
+ </div>
41
+ </section>
42
+
43
+ <section id="eval-response" class="py-12">
44
+ <div class="container mx-auto px-4 max-w-4xl">
45
+ <!-- <h2 class="text-2xl font-bold mb-6">Track Manifesto Pledge</h2> -->
46
+ <label for="claim" class="block text-sm font-medium mb-2">
47
+ Please enter the pledge:
48
+ </label>
49
+ <textarea
50
+ id="claim"
51
+ class="w-full border rounded-lg p-3 h-40 focus:outline-none focus:ring-2 focus:ring-purple-500"
52
+ placeholder="For example: 'We will support families with children by introducing free breakfast clubs in every primary school...'"
53
+ ></textarea>
54
+
55
+ <div id="similar-suggestions" class="mt-3 text-sm text-gray-600 hidden"></div>
56
+
57
+ <div class="mt-4">
58
+ <label for="pledge-date" class="block text-sm font-medium mb-2">
59
+ When was this pledge made?
60
+ </label>
61
+ <div class="grid grid-cols-[1fr_auto] items-center gap-2">
62
+ <input
63
+ type="date"
64
+ id="pledge-date"
65
+ class="w-full border rounded-lg p-2"
66
+ />
67
+ <button
68
+ onclick="setDefaultDate()"
69
+ type="button"
70
+ class="px-2 py-1 text-sm bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
71
+ >
72
+ Use default: 4th Jul 2024
73
+ </button>
74
+ </div>
75
+ <div id="date-warning" class="text-sm text-red-600 mt-1 hidden">
76
+ Please select a date or click the button to use the default.
77
+ </div>
78
+ </div>
79
+
80
+ <div class="mt-4">
81
+ <label for="pledge-author" class="block text-sm font-medium mb-2">
82
+ Who made this pledge?
83
+ </label>
84
+ <div class="grid grid-cols-[1fr_auto] items-center gap-2">
85
+ <input
86
+ type="text"
87
+ id="pledge-author"
88
+ class="w-full border rounded-lg p-2"
89
+ placeholder="Enter the name of the party or person"
90
+ />
91
+ <button
92
+ onclick="setDefaultAuthor()"
93
+ type="button"
94
+ class="px-2 py-1 text-sm bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
95
+ >
96
+ Use default: Labour
97
+ </button>
98
+ </div>
99
+ <div id="author-warning" class="text-sm text-red-600 mt-1 hidden">
100
+ Please enter a speaker or click the button to use the default.
101
+ </div>
102
+ </div>
103
+
104
+
105
+
106
+ <label for="time-range" class="block text-sm font-medium mt-4 mb-2">
107
+ Please select a time range:
108
+ </label>
109
+ <select id="time-range" class="w-full border rounded-lg p-2">
110
+ <option value="week">Past one week</option>
111
+ <option value="month">Past one month</option>
112
+ <!-- <option value="year">From when the pledge was made</option> -->
113
+ <option value="since_pledge_date">From when the pledge was made</option>
114
+ </select>
115
+
116
+ <button
117
+ id="check"
118
+ class="mt-4 px-6 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
119
+ >
120
+ Let's track!
121
+ </button>
122
+
123
+ <div id="progress" class="mt-6 hidden border p-4 rounded-lg bg-white shadow">
124
+ <h3 class="font-semibold mb-2">System Progress</h3>
125
+ <div id="status" class="text-sm text-gray-800 font-normal leading-relaxed"></div>
126
+ </div>
127
+
128
+
129
+ <div id="result" class="mt-6 hidden border p-4 rounded-lg bg-white shadow">
130
+ <h3 class="font-semibold mb-2">Result</h3>
131
+ <p class="text-gray-700"></p>
132
+ </div>
133
+ </div>
134
+ </section>
135
+
136
+ <section id="about" class="py-12">
137
+ <div class="container mx-auto px-4 max-w-4xl">
138
+ <h2 class="text-2xl font-bold mb-6">About</h2>
139
+ <p class="text-gray-700 leading-relaxed">
140
+ <span style="font-variant: small-caps;">PledgeTracker</span> is a research prototype developed to support the monitoring of political pledge fulfilment.
141
+ This demo is developed by researchers at the University of Cambridge, Queen Mary University of London, and Full Fact.
142
+ </p>
143
+ </div>
144
+ </section>
145
+
146
+
147
+
148
+
149
+ <script>
150
+ let suggestedPledge = null;
151
+ let currentAbortController = null;
152
+ const feedbackData = {};
153
+ let lastUsedFile = null;
154
+ let lastUserId = null;
155
+ let lastTimestamp = null;
156
+ const checkBtn = document.getElementById("check");
157
+
158
+ const stepListStandard = {
159
+ 1: "Retrieving evidence related to the pledge",
160
+ 2: "Scraping documents from URLs",
161
+ 3: "Generating more queries based on the retrieved evidence",
162
+ 4: "Searching more articles",
163
+ 5: "Scraping documents from URLs",
164
+ 6: "Finding the most relevant documents",
165
+ 7: "Extracting events from top documents",
166
+ 8: "Sorting events temporally"
167
+ };
168
+
169
+ const stepListSuggestion = {
170
+ 1: "Generating queries to retrieve evidence",
171
+ 2: "Searching more articles",
172
+ 3: "Scraping documents from URLs",
173
+ 4: "Finding the most relevant documents",
174
+ 5: "Extracting events from top documents",
175
+ 6: "Sorting events temporally"
176
+ };
177
+
178
+ let stepList = stepListStandard;
179
+
180
+ function renderStatus(statusDict) {
181
+ let html = "<ul class='list-disc ml-6 space-y-1 text-sm'>";
182
+ for (let step in stepList) {
183
+ const raw = statusDict?.[step] || stepList[step];
184
+ const content = raw.replace(/\n/g, "<br>");
185
+ const prefix = statusDict?.[step] ? "βœ…" : "⏳";
186
+ html += `<li>${prefix} Step ${step}: ${content}</li>`;
187
+ }
188
+ html += "</ul>";
189
+ return html;
190
+ }
191
+
192
+ function setDefaultDate() {
193
+ const input = document.getElementById("pledge-date");
194
+ input.value = "2024-07-04";
195
+ document.getElementById("date-warning").classList.add("hidden");
196
+ }
197
+
198
+ function setDefaultAuthor() {
199
+ const input = document.getElementById("pledge-author");
200
+ input.value = "Labour";
201
+ document.getElementById("author-warning").classList.add("hidden");
202
+ }
203
+
204
+ // function setFeedback(index, answer) {
205
+ // feedbackData[index] = answer;
206
+ // const message = document.getElementById(`msg-${index}`);
207
+ // message.textContent = `βœ“ Selected: ${answer ? 'Yes' : 'No'}`;
208
+ // message.className = answer
209
+ // ? "text-sm text-green-600 mt-1"
210
+ // : "text-sm text-red-600 mt-1";
211
+ // }
212
+ function setFeedback(index, answer) {
213
+ feedbackData[index] = answer;
214
+ const message = document.getElementById(`msg-${index}`);
215
+
216
+ let displayText = "";
217
+ let colorClass = "";
218
+
219
+ switch(answer) {
220
+ case "not_relevant":
221
+ displayText = "Not relevant";
222
+ colorClass = "text-red-300";
223
+ break;
224
+ case "relevant_seen":
225
+ displayText = "Relevant but already seen";
226
+ colorClass = "text-grey-400";
227
+ break;
228
+ case "relevant_updated":
229
+ displayText = "Relevant and up-to-date";
230
+ colorClass = "text-blue-400";
231
+ break;
232
+ }
233
+
234
+ message.textContent = `βœ“ Selected: ${displayText}`;
235
+ message.className = `text-sm ${colorClass} mt-1`;
236
+ }
237
+
238
+ function pollStatus(userId, timestamp, statusElement) {
239
+ if (window.pollIntervalId) {
240
+ clearInterval(window.pollIntervalId);
241
+ }
242
+
243
+ window.pollIntervalId = setInterval(async () => {
244
+ try {
245
+ const res = await fetch(`/api/status?user_id=${userId}&timestamp=${timestamp}&_=${Date.now()}`);
246
+ const data = await res.json();
247
+
248
+ if (data.status) {
249
+ statusElement.innerHTML = renderStatus(data.status);
250
+ }
251
+
252
+ const values = Object.values(data.status || {});
253
+ const finalText = values.join(" ").toLowerCase();
254
+
255
+ if (finalText.includes("done") || finalText.includes("finished")) {
256
+ clearInterval(window.pollIntervalId);
257
+ window.pollIntervalId = null;
258
+ statusElement.innerHTML += `<div class="mt-2 text-green-600 font-semibold">βœ… All done.</div>`;
259
+ checkBtn.disabled = false;
260
+ checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
261
+
262
+ suggestedPledge = null;
263
+
264
+ const waitForFile = setInterval(() => {
265
+ if (lastUsedFile) {
266
+ clearInterval(waitForFile);
267
+ loadEvents(lastUsedFile);
268
+ }
269
+ }, 200);
270
+ } else if (Object.values(data.status || {}).some(v => v.startsWith("❌"))) {
271
+ clearInterval(window.pollIntervalId);
272
+ window.pollIntervalId = null;
273
+ statusElement.innerHTML += `<div class="mt-2 text-red-600 font-semibold">❌ The process failed.</div>`;
274
+ checkBtn.disabled = false;
275
+ checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
276
+ }
277
+ } catch (err) {
278
+ clearInterval(window.pollIntervalId);
279
+ window.pollIntervalId = null;
280
+ statusElement.innerHTML = `<div class="text-red-600">❌ Failed to check status: ${err.message}</div>`;
281
+ }
282
+ }, 2000);
283
+ }
284
+
285
+
286
+
287
+ async function submitAllFeedback() {
288
+ const entries = Object.entries(feedbackData);
289
+ if (entries.length === 0) {
290
+ alert("No feedback to submit!");
291
+ return;
292
+ }
293
+ const confirmed = confirm("By submitting feedback, you agree that your feedback may be collected for our analysis. Your data will be anonymised and stored securely. No personal information will be recorded. If you do not wish to take part, please cancel this submission.");
294
+ if (!confirmed) return;
295
+
296
+ const pledgeText = document.getElementById("claim").value.trim();
297
+
298
+ const res = await fetch('/api/feedback', {
299
+ method: 'POST',
300
+ headers: { 'Content-Type': 'application/json' },
301
+ body: JSON.stringify({
302
+ pledge: pledgeText,
303
+ file: lastUsedFile,
304
+ user_id: lastUserId,
305
+ timestamp: lastTimestamp,
306
+ feedback: entries.map(([index, answer]) => ({
307
+ eventIndex: index,
308
+ answer: answer
309
+ }))
310
+ })
311
+ });
312
+
313
+ alert(res.ok ? "βœ… Feedback submitted successfully!" : "❌ Submission failed.");
314
+ }
315
+
316
+ async function loadEvents(file) {
317
+ const resultBox = document.getElementById("result");
318
+ const p = resultBox.querySelector("p");
319
+ resultBox.classList.remove("hidden");
320
+
321
+ try {
322
+ const fileParam = encodeURIComponent(file);
323
+ const eventsRes = await fetch(`/api/events?file=${fileParam}`);
324
+ if (!eventsRes.ok) throw new Error("❌ Event file not found or malformed");
325
+ const data = await eventsRes.json();
326
+ if (!Array.isArray(data)) throw new Error("❌ Unexpected data format");
327
+
328
+ if (data.length === 0) {
329
+ p.innerHTML = `<div class="text-gray-500 italic"> Sorry, we do not find any progress for this pledge.</div>`;
330
+ return;
331
+ }
332
+ // p.innerHTML = `<strong>We have found ${data.length} events for this pledge.</strong><br><br>` +
333
+ // data.map((e, index) => `
334
+ p.innerHTML =
335
+ data.map((e, index) => `
336
+ <div class="mb-6 border-b pb-4">
337
+ πŸ—“οΈ <b>${e.date}</b>: ${e.event}<br>
338
+ πŸ”— <a href="${e.url}" target="_blank" class="text-purple-400 underline">Source</a>
339
+
340
+ <div class="mt-3">
341
+ <label class="block text-sm font-medium mb-2">How relevant is this event?</label>
342
+ <div class="flex flex-wrap gap-2">
343
+ <button onclick="setFeedback(${index}, 'not_relevant')"
344
+ class="px-3 py-1.5 bg-gray-100 hover:bg-gray-200 border border-gray-300 rounded-lg text-gray-700">
345
+ Not relevant
346
+ </button>
347
+ <button onclick="setFeedback(${index}, 'relevant_seen')"
348
+ class="px-3 py-1.5 bg-blue-100 hover:bg-blue-200 border border-blue-300 rounded-lg text-blue-700">
349
+ Relevant but seen
350
+ </button>
351
+ <button onclick="setFeedback(${index}, 'relevant_updated')"
352
+ class="px-3 py-1.5 bg-green-100 hover:bg-green-200 border border-green-300 rounded-lg text-green-700">
353
+ Relevant & up-to-date
354
+ </button>
355
+ </div>
356
+ <div id="msg-${index}" class="text-sm mt-1"></div>
357
+ </div>
358
+ </div>
359
+ `).join('') +
360
+ `<button onclick="submitAllFeedback()" class="mt-6 px-4 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700">
361
+ πŸ“€ Submit All Feedback
362
+ </button>
363
+ <button onclick="window.location.href='/download?file=${fileParam}'" class="mt-4 ml-4 px-4 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700">
364
+ πŸ“… Download Excel
365
+ </button>`;
366
+ } catch (err) {
367
+ p.textContent = `❌ Failed to load timeline: ${err.message}`;
368
+ }
369
+ }
370
+
371
+ let suggestTimer = null;
372
+ document.getElementById("claim").addEventListener("input", () => {
373
+ suggestedPledge = null;
374
+ clearTimeout(suggestTimer);
375
+ suggestTimer = setTimeout(fetchSuggestions, 300); // 300ms delay to avoid flooding
376
+ });
377
+
378
+ async function fetchSuggestions() {
379
+ const claimText = document.getElementById("claim").value.trim();
380
+ const suggestionBox = document.getElementById("similar-suggestions");
381
+
382
+ if (!claimText) {
383
+ suggestionBox.classList.add("hidden");
384
+ return;
385
+ }
386
+
387
+ const res = await fetch("/api/similar-pledges", {
388
+ method: "POST",
389
+ headers: { "Content-Type": "application/json" },
390
+ body: JSON.stringify({ claim: claimText })
391
+ });
392
+ const data = await res.json();
393
+ const suggestions = data.suggestions || [];
394
+
395
+ if (suggestions.length === 0) {
396
+ suggestionBox.classList.add("hidden");
397
+ } else {
398
+ const author = "Labour";
399
+ const date = "2024-07-04";
400
+ suggestionBox.innerHTML =
401
+ "<div class='font-semibold mb-1'>πŸ’‘ Are you fact-checking this pledge? </div>" +
402
+ "<ul class='list-disc ml-6 mt-1'>" +
403
+ suggestions.map(s => `
404
+ <li class="mb-2">
405
+ ${author}: ${s.text} (${date})
406
+ <button
407
+ onclick="useSuggestedPledge('${s.text.replace(/'/g, "\\'")}', ${s.index})"
408
+ class="ml-2 px-2 py-1 text-xs bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500">
409
+ Fact-check this pledge
410
+ </button>
411
+ </li>
412
+ `).join("") +
413
+ "</ul>";
414
+ suggestionBox.classList.remove("hidden");
415
+ }
416
+ }
417
+
418
+
419
+ checkBtn.addEventListener("click", async () => {
420
+ const claim = document.getElementById("claim").value.trim();
421
+ const pledgeDate = document.getElementById("pledge-date").value.trim();
422
+ const pledgeAuthor = document.getElementById("pledge-author").value.trim();
423
+ const statusElement = document.getElementById("status");
424
+ const resultBox = document.getElementById("result");
425
+ // resultBox.classList.remove("hidden");
426
+ const p = resultBox.querySelector("p");
427
+
428
+
429
+
430
+ let valid = true;
431
+ if (!claim) {
432
+ alert("Please enter the pledge text.");
433
+ valid = false;
434
+ }
435
+ if (!pledgeDate) {
436
+ document.getElementById("date-warning").classList.remove("hidden");
437
+ valid = false;
438
+ }
439
+ if (!pledgeAuthor) {
440
+ document.getElementById("author-warning").classList.remove("hidden");
441
+ valid = false;
442
+ }
443
+
444
+ if (!valid) return;
445
+
446
+ checkBtn.disabled = true;
447
+ checkBtn.classList.add("opacity-50", "cursor-not-allowed");
448
+
449
+ // document.getElementById("status").classList.remove("hidden");
450
+ // statusElement.innerHTML = renderStatus({});
451
+ // document.getElementById("result").classList.remove("hidden");
452
+ // document.getElementById("progress").classList.remove("hidden");
453
+
454
+ document.getElementById("status").innerHTML = "";
455
+ document.getElementById("result").classList.add("hidden");
456
+ document.getElementById("progress").classList.add("hidden");
457
+ document.getElementById("result").querySelector("p").innerHTML = "";
458
+ if (window.pollIntervalId) {
459
+ clearInterval(window.pollIntervalId);
460
+ window.pollIntervalId = null;
461
+ }
462
+ Object.keys(feedbackData).forEach(key => delete feedbackData[key]);
463
+ lastUsedFile = null;
464
+ lastUserId = null;
465
+ lastTimestamp = null;
466
+
467
+ // πŸ”„ 可δ»₯ι’„ε…ˆζ˜Ύη€Ίζη€Ί
468
+ document.getElementById("result").querySelector("p").textContent = "⏳ Please wait, checking...";
469
+ document.getElementById("progress").classList.remove("hidden");
470
+ document.getElementById("result").classList.remove("hidden");
471
+
472
+
473
+ try {
474
+ const timeRange = document.getElementById("time-range").value;
475
+ // const pledgeDate = document.getElementById("pledge-date").value;
476
+ // const pledgeAuthor = document.getElementById("pledge-author").value;
477
+ if (currentAbortController) currentAbortController.abort();
478
+ currentAbortController = new AbortController();
479
+ const signal = currentAbortController.signal;
480
+ let valid = true;
481
+
482
+ stepList = (suggestedPledge !== null) ? stepListSuggestion : stepListStandard;
483
+
484
+ if (!pledgeDate) {
485
+ document.getElementById("date-warning").classList.remove("hidden");
486
+ valid = false;
487
+ }
488
+ if (!pledgeAuthor) {
489
+ document.getElementById("author-warning").classList.remove("hidden");
490
+ valid = false;
491
+ }
492
+ if (!valid) return;
493
+
494
+ const userId = Math.random().toString(36).substring(2, 10);
495
+ const now = new Date();
496
+ const timestamp = now.toISOString().replace(/[:.]/g, "-").slice(0, 19);
497
+ statusElement.textContent = "";
498
+ // pollStatus(userId, timestamp, p);
499
+ pollStatus(userId, timestamp, document.getElementById("status"));
500
+
501
+
502
+ const runRes = await fetch("/api/run-model", {
503
+ method: "POST",
504
+ headers: { "Content-Type": "application/json" },
505
+ body: JSON.stringify({
506
+ claim,
507
+ time_range: timeRange,
508
+ pledge_date: pledgeDate,
509
+ pledge_author: pledgeAuthor,
510
+ user_id: userId,
511
+ timestamp: timestamp,
512
+ signal: signal,
513
+ suggestion_meta: suggestedPledge
514
+ })
515
+ });
516
+
517
+ const runData = await runRes.json();
518
+
519
+ lastUsedFile = runData.file;
520
+ lastUserId = runData.user_id;
521
+ lastTimestamp = runData.timestamp;
522
+ } catch (err) {
523
+ if (err.name === "AbortError") {
524
+ console.log("Previous request aborted.");
525
+ checkBtn.disabled = false;
526
+ checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
527
+ return;
528
+ }
529
+ p.textContent = `❌ Failed to load timeline: ${err.message}`;
530
+ }
531
+
532
+ });
533
+
534
+
535
+ async function useSuggestedPledge(text, index) {
536
+ document.getElementById("claim").value = text;
537
+ document.getElementById("pledge-author").value = "Labour";
538
+ document.getElementById("pledge-date").value = "2024-07-04";
539
+ suggestedPledge = { text, index };
540
+ alert("βœ… This pledge has been filled in. You can now click 'Let's track!'");
541
+ await fetch("/api/log-similar-selection", {
542
+ method: "POST",
543
+ headers: { "Content-Type": "application/json" },
544
+ body: JSON.stringify({
545
+ selected_text: text,
546
+ index: index
547
+ })
548
+ });
549
+ }
550
+
551
+ </script>
552
+ </body>
553
+ </html>