Update app.py

app.py CHANGED
@@ -18,7 +18,6 @@ embeddingsa = torch.load("embeddings1.pt")
 embeddingsa2 = torch.load("embeddings2.pt")
 embeddingsa3 = torch.load("embeddings3.pt")
 
-# Pre-extract DataFrame columns to avoid repeated iloc calls
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values

@@ -31,7 +30,6 @@ def arabic_word_tokenize(text):
         return []
     return re.findall(r'\w+', text)
 
-
 def compute_word_overlap(query, questions):
     query_words = set(arabic_word_tokenize(query))
     overlaps = []

@@ -47,115 +45,68 @@ def compute_word_overlap(query, questions):
 def predict(text):
     if not text or text.strip() == "":
         return "No query provided"
-
+
     query_embedding = model.encode(text, convert_to_tensor=True)
     query_embeddinga = modela.encode(text, convert_to_tensor=True)
-
-
-
-
-
-
-    sim_scores3 = util.pytorch_cos_sim(query_embedding, embeddings3)[0]
-
-
-
-    all_sim_scores3.append(sim_scores3)
-
-    sim_scores1a = util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]
-    sim_scores2a = util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]
-    sim_scores3a = util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]
-
-    all_sim_scores1.append(sim_scores1a)
-    all_sim_scores2.append(sim_scores2a)
-    all_sim_scores3.append(sim_scores3a)
-
-    sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
-    sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
-    sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
-
-    # Compute word overlap scores
+
+    # Cosine similarities
+    sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
+                   util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
+    sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
+                   util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
+    sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
+                   util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
+
+    # Word overlaps
     word_overlap1 = compute_word_overlap(text, df_questions)
-
     word_overlap2 = compute_word_overlap(text, df2_questions)
     word_overlap3 = compute_word_overlap(text, df3_questions)
 
-
-    weight = 0.5 # word overlap weight
-    combined_results = []
+    weight = 0.4
 
-
-
-
+    # Collect top1
+    combined1 = [
+        {
             "question": df_questions[i],
             "link": df_links[i],
-            "cosine_score": float(
+            "cosine_score": float(sim_scores1[i].cpu().item()),
             "word_overlap_score": float(word_overlap1[i]),
-            "combined_score":
-        }
-
-
-
-
+            "combined_score": float(sim_scores1[i].cpu().item()) + weight * word_overlap1[i]
+        }
+        for i in range(len(df_questions))
+    ]
+    top1 = sorted(combined1, key=lambda x: x["combined_score"], reverse=True)[:3]
+
+    # Collect top2
+    combined2 = [
+        {
             "question": df2_questions[i],
             "link": df2_links[i],
-            "cosine_score": float(
+            "cosine_score": float(sim_scores2[i].cpu().item()),
             "word_overlap_score": float(word_overlap2[i]),
-            "combined_score":
-        }
-
-
-
-
+            "combined_score": float(sim_scores2[i].cpu().item()) + weight * word_overlap2[i]
+        }
+        for i in range(len(df2_questions))
+    ]
+    top2 = sorted(combined2, key=lambda x: x["combined_score"], reverse=True)[:3]
+
+    # Collect top3
+    combined3 = [
+        {
             "question": df3_questions[i],
             "link": df3_links[i],
-            "cosine_score": float(
+            "cosine_score": float(sim_scores3[i].cpu().item()),
             "word_overlap_score": float(word_overlap3[i]),
-            "combined_score":
-        }
-
-
-
-
-    # Also keep your original top1/top2/top3 as is
-    top3_scores1, top3_idx1 = sim_scores1.topk(3)
-    top3_scores2, top3_idx2 = sim_scores2.topk(3)
-    top3_scores3, top3_idx3 = sim_scores3.topk(3)
-
-    top3_idx1_cpu = top3_idx1.cpu().numpy()
-    top3_idx2_cpu = top3_idx2.cpu().numpy()
-    top3_idx3_cpu = top3_idx3.cpu().numpy()
-
-    top3_scores1_cpu = top3_scores1.cpu().numpy()
-    top3_scores2_cpu = top3_scores2.cpu().numpy()
-    top3_scores3_cpu = top3_scores3.cpu().numpy()
+            "combined_score": float(sim_scores3[i].cpu().item()) + weight * word_overlap3[i]
+        }
+        for i in range(len(df3_questions))
+    ]
+    top3 = sorted(combined3, key=lambda x: x["combined_score"], reverse=True)[:3]
 
     results = {
-        "top1":
-
-
-                "link": df_links[idx],
-                "score": float(score)
-            }
-            for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
-        ],
-        "top2": [
-            {
-                "question": df2_questions[idx],
-                "link": df2_links[idx],
-                "score": float(score)
-            }
-            for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
-        ],
-        "top3": [
-            {
-                "question": df3_questions[idx],
-                "link": df3_links[idx],
-                "score": float(score)
-            }
-            for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
-        ],
-        "top3_combined": top3_combined
+        "top1": top1,
+        "top2": top2,
+        "top3": top3
     }
 
     return results
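For reference, the updated predict() ranks each dataset by a single combined score: the cosine similarity averaged over the two encoders plus 0.4 times the word-overlap score, then takes the three highest entries. A minimal standalone sketch of just that ranking step, with made-up scores and question titles (the real app derives them from the loaded embeddings and DataFrames), is:

import torch

# Hypothetical stand-ins for values computed inside predict():
# averaged cosine similarities and word-overlap scores for four questions.
sim_scores = torch.tensor([0.82, 0.41, 0.77, 0.12])
word_overlap = [0.50, 0.00, 0.25, 0.00]
questions = ["q0", "q1", "q2", "q3"]

weight = 0.4  # same word-overlap weight as the updated app.py
combined = [
    {
        "question": questions[i],
        "cosine_score": float(sim_scores[i].item()),
        "word_overlap_score": float(word_overlap[i]),
        "combined_score": float(sim_scores[i].item()) + weight * word_overlap[i],
    }
    for i in range(len(questions))
]
top = sorted(combined, key=lambda x: x["combined_score"], reverse=True)[:3]
print([r["question"] for r in top])  # questions with the highest combined scores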