Spaces:

mohbay
/

searchcsv2

Running

File size: 10,189 Bytes

1641ca7
 
 
 
762dded
2f4967b
 
1641ca7
2f4967b
6acd5d2
 
2f4967b
 
1641ca7
 
b6b04c7
 
2f4967b
6acd5d2
 
 
89f676e
6acd5d2
 
 
89f676e
2f4967b
30cf47b
 
 
 
2e553d1
72b5d84
762dded
6dce45a
 
 
 
 
 
762dded
8a11400
 
6dce45a
61a6c42
6dce45a
 
 
 
2f4967b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
762dded
61a6c42
2f4967b
762dded
892da5a
 
 
762dded
 
 
892da5a
 
 
 
2f4967b
892da5a
 
 
 
61a6c42
892da5a
 
61a6c42
 
 
892da5a
762dded
 
2f4967b
 
 
 
 
 
 
6acd5d2
2f4967b
9e4540a
9e4b885
23cd5e3
2f4967b
6acd5d2
 
23cd5e3
2f4967b
23cd5e3
 
 
 
 
 
 
2f4967b
 
 
 
 
 
61a6c42
 
 
892da5a
2f4967b
 
 
 
 
 
 
 
 
 
 
 
 
 
892da5a
2f4967b
 
 
 
 
 
 
 
 
 
 
 
892da5a
2f4967b
 
 
 
 
 
 
61a6c42
2f4967b
 
 
 
61a6c42
2f4967b
 
 
 
61a6c42
2f4967b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61a6c42
2f4967b
 
 
 
 
 
 
 
892da5a
2f4967b
 
 
 
 
 
 
 
 
 
f46813a
2f4967b
f46813a
2f4967b
 
f46813a
2f4967b
 
 
 
 
 
f46813a
 
2f4967b
 
f46813a
2f4967b
 
f46813a
 
2f4967b
f46813a
 
 
 
2f4967b
 
 
f46813a
 
 
2f4967b
f46813a
2f4967b
 
 
 
762dded
c96f08a
61a6c42
23cd5e3
6c11a17
61a6c42
2f4967b
 
 
 
 
 
 
 
c96f08a
762dded
c96f08a
0430419
2f4967b
0430419
762dded
2f4967b
eacf3db
9e4540a
2f4967b
0430419
2f4967b

import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import gradio as gr
import re
from rank_bm25 import BM25Okapi
import numpy as np

# Two multilingual sentence encoders; predict() averages their cosine scores.
model = SentenceTransformer("distilbert-base-multilingual-cased")
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# The three question/link tables that are searched.
df, df2, df3 = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned1.csv", "cleaned2.csv", "cleaned3.csv")
)

# Pre-computed question embeddings. predict() compares the "*_1.pt" tensors
# with queries encoded by `model` ...
embeddings, embeddings2, embeddings3 = (
    torch.load(tensor_file)
    for tensor_file in ("embeddings1_1.pt", "embeddings2_1.pt", "embeddings3_1.pt")
)

# ... and these tensors with queries encoded by `modela`.
embeddingsa, embeddingsa2, embeddingsa3 = (
    torch.load(tensor_file)
    for tensor_file in ("embeddings1.pt", "embeddings2.pt", "embeddings3.pt")
)

# Column arrays used at query time (the third table stores its link under "url").
df_questions, df_links = df["question"].values, df["link"].values
df2_questions, df2_links = df2["question"].values, df2["link"].values
df3_questions, df3_links = df3["question"].values, df3["url"].values

# Words that arabic_word_tokenize() drops from its output.
ARABIC_STOPWORDS = {
    'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
    'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
    'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن'
}

# Characters deleted before word extraction so that decorated and plain
# spellings yield the same token: combining marks (harakat, Quranic
# annotation signs, superscript alef) and the tatweel elongation character.
_ARABIC_MARKS_RE = re.compile(
    r'[\u0610-\u061A\u0640\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]'
)

# Runs of two or more Arabic-block letters or digits.  Punctuation and symbols
# from the same Unicode block (comma U+060C, semicolon U+061B, question mark
# U+061F, full stop U+06D4, ...) are deliberately left out of the class so
# they act as separators instead of being glued onto the neighbouring word.
_ARABIC_WORD_RE = re.compile(
    r'[\u0620-\u063F\u0641-\u064A\u0660-\u0669\u066E\u066F'
    r'\u0671-\u06D3\u06D5\u06EE-\u06FC\u06FF]{2,}'
)


def arabic_word_tokenize(text):
    """Split *text* into Arabic word tokens suitable for lexical matching.

    Combining marks and tatweel are removed first; then every run of at
    least two Arabic letters/digits becomes a token.  Tokens listed in
    ``ARABIC_STOPWORDS`` are discarded.

    Args:
        text: The string to tokenize.  Any non-string value (for example a
            NaN read from a CSV cell) yields an empty list.

    Returns:
        A list of tokens in order of appearance (duplicates preserved).
    """
    if not isinstance(text, str):
        return []
    stripped = _ARABIC_MARKS_RE.sub('', text)
    return [
        token
        for token in _ARABIC_WORD_RE.findall(stripped)
        if token not in ARABIC_STOPWORDS
    ]

def prepare_bm25_corpus(questions):
    """Tokenize every question and return the token lists in input order.

    The result is a list with one entry per element of *questions*; each
    entry is whatever ``arabic_word_tokenize`` returns for that element.
    """
    return [arabic_word_tokenize(entry) for entry in questions]

# Build one tokenized corpus and one BM25 index per dataset at import time.
print("Initializing BM25 models...")
bm25_corpus1, bm25_corpus2, bm25_corpus3 = (
    prepare_bm25_corpus(question_column)
    for question_column in (df_questions, df2_questions, df3_questions)
)
bm25_model1, bm25_model2, bm25_model3 = (
    BM25Okapi(tokenized)
    for tokenized in (bm25_corpus1, bm25_corpus2, bm25_corpus3)
)
print("BM25 models initialized!")

def compute_bm25_scores(query, bm25_model):
    """Score every indexed document against *query* with BM25.

    Args:
        query: Raw query text; it is tokenized with ``arabic_word_tokenize``.
        bm25_model: A fitted ``rank_bm25`` model (e.g. ``BM25Okapi``).

    Returns:
        A 1-D NumPy array with one score per indexed document.  When the
        query yields no tokens (empty, or no Arabic words) the array is all
        zeros.
    """
    query_tokens = arabic_word_tokenize(query)
    if not query_tokens:
        # rank_bm25 models keep the document count in `corpus_size`; they do
        # not store the corpus itself, so there is no `.corpus` to measure.
        return np.zeros(bm25_model.corpus_size)

    return bm25_model.get_scores(query_tokens)

def compute_word_overlap(query, questions):
    """Score each question by how many distinct query words it shares.

    Both sides are tokenized with ``arabic_word_tokenize`` and reduced to
    sets.  The score blends query coverage (shared / query words, weight 0.7)
    with Jaccard similarity (shared / union, weight 0.3).

    Returns:
        A list of floats, one per element of *questions*.  Every score is
        0.0 when the query has no tokens; a question with no tokens scores
        0.0.
    """
    query_terms = set(arabic_word_tokenize(query))
    if not query_terms:
        return [0.0] * len(questions)

    def score_against(candidate):
        candidate_terms = set(arabic_word_tokenize(candidate))
        if not candidate_terms:
            return 0.0

        shared = len(query_terms & candidate_terms)
        combined = len(query_terms | candidate_terms)

        # Intersection over union.
        jaccard = shared / combined if combined > 0 else 0.0
        # Fraction of the query's words found in the candidate.
        coverage = shared / len(query_terms)

        # Coverage dominates; Jaccard penalises long, loosely related text.
        return 0.7 * coverage + 0.3 * jaccard

    return [score_against(candidate) for candidate in questions]

def normalize_scores(scores):
    """Min-max scale *scores* into the range [0, 1].

    Args:
        scores: Any array-like of numbers (list, NumPy array, ...).

    Returns:
        A new NumPy array of the same shape.  If every value is identical
        there is no spread to scale, so an all-zero array is returned.  Empty
        input yields an empty array rather than raising.
    """
    scores = np.asarray(scores)
    if scores.size == 0:
        # np.min / np.max raise ValueError on zero-size arrays.
        return np.zeros_like(scores)

    lowest = np.min(scores)
    spread = np.max(scores) - lowest
    if spread == 0:
        return np.zeros_like(scores)
    return (scores - lowest) / spread

def _select_weights(query_length):
    """Return (semantic, bm25, word_overlap) weights for a query of that many tokens."""
    if query_length <= 2:
        # Short queries: prioritize exact matches (BM25 + word overlap).
        return 0.3, 0.4, 0.3
    if query_length <= 5:
        # Medium queries: balanced approach.
        return 0.4, 0.35, 0.25
    # Long queries: prioritize semantic understanding.
    return 0.5, 0.3, 0.2


def _combine_scores(questions, links, norm_semantic, norm_bm25, norm_word, weights):
    """Build one result record per question from its three normalized scores."""
    semantic_weight, bm25_weight, word_weight = weights
    combined_results = []

    for i, question in enumerate(questions):
        # Plain Python floats keep the records JSON-serializable.
        semantic_score = float(norm_semantic[i])
        bm25_score = float(norm_bm25[i])
        word_score = float(norm_word[i])

        combined_score = (semantic_weight * semantic_score +
                          bm25_weight * bm25_score +
                          word_weight * word_score)

        # Reward results that are strong on more than one signal.
        high_performance_count = sum((
            semantic_score > 0.7,
            bm25_score > 0.7,
            word_score > 0.5,
        ))
        if high_performance_count >= 2:
            boost = 0.1
        elif high_performance_count >= 1:
            boost = 0.05
        else:
            boost = 0.0

        combined_results.append({
            "question": question,
            "link": links[i],
            "semantic_score": semantic_score,
            "bm25_score": bm25_score,
            "word_overlap_score": word_score,
            "combined_score": combined_score + boost,
        })

    return combined_results


def _get_diverse_top_results(combined_results, top_k=5):
    """Pick the top 3 by combined score plus the best unused BM25 and semantic hits."""
    by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
    final_results = by_combined[:3]

    # Question texts already selected; later picks must differ from these.
    used_questions = {item["question"] for item in final_results}

    # max() returns the first maximal item it meets, which is the same record
    # a stable descending sort would place first, without sorting everything.
    for score_key in ("bm25_score", "semantic_score"):
        pick = max(
            (item for item in combined_results if item["question"] not in used_questions),
            key=lambda x: x[score_key],
            default=None,
        )
        if pick is not None:
            final_results.append(pick)
            used_questions.add(pick["question"])

    return final_results[:top_k]


def predict(text):
    """Search the three question datasets for entries matching *text*.

    For each dataset the query is scored three ways (averaged cosine
    similarity from the two sentence encoders, BM25, and word overlap); each
    score vector is min-max normalized and the three are blended with weights
    chosen from the number of query tokens.

    Args:
        text: The user's search query.

    Returns:
        The string ``"No query provided"`` for an empty/blank query.
        Otherwise a dict with keys ``"top2"``, ``"top3"``, ``"top1"`` (each a
        list of up to five result records with ``question``, ``link`` and the
        individual/combined scores) and ``"query_info"`` (token count and the
        weights that were applied).
    """
    print(f"Received query: {text}")
    if not text or text.strip() == "":
        return "No query provided"

    # Encode the query once per model; the vectors are reused for every dataset.
    query_embedding = model.encode(text, convert_to_tensor=True)
    query_embeddinga = modela.encode(text, convert_to_tensor=True)

    # Adaptive weighting based on the number of (non-stopword) query tokens.
    query_length = len(arabic_word_tokenize(text))
    weights = _select_weights(query_length)
    semantic_weight, bm25_weight, word_weight = weights

    # Listed in the order the keys appear in the response.
    datasets = (
        ("top2", df2_questions, df2_links, embeddings2, embeddingsa2, bm25_model2),
        ("top3", df3_questions, df3_links, embeddings3, embeddingsa3, bm25_model3),
        ("top1", df_questions, df_links, embeddings, embeddingsa, bm25_model1),
    )

    results = {}
    for key, questions, links, doc_emb, doc_emba, bm25_model in datasets:
        # Cosine similarity, averaged over the two encoders.
        sim_scores = (util.pytorch_cos_sim(query_embedding, doc_emb)[0] +
                      util.pytorch_cos_sim(query_embeddinga, doc_emba)[0]) / 2

        # Normalize every signal to [0, 1] so the weights are comparable.
        norm_sim = normalize_scores(sim_scores.cpu().numpy())
        norm_bm25 = normalize_scores(compute_bm25_scores(text, bm25_model))
        norm_word = normalize_scores(compute_word_overlap(text, questions))

        combined = _combine_scores(questions, links, norm_sim, norm_bm25, norm_word, weights)
        results[key] = _get_diverse_top_results(combined)

    results["query_info"] = {
        "query_length": query_length,
        "weights": {
            "semantic": semantic_weight,
            "bm25": bm25_weight,
            "word_overlap": word_weight
        }
    }

    return results

title = "Enhanced Search with BM25"

# Web UI: a three-line text box feeds predict(); its return value is shown as JSON.
iface = gr.Interface(
    title=title,
    description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring",
    fn=predict,
    inputs=[gr.Textbox(label="Search Query", lines=3)],
    outputs='json',
)

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()