File size: 9,415 Bytes
1ab0e96
 
 
6edb192
1ab0e96
cac610a
1ab0e96
 
5e7cee3
1ab0e96
cac610a
 
5e7cee3
 
70402ce
5e7cee3
1ab0e96
5e7cee3
 
 
 
 
 
 
 
1ab0e96
5e7cee3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0df4a5
5e7cee3
 
 
 
 
 
 
 
 
 
 
 
 
cac610a
5e7cee3
 
 
 
 
 
 
cac610a
5e7cee3
 
 
 
cac610a
5e7cee3
 
 
 
 
cac610a
5e7cee3
1ab0e96
 
16e31d5
aa7e489
 
 
 
 
 
 
 
 
7aaca4e
63a076a
 
 
aa7e489
63a076a
2caa894
2b0fa65
2caa894
c316e12
0799950
7406deb
 
 
0799950
63a076a
8708c1c
 
 
 
 
 
 
 
 
 
7406deb
 
 
2b0fa65
 
 
63a076a
 
8708c1c
2b0fa65
7406deb
 
2b0fa65
 
 
63a076a
 
1ab0e96
63a076a
 
 
1ab0e96
 
44957d3
 
 
 
 
 
 
 
 
1ab0e96
 
 
 
63a076a
 
 
 
731300a
 
 
 
 
 
bab9b92
731300a
63a076a
70402ce
731300a
 
5e7cee3
2caa894
5e7cee3
cac610a
5e7cee3
16e31d5
70402ce
5e7cee3
 
63a076a
5e7cee3
 
 
 
70402ce
 
63a076a
 
70402ce
63a076a
 
 
5e7cee3
cac610a
63a076a
1ab0e96
5e7cee3
 
63a076a
1ab0e96
63a076a
1ab0e96
63a076a
 
 
5e7cee3
 
 
 
e0df4a5
 
63a076a
a083328
63a076a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94fc749
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import streamlit as st
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
import requests
import os

# Environment variables
# api_key: Hugging Face Inference API token, consumed by build_model below.
api_key = os.getenv("HF_API_KEY")
# RAPIDAPI_KEY: key for the youtube-transcript3 RapidAPI service; stripped
# because a trailing newline/space in the env var would break the request header.
RAPIDAPI_KEY = (os.getenv("RAPIDAPI_KEY") or "").strip()

# Surface a visible error in the UI when the key is missing.
# NOTE(review): execution continues past this point — presumably intentional
# so the page still renders; later API calls will fail without the key.
if not RAPIDAPI_KEY:
    st.error("RAPIDAPI_KEY not set")

# Check available languages via RapidAPI
@st.cache_data
def get_available_languages(video_id):
    """Return transcript languages available for *video_id* via RapidAPI.

    Args:
        video_id: The 11-character YouTube video ID.

    Returns:
        A list of ``(code, label)`` tuples, e.g. ``("en", "English (en)")``.
        Falls back to a static list of common languages when the HTTP call
        fails, returns a non-200 status, or the payload has an unexpected
        shape.
    """
    # Single source of truth for the fallback list — the original duplicated
    # this 8-entry list verbatim in both the non-200 path and the except path.
    fallback = [
        ("en", "English (en)"),
        ("hi", "Hindi (hi)"),
        ("es", "Spanish (es)"),
        ("fr", "French (fr)"),
        ("de", "German (de)"),
        ("ja", "Japanese (ja)"),
        ("pt", "Portuguese (pt)"),
        ("ru", "Russian (ru)"),
    ]
    url = "https://youtube-transcript3.p.rapidapi.com/api/languages"
    querystring = {"videoId": video_id}
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
    }
    try:
        response = requests.get(url, headers=headers, params=querystring, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get("success") and "languages" in data:
                # Build (code, "Name (code)") pairs from the API payload.
                return [
                    (lang.get("code", ""), f"{lang.get('name', '')} ({lang.get('code', '')})")
                    for lang in data["languages"]
                ]
        # Non-200 status or unexpected payload: use the common-language fallback.
        return fallback
    except Exception as e:
        # Broad catch is deliberate: any network/JSON failure degrades to the
        # fallback list instead of breaking the page.
        st.warning(f"Could not fetch languages: {e}. Using common languages.")
        return fallback

# Transcript Fetcher
@st.cache_data
def get_transcript(video_id, language_code="en"):
    """Fetch the transcript for *video_id* in *language_code* via RapidAPI.

    Returns the transcript flattened into one space-joined string, or
    ``None`` when the request fails, the status is non-200, or the payload
    does not have the expected ``{"success": ..., "transcript": [...]}``
    shape. Errors are reported to the Streamlit UI as a side effect.
    """
    endpoint = "https://youtube-transcript3.p.rapidapi.com/api/transcript"
    params = {"videoId": video_id, "lang": language_code}
    request_headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
    }
    try:
        resp = requests.get(endpoint, headers=request_headers, params=params, timeout=10)
        # Guard clauses: bail out early on HTTP failure or a malformed payload.
        if resp.status_code != 200:
            st.error(f"API Error: {resp.status_code}")
            return None
        payload = resp.json()
        if not (payload.get("success") and "transcript" in payload):
            st.warning("Unexpected API response format")
            return None
        # Flatten the list of caption segments into a single string.
        return ' '.join(segment.get('text', '') for segment in payload["transcript"])
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None

# Vector Store
# NOTE: @st.cache_resource, not @st.cache_data — the FAISS index and the
# loaded embedding model are unserializable global resources; cache_data
# would attempt to pickle and copy the return value on every cache hit,
# which Streamlit's docs reserve for plain serializable data.
@st.cache_resource
def create_vector_store(transcript):
    """Split *transcript* into overlapping chunks and index them in FAISS.

    Args:
        transcript: Full transcript text as a single string.

    Returns:
        A FAISS vector store over 1000-char chunks (200-char overlap),
        embedded with intfloat/multilingual-e5-base on CPU.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.create_documents([transcript])
    embeddings = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-base",
        model_kwargs={"device": "cpu"}
    )
    return FAISS.from_documents(docs, embeddings)

# -------------------------------------------------
# 3️⃣ Model Builder
# -------------------------------------------------

# UI model choice -> Hugging Face Hub repo id. One table replaces four
# copy-pasted branches that differed only in repo_id.
_MODEL_REPOS = {
    "Llama-3.2-1B": "meta-llama/Llama-3.2-1B-Instruct",
    "Gemma-2-3B": "google/gemma-2-2b-it",
    "DeepSeek-685B": "deepseek-ai/DeepSeek-V3.2-Exp",
    "OpenAI-20B": "openai/gpt-oss-20b",
}


def build_model(model_choice, temperature=0.7):
    """Return ``(model, is_chat)`` for *model_choice*.

    Args:
        model_choice: One of the keys of ``_MODEL_REPOS``.
        temperature: Sampling temperature (UI supplies 0.0-1.0).

    Returns:
        Tuple of the chat-wrapped model and ``True`` — every supported
        model here is chat-based.

    Raises:
        ValueError: On an unknown *model_choice*. The original implicitly
        returned ``None``, which crashed later at tuple unpacking with an
        opaque TypeError.
    """
    try:
        repo_id = _MODEL_REPOS[model_choice]
    except KeyError:
        raise ValueError(f"Unknown model choice: {model_choice!r}") from None
    llm = HuggingFaceEndpoint(
        repo_id=repo_id,
        huggingfacehub_api_token=api_key,
        task="text-generation",
        max_new_tokens=500,
        # Consistency fix: the original passed temperature to the endpoint
        # only in the Llama branch; now every model gets it.
        temperature=temperature
    )
    return ChatHuggingFace(llm=llm, temperature=temperature), True


# -------------------------------------------------
# 4️⃣ Prompt Template
# -------------------------------------------------
# RAG answering prompt. Contract enforced on the model: (1) answer from the
# retrieved transcript context; (2) if the topic is absent from the context,
# say so explicitly, then fall back to the model's own knowledge; (3) if
# neither suffices, answer "I don't know." Filled in with the retrieved
# chunks ({context}) and the user's question ({question}) at invoke time.
prompt_template = PromptTemplate(
    template=(
        "You are a helpful assistant.\n\n"
        "Answer the question using the context provided below.\n"
        "If the context does not mention the topic, say clearly: 'There is no mention of the topic in the video you provided.'\n"
        "Then, based on your own knowledge, try to answer the question.\n"
        "If both the context and your knowledge are insufficient, say: 'I don't know.'\n\n"
        "Keep the answer format neat, clean, and human-readable.\n\n"
        "Context:\n{context}\n\n"
        "Question:\n{question}"

    ),
    input_variables=["context", "question"]
)


# -------------------------------------------------
# 5️⃣ Streamlit App UI
# -------------------------------------------------
import re

def extract_video_id(url: str) -> "str | None":
    """Extract the 11-character YouTube video ID from *url*.

    Handles standard watch URLs (``youtube.com/watch?v=ID``), short links
    (``youtu.be/ID``), and a bare 11-character video ID pasted directly.
    The bare-ID case matters because the app's URL text input defaults to
    a bare ID, which the original pattern rejected (it required ``v=`` or
    ``/`` before the ID). The return annotation is also corrected: the
    original claimed ``str`` but returned ``None`` on no match.
    """
    # Bare video ID pasted as-is (exactly 11 ID-alphabet characters).
    if re.fullmatch(r"[0-9A-Za-z_-]{11}", url):
        return url
    # Otherwise look for the ID after "v=" or a path slash.
    match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", url)
    return match.group(1) if match else None
    
st.title("🎬 YouTube Transcript Chatbot (RAG)")

# --- User inputs ---
# NOTE(review): the default value is a bare video ID, not a full URL —
# confirm extract_video_id accepts it, otherwise the default input parses to None.
video_url = st.text_input("Enter YouTube Video URL", value="lv1_-RER4_I")
video_id = extract_video_id(video_url)
query = st.text_area("Your Query", value="What is RAG?")
model_choice = st.radio("Model to Use", ["Llama-3.2-1B", "Gemma-2-3B", "DeepSeek-685B", "OpenAI-20B"])
# Slider operates in integer percent; divided down to the 0.0-1.0 range.
temperature = st.slider("Temperature", 0, 100, value=50) / 100.0

# Get available languages for this video
language_code = None  # stays None until the user picks a language below
if video_id:
    with st.spinner("Checking available languages..."):
        available_languages = get_available_languages(video_id)

    if available_languages:
        st.success(f"Found {len(available_languages)} language(s)")
        # Map display label -> language code so the selectbox shows labels
        # but we keep the code for the transcript API.
        lang_options = {label: code for code, label in available_languages}
        selected_label = st.selectbox("Select Language", options=list(lang_options.keys()))
        language_code = lang_options[selected_label]
    else:
        st.warning("No languages found for this video.")


# -------------------------------------------------
# 6️⃣ Run Chatbot
# -------------------------------------------------
if st.button("Run Chatbot"):
    if not video_id or not query or not language_code:
        st.warning("⚠️ Please fill in all fields and select a language.")
    else:
        with st.spinner("Fetching transcript..."):
            transcript = get_transcript(video_id, language_code)

            if not transcript:
                st.error("❌ Could not fetch transcript.")
            else:
                st.success(f"✅ Transcript fetched ({len(transcript)} characters).")

                with st.spinner("Creating knowledge base..."):
                    retriever = create_vector_store(transcript).as_retriever(
                        search_type="mmr",
                        search_kwargs={"k": 5}
                    )
                    relevant_docs = retriever.invoke(query)
                    context_text = "\n\n".join(doc.page_content for doc in relevant_docs)

                prompt = prompt_template.invoke({'context':context_text, 'question':query})

                with st.spinner(f"Generating response using {model_choice}..."):
                    model, is_chat = build_model(model_choice, temperature)

                    try:
                        if is_chat:
                            # DeepSeek & OpenAI (chat-based)
                            response = model.invoke(prompt)
                            response_text = (
                                response.content if hasattr(response, "content") else str(response)
                            )
                        else:
                            # Flan-T5 (non-chat)
                            response = model(prompt)
                            if isinstance(response, list) and "generated_text" in response[0]:
                                response_text = response[0]["generated_text"]
                            else:
                                response_text = str(response)

                        st.text_area("🧠 Model Response", value=response_text, height=400)
                    except Exception as e:
                        st.error(f"Model generation failed: {e}") ## answer