Daksh0505 committed on
Commit
5e7cee3
·
verified ·
1 Parent(s): af49b5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -109
app.py CHANGED
@@ -3,76 +3,86 @@ from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingF
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.vectorstores import FAISS
5
  from langchain.prompts import PromptTemplate
6
- from youtube_transcript_api import YouTubeTranscriptApi
7
  import requests
8
  import os
9
 
10
- # 🔑 Environment variables
11
  api_key = os.getenv("HF_API_KEY")
12
  RAPIDAPI_KEY = (os.getenv("RAPIDAPI_KEY") or "").strip()
13
- ytt_api = YouTubeTranscriptApi()
14
 
15
- # -----------------------------
16
- # List Available Languages
17
- # -----------------------------
18
- @st.cache_data
19
- def list_available_languages(video_id):
20
- """List available transcript languages using YouTubeTranscriptApi"""
21
- languages = []
22
- try:
23
- transcript_list = ytt_api.list(video_id) # ✅ use .list()
24
- for transcript in transcript_list: # transcript is an object
25
- lang_code = transcript.language_code
26
- lang_name = transcript.language
27
- is_generated = transcript.is_generated
28
- label = f"{lang_name} ({lang_code})" + (" - Auto-generated" if is_generated else "")
29
- languages.append((lang_code, label))
30
- return languages
31
- except Exception as e:
32
- st.warning(f"YouTubeTranscriptApi failed to list: {e}")
33
- return [("en", "English (en) - Default")]
34
 
35
- # -----------------------------
36
- # Fetch transcripts
37
- # -----------------------------
38
  @st.cache_data
39
- def get_transcript_youtube(video_id, language_code="en"):
40
- """Fetch transcript via YouTubeTranscriptApi safely, handling objects/dicts."""
 
 
 
 
 
 
41
  try:
42
- transcripts = ytt_api.list(video_id) # returns objects
43
- transcript_obj = transcripts.find_transcript([language_code])
44
- transcript_data = transcript_obj.fetch() # iterable of dicts or objects
45
- transcript = " ".join([t["text"] if isinstance(t, dict) else t.text for t in transcript_data])
46
- return transcript
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  except Exception as e:
48
- st.warning(f"YouTubeTranscriptApi failed: {e}")
49
- return None
50
-
 
 
 
 
 
 
 
 
 
 
51
  @st.cache_data
52
- def get_transcript_rapidapi(video_id, language_code="en"):
53
- """Fetch transcript via RapidAPI"""
54
- if not RAPIDAPI_KEY:
55
- st.warning("RapidAPI key not set")
56
- return None
 
 
57
  try:
58
- url = "https://youtube-transcript3.p.rapidapi.com/"
59
- querystring = {"videoId": video_id, "lang": language_code} # ✅ correct param
60
- headers = {
61
- "x-rapidapi-key": RAPIDAPI_KEY,
62
- "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
63
- }
64
- response = requests.get(url, headers=headers, params=querystring, timeout=20)
65
- response.raise_for_status()
66
  data = response.json()
67
- transcript = " ".join([item.get("text", "") for item in data.get("transcript", [])])
68
- return transcript if transcript else None
 
 
 
69
  except Exception as e:
70
- st.error(f"RapidAPI transcript fetch failed: {e}")
71
  return None
72
 
73
- # -----------------------------
74
  # Vector Store
75
- # -----------------------------
76
  @st.cache_data
77
  def create_vector_store(transcript):
78
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
@@ -83,9 +93,7 @@ def create_vector_store(transcript):
83
  )
84
  return FAISS.from_documents(docs, embeddings)
85
 
86
- # -----------------------------
87
  # Build Model
88
- # -----------------------------
89
  def build_model(model_choice, temperature=0.7):
90
  if model_choice == "Flan-T5 (Free)":
91
  llm = HuggingFaceEndpoint(
@@ -112,9 +120,7 @@ def build_model(model_choice, temperature=0.7):
112
  )
113
  return ChatHuggingFace(llm=llm, temperature=temperature)
114
 
115
- # -----------------------------
116
  # Prompt Template
117
- # -----------------------------
118
  prompt_template = PromptTemplate(
119
  template=(
120
  "Answer the question based on the context below.\n\n"
@@ -125,72 +131,51 @@ prompt_template = PromptTemplate(
125
  input_variables=["context", "question"]
126
  )
127
 
128
- # -----------------------------
129
- # Streamlit UI
130
- # -----------------------------
131
- st.title("🎥 YouTube Transcript Chatbot")
132
 
133
- video_id = st.text_input("🎬 YouTube Video ID", value="lv1_-RER4_I")
134
- query = st.text_area("💬 Your Query", value="What is RAG?")
135
- model_choice = st.radio("🧠 Model to Use", ["Flan-T5 (Free)", "DeepSeek", "OpenAI"])
136
- temperature = st.slider("🔥 Temperature", 0, 100, value=50) / 100.0
137
 
138
- source_choice = st.radio(
139
- "📜 Transcript Source",
140
- ["Auto (RapidAPI → YouTubeTranscriptApi)", "RapidAPI", "YouTubeTranscriptApi"]
141
- )
142
-
143
- # Select language
144
  language_code = None
145
  if video_id:
146
- with st.spinner("🔎 Checking available transcript languages..."):
147
- available_langs = list_available_languages(video_id)
148
- if available_langs:
149
- st.success(f"Found {len(available_langs)} transcript(s)")
150
- lang_options = {label: code for code, label in available_langs}
151
- selected_label = st.selectbox("🌐 Select Transcript Language", options=list(lang_options.keys()))
 
152
  language_code = lang_options[selected_label]
153
  else:
154
- st.warning("No transcripts found for this video.")
155
 
156
- # Fetch transcript & answer
157
- if st.button("🚀 Run Chatbot"):
158
  if not video_id or not query or not language_code:
159
- st.warning("Please provide video ID, query, and select a language.")
160
  else:
161
- with st.spinner("🧾 Fetching transcript..."):
162
- transcript = None
163
- if source_choice == "RapidAPI":
164
- transcript = get_transcript_rapidapi(video_id, language_code)
165
- elif source_choice == "YouTubeTranscriptApi":
166
- transcript = get_transcript_youtube(video_id, language_code)
167
- else: # Auto mode
168
- transcript = get_transcript_rapidapi(video_id, language_code)
169
- if not transcript:
170
- transcript = get_transcript_youtube(video_id, language_code)
171
-
172
  if not transcript:
173
- st.error("Could not fetch transcript from any source.")
174
  else:
175
- st.success(f"Transcript fetched ({len(transcript)} characters).")
176
-
177
- with st.spinner("⚙️ Generating response..."):
178
- retriever = create_vector_store(transcript).as_retriever(search_type="mmr", search_kwargs={"k": 5})
 
 
 
179
  relevant_docs = retriever.invoke(query)
180
  context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
181
  prompt = prompt_template.format(context=context_text, question=query)
182
-
183
  model = build_model(model_choice, temperature)
184
  response = model.invoke(prompt)
185
  response_text = response.content if hasattr(response, 'content') else str(response)
186
- st.text_area("🧩 Model Response", value=response_text, height=400)
187
-
188
- # Sidebar
189
- with st.sidebar:
190
- st.header("ℹ️ About this App")
191
- st.write("""
192
- - Uses both **RapidAPI** and **YouTubeTranscriptApi**
193
- - Correctly detects transcript languages dynamically
194
- - RAG-based Q&A powered by Hugging Face models
195
- - Models supported: Flan-T5 (Free), DeepSeek, OpenAI (via HF)
196
- """)
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.vectorstores import FAISS
5
  from langchain.prompts import PromptTemplate
 
6
  import requests
7
  import os
8
 
9
# Credentials come from the process environment when the script is loaded.
api_key = os.getenv("HF_API_KEY")
RAPIDAPI_KEY = os.getenv("RAPIDAPI_KEY", "").strip()

# Report a missing or blank RapidAPI key in the page as soon as it is detected.
if RAPIDAPI_KEY == "":
    st.error("RAPIDAPI_KEY not set")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
# Check available languages via RapidAPI

# Offered whenever the languages endpoint cannot supply a usable list.
_FALLBACK_LANGUAGES = (
    ("en", "English (en)"),
    ("hi", "Hindi (hi)"),
    ("es", "Spanish (es)"),
    ("fr", "French (fr)"),
    ("de", "German (de)"),
    ("ja", "Japanese (ja)"),
    ("pt", "Portuguese (pt)"),
    ("ru", "Russian (ru)"),
)


@st.cache_data
def get_available_languages(video_id):
    """Return the transcript languages RapidAPI reports for a video.

    Args:
        video_id: YouTube video ID, sent as the ``videoId`` query parameter.

    Returns:
        A list of ``(code, label)`` tuples where ``label`` is
        ``"<name> (<code>)"``. If the request raises, the status is not 200,
        or the payload has no truthy ``success`` flag with a ``languages``
        key, a fixed list of common languages is returned instead. A
        warning is shown in the UI only when an exception was raised.
    """
    url = "https://youtube-transcript3.p.rapidapi.com/api/languages"
    querystring = {"videoId": video_id}
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
    }
    try:
        response = requests.get(url, headers=headers, params=querystring, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get("success") and "languages" in data:
                languages = []
                for lang in data["languages"]:
                    code = lang.get("code", "")
                    if not code:
                        # An entry without a code cannot be requested later
                        # and would otherwise appear as a dead option.
                        continue
                    # Fall back to the code so the label is never " (xx)".
                    name = lang.get("name") or code
                    languages.append((code, f"{name} ({code})"))
                return languages
    except Exception as e:
        # Deliberately broad: any network or payload problem should degrade
        # to the fallback list rather than crash the page.
        st.warning(f"Could not fetch languages: {e}. Using common languages.")
    # Fallback to common languages if API fails
    return list(_FALLBACK_LANGUAGES)
60
+
61
# Transcript Fetcher
@st.cache_data
def get_transcript(video_id, language_code="en"):
    """Fetch a video's transcript from RapidAPI as one space-joined string.

    Args:
        video_id: YouTube video ID, sent as the ``videoId`` query parameter.
        language_code: Transcript language, sent as the ``lang`` parameter.

    Returns:
        The ``text`` fields of the returned transcript items joined with
        single spaces, or ``None`` when the key is missing, the status is
        not 200, the payload lacks a truthy ``success`` flag or a
        ``transcript`` key, or an exception is raised. Each failure path
        also shows a message in the Streamlit UI.
    """
    if not RAPIDAPI_KEY:
        # Skip the round trip and report the real cause rather than an
        # opaque HTTP status from a request made with an empty key.
        st.error("RAPIDAPI_KEY not set")
        return None

    url = "https://youtube-transcript3.p.rapidapi.com/api/transcript"
    querystring = {"videoId": video_id, "lang": language_code}
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "youtube-transcript3.p.rapidapi.com"
    }
    try:
        response = requests.get(url, headers=headers, params=querystring, timeout=10)
        if response.status_code != 200:
            st.error(f"API Error: {response.status_code}")
            return None

        data = response.json()
        if data.get("success") and "transcript" in data:
            return ' '.join([item.get('text', '') for item in data["transcript"]])
        else:
            st.warning("Unexpected API response format")
            return None
    except Exception as e:
        # Deliberately broad: network failures and malformed payloads are
        # both reported in the UI instead of aborting the script run.
        st.error(f"Error: {str(e)}")
        return None
84
 
 
85
  # Vector Store
 
86
  @st.cache_data
87
  def create_vector_store(transcript):
88
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
 
93
  )
94
  return FAISS.from_documents(docs, embeddings)
95
 
 
96
  # Build Model
 
97
  def build_model(model_choice, temperature=0.7):
98
  if model_choice == "Flan-T5 (Free)":
99
  llm = HuggingFaceEndpoint(
 
120
  )
121
  return ChatHuggingFace(llm=llm, temperature=temperature)
122
 
 
123
  # Prompt Template
 
124
  prompt_template = PromptTemplate(
125
  template=(
126
  "Answer the question based on the context below.\n\n"
 
131
  input_variables=["context", "question"]
132
  )
133
 
134
# UI
# NOTE(review): block nesting (especially what sits inside each spinner) was
# reconstructed from a flattened diff view — confirm against the real file.
st.title("YouTube Transcript Chatbot")

# Input widgets, created in the order they appear on the page.
video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I")
query = st.text_area("Your Query", value="What is RAG?")
model_choice = st.radio("Model to Use", ["Flan-T5 (Free)", "DeepSeek", "OpenAI"])
temperature_percent = st.slider("Temperature", 0, 100, value=50)
temperature = temperature_percent / 100.0  # slider is 0-100, model gets 0.0-1.0

# Language picker: only shown once a video ID has been entered.
language_code = None
if video_id:
    with st.spinner("Checking available languages..."):
        available_languages = get_available_languages(video_id)

    if not available_languages:
        st.warning("No languages found")
    else:
        st.success(f"Found {len(available_languages)} language(s)")
        # Map the display label back to its language code.
        lang_options = {}
        for code, label in available_languages:
            lang_options[label] = code
        selected_label = st.selectbox("Select Language", options=list(lang_options))
        language_code = lang_options[selected_label]

if st.button("Run Chatbot"):
    if not (video_id and query and language_code):
        st.warning("Please fill in all fields and select a language.")
    else:
        with st.spinner("Fetching transcript..."):
            transcript = get_transcript(video_id, language_code)

        if transcript:
            st.success(f"Transcript fetched ({len(transcript)} characters).")

            with st.spinner("Generating response..."):
                # Retrieve the transcript chunks relevant to the query.
                vector_store = create_vector_store(transcript)
                retriever = vector_store.as_retriever(
                    search_type="mmr",
                    search_kwargs={"k": 5},
                )
                relevant_docs = retriever.invoke(query)
                chunks = [doc.page_content for doc in relevant_docs]
                context_text = "\n\n".join(chunks)
                prompt = prompt_template.format(context=context_text, question=query)

                model = build_model(model_choice, temperature)
                response = model.invoke(prompt)
                # Use .content when the response object has one; otherwise
                # fall back to its string form.
                if hasattr(response, "content"):
                    response_text = response.content
                else:
                    response_text = str(response)

            st.text_area("Response", value=response_text, height=400)
        else:
            st.error("Could not fetch transcript.")