VladBoyko commited on
Commit
ebe1956
Β·
verified Β·
1 Parent(s): fa4b185

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +323 -257
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer
 
4
  import re
5
  import time
6
 
@@ -35,10 +36,11 @@ class VibeThinkerModel:
35
  print(f"❌ Error loading model: {e}")
36
  raise
37
 
38
- def generate_response(self, prompt, temperature=0.6, max_new_tokens=8192, max_thinking_tokens=4096):
39
- """Generate response with thinking length control"""
40
  if not self.model or not self.tokenizer:
41
- return "Model not loaded!", 0, 0, 0, None
 
42
 
43
  try:
44
  start_time = time.time()
@@ -61,232 +63,342 @@ Keep reasoning under {max_thinking_tokens} tokens. Be direct and avoid repetitio
61
  inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)
62
  prompt_length = inputs.input_ids.shape[1]
63
 
64
- with torch.no_grad():
65
- outputs = self.model.generate(
66
- **inputs,
67
- max_new_tokens=max_new_tokens,
68
- temperature=temperature,
69
- top_p=0.95,
70
- top_k=50,
71
- do_sample=True,
72
- repetition_penalty=1.1,
73
- pad_token_id=self.tokenizer.eos_token_id,
74
- )
 
 
 
 
 
 
 
 
75
 
76
- full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
77
 
78
- if "<|im_start|>assistant" in full_output:
79
- generated_text = full_output.split("<|im_start|>assistant")[-1].strip()
80
- else:
81
- generated_text = full_output[len(formatted_prompt):].strip()
82
 
83
- # Store original before truncation
84
- original_text = generated_text
85
- truncated_content = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- # Check for loops and truncate if needed
88
- if self._detect_loop(generated_text):
89
- generated_text = self._truncate_loop(generated_text)
90
- # Calculate what was removed
91
- if len(generated_text) < len(original_text):
92
- truncated_content = original_text[len(generated_text):].strip()
93
- generated_text += "\n\n⚠️ *[Repetitive content detected and truncated]*"
94
 
95
- generation_time = time.time() - start_time
96
- completion_length = outputs.shape[1] - prompt_length
 
97
 
98
- return generated_text, prompt_length, completion_length, generation_time, truncated_content
 
 
 
 
 
 
99
 
100
  except Exception as e:
101
- return f"Error during generation: {str(e)}", 0, 0, 0, None
102
 
103
- def _detect_loop(self, text):
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  words = text.split()
105
- if len(words) < 20:
106
  return False
107
 
108
- for length in [10, 15, 20]:
109
- if len(words) < length * 3:
 
 
 
 
 
110
  continue
111
- for i in range(len(words) - length * 3):
112
- phrase = ' '.join(words[i:i+length])
113
- rest = ' '.join(words[i+length:])
114
- if rest.count(phrase) >= 2:
 
 
 
 
 
 
115
  return True
 
116
  return False
117
 
118
- def _truncate_loop(self, text):
 
 
 
 
119
  words = text.split()
120
- for length in [10, 15, 20]:
121
- if len(words) < length * 2:
 
 
122
  continue
123
- for i in range(len(words) - length * 2):
124
- phrase = ' '.join(words[i:i+length])
125
- rest_start = i + length
126
- rest = ' '.join(words[rest_start:])
127
- if phrase in rest:
128
- return ' '.join(words[:rest_start])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  return text
130
 
131
- def parse_model_output(text, truncated_content=None):
132
- """
133
- Parse model output into sections with proper edge case handling
134
- """
135
- loop_warning = ""
136
- loop_details_html = ""
137
 
138
- if "[Repetitive content detected and truncated]" in text:
139
- text = text.replace("⚠️ *[Repetitive content detected and truncated]*", "")
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- # Create expandable section for truncated content
142
- if truncated_content:
143
- truncated_escaped = truncated_content.replace('<', '&lt;').replace('>', '&gt;')
144
- truncated_word_count = len(truncated_content.split())
145
- loop_details_html = f"""
146
- <details style="background: #fff3cd; border: 2px solid #ffc107; border-radius: 8px; padding: 16px; margin-top: 12px;">
147
- <summary style="cursor: pointer; font-weight: 600; font-size: 14px; color: #856404; user-select: none; display: flex; align-items: center; gap: 8px;">
148
- <span style="font-size: 18px;">⚠️</span>
149
- <span>Truncated Repetitive Content ({truncated_word_count} words removed)</span>
150
- <span style="margin-left: auto; font-size: 12px; font-weight: normal;">β–Ά Click to view what was removed</span>
151
- </summary>
152
- <div style="margin-top: 12px; padding-top: 12px; border-top: 2px solid #ffc107; color: #856404; line-height: 1.6; white-space: pre-wrap; font-size: 13px; font-family: 'SF Mono', Monaco, 'Courier New', monospace; max-height: 400px; overflow-y: auto;">
153
- {truncated_escaped}
154
- </div>
155
- </details>
156
- """
157
 
158
- loop_warning = loop_details_html
159
-
160
- # Extract all code blocks
161
- code_pattern = r'```(\w+)?\n(.*?)```'
162
- code_blocks = re.findall(code_pattern, text, re.DOTALL)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
- # Remove code blocks from text
165
- text_without_code = re.sub(code_pattern, '###CODE_PLACEHOLDER###', text, flags=re.DOTALL)
166
 
167
- # Try to find thinking section
168
- thinking_content = ""
169
- explanation_content = text_without_code
170
 
171
- # Check for explicit thinking tags
172
- thinking_patterns = [
173
- r'<think>(.*?)</think>',
174
- r'<thinking>(.*?)</thinking>',
175
- ]
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- for pattern in thinking_patterns:
178
- match = re.search(pattern, text_without_code, re.DOTALL | re.IGNORECASE)
179
- if match:
180
- thinking_content = match.group(1).strip()
181
- explanation_content = re.sub(pattern, '', text_without_code, flags=re.DOTALL | re.IGNORECASE).strip()
182
- break
183
 
184
- # If no explicit tags, try to detect thinking by content
185
- if not thinking_content:
186
- split_patterns = [
187
- r'^(.*?)(?=\n\n(?:Solution|Implementation|Code|Here\'s|Let me|Let\'s code|Final code))',
188
- r'^(.*?)(?=###CODE_PLACEHOLDER###)',
189
- ]
190
 
191
- for pattern in split_patterns:
192
- match = re.search(pattern, text_without_code, re.DOTALL | re.IGNORECASE)
193
- if match:
194
- potential_thinking = match.group(1).strip()
195
-
196
- if len(potential_thinking) > 150:
197
- thinking_lower = potential_thinking.lower()
198
- reasoning_keywords = [
199
- 'let me think', 'let\'s think', 'first', 'approach', 'idea',
200
- 'we can', 'we need', 'step', 'analyze', 'consider', 'observation'
201
- ]
202
- if any(keyword in thinking_lower for keyword in reasoning_keywords):
203
- thinking_content = potential_thinking
204
- explanation_content = text_without_code[len(potential_thinking):].strip()
205
- break
206
-
207
- # Clean up explanation
208
- explanation_content = explanation_content.replace('###CODE_PLACEHOLDER###', '').strip()
209
- explanation_content = re.sub(r'\n{3,}', '\n\n', explanation_content)
210
- explanation_content = re.sub(
211
- r'(?:Implementation|Code|Solution|Here\'s the code|Final code):\s*$',
212
- '',
213
- explanation_content,
214
- flags=re.IGNORECASE
215
- ).strip()
216
-
217
- # Handle boxed answers
218
- answer_match = re.search(r'\\boxed\{([^}]+)\}', explanation_content)
219
- if answer_match:
220
- explanation_content = f"**Final Answer:** {answer_match.group(1)}\n\n{explanation_content}"
221
-
222
- explanation_content += loop_warning
223
-
224
- return thinking_content, explanation_content, code_blocks
225
-
226
- def format_output_html(thinking, explanation, code_blocks, prompt_tokens, completion_tokens, generation_time):
227
- """
228
- Format output with harmonized design and edge case handling
229
  """
230
- total_tokens = prompt_tokens + completion_tokens
231
- thinking_tokens_est = len(thinking.split()) * 1.3 if thinking else 0
232
- tokens_per_sec = completion_tokens / generation_time if generation_time > 0 else 0
233
 
234
- # Card style for consistent sections
235
- card_base_style = "background: #ffffff; border-radius: 12px; padding: 24px; margin-bottom: 20px; box-shadow: 0 2px 8px rgba(0,0,0,0.08);"
 
 
 
 
 
 
 
 
236
 
237
- # Thinking section (collapsed, only show if exists)
238
- thinking_html = ""
239
  if thinking and len(thinking.strip()) > 0:
240
  thinking_escaped = thinking.replace('<', '&lt;').replace('>', '&gt;')
241
- thinking_html = f"""
242
- <details style="{card_base_style} border-left: 4px solid #6c757d;">
243
- <summary style="cursor: pointer; font-weight: 600; font-size: 16px; color: #495057; user-select: none; display: flex; align-items: center; gap: 10px; padding: 4px 0;">
 
244
  <span style="font-size: 20px;">🧠</span>
245
  <span>Reasoning Process</span>
246
- <span style="margin-left: auto; font-size: 13px; color: #6c757d; font-weight: normal;">~{int(thinking_tokens_est):,} tokens β€’ Click to expand β–Ό</span>
247
  </summary>
248
  <div style="margin-top: 20px; padding-top: 20px; border-top: 2px solid #e9ecef; color: #495057; line-height: 1.8; white-space: pre-wrap; font-size: 14px; font-family: 'SF Mono', Monaco, 'Courier New', monospace;">
249
- {thinking_escaped}
250
  </div>
251
  </details>
252
  """
253
 
254
- # Explanation section (only show if has meaningful content)
255
- # Note: explanation may contain the loop warning HTML which should NOT be escaped
256
- explanation_html = ""
257
  if explanation and len(explanation.strip()) > 10:
258
- # Split into text and HTML parts
259
- # If it contains our loop warning HTML, don't escape that part
260
- if '<details style="background: #fff3cd' in explanation:
261
- # Split at the warning
262
- parts = explanation.split('<details style="background: #fff3cd', 1)
263
- text_part = parts[0].replace('<', '&lt;').replace('>', '&gt;')
264
- html_part = '<details style="background: #fff3cd' + parts[1] if len(parts) > 1 else ''
265
- explanation_display = text_part + html_part
266
- else:
267
- explanation_display = explanation.replace('<', '&lt;').replace('>', '&gt;')
268
-
269
- explanation_html = f"""
270
- <div style="{card_base_style} border-left: 4px solid #28a745;">
271
- <h3 style="margin: 0 0 16px 0; color: #28a745; font-size: 18px; font-weight: 600; display: flex; align-items: center; gap: 10px;">
272
- <span style="font-size: 22px;">βœ…</span>
273
  <span>Solution Explanation</span>
274
  </h3>
275
  <div style="color: #495057; line-height: 1.8; font-size: 15px; white-space: pre-wrap;">
276
- {explanation_display}
277
  </div>
278
  </div>
279
  """
280
 
281
- # Code section (only show if code exists)
282
- code_html = ""
283
  if code_blocks and len(code_blocks) > 0:
284
  code_blocks_html = ""
285
  for idx, (lang, code) in enumerate(code_blocks):
286
  lang_display = lang if lang else "code"
287
- code_id = f"code_{idx}"
288
  code_clean = code.strip()
289
 
 
 
 
290
  code_blocks_html += f"""
291
  <div style="margin-bottom: 16px; background: #1e1e1e; border-radius: 10px; overflow: hidden; box-shadow: 0 2px 8px rgba(0,0,0,0.15);">
292
  <div style="background: #2d2d2d; padding: 12px 20px; color: #e0e0e0; font-weight: 600; font-size: 13px; display: flex; justify-content: space-between; align-items: center;">
@@ -295,13 +407,13 @@ def format_output_html(thinking, explanation, code_blocks, prompt_tokens, comple
295
  <span>{lang_display}</span>
296
  </span>
297
  <div style="display: flex; gap: 8px;">
298
- <button onclick="navigator.clipboard.writeText(document.getElementById('{code_id}').textContent); this.textContent='βœ“ Copied'; setTimeout(() => this.textContent='πŸ“‹ Copy', 2000)"
299
  style="background: #28a745; color: white; border: none; padding: 7px 16px; border-radius: 6px; cursor: pointer; font-size: 12px; font-weight: 500; transition: all 0.2s;"
300
  onmouseover="if(this.textContent==='πŸ“‹ Copy') this.style.background='#218838'"
301
  onmouseout="if(this.textContent==='πŸ“‹ Copy') this.style.background='#28a745'">
302
  πŸ“‹ Copy
303
  </button>
304
- <button onclick="downloadCode(document.getElementById('{code_id}').textContent, '{lang_display}')"
305
  style="background: #007bff; color: white; border: none; padding: 7px 16px; border-radius: 6px; cursor: pointer; font-size: 12px; font-weight: 500; transition: all 0.2s;"
306
  onmouseover="this.style.background='#0056b3'"
307
  onmouseout="this.style.background='#007bff'">
@@ -309,14 +421,14 @@ def format_output_html(thinking, explanation, code_blocks, prompt_tokens, comple
309
  </button>
310
  </div>
311
  </div>
312
- <pre style="margin: 0; padding: 20px; color: #d4d4d4; overflow-x: auto; font-family: 'SF Mono', Monaco, 'Courier New', monospace; font-size: 14px; line-height: 1.6; background: #1e1e1e;"><code id="{code_id}">{code_clean}</code></pre>
313
  </div>
314
  """
315
 
316
- code_html = f"""
317
- <div style="{card_base_style} border-left: 4px solid #007bff;">
318
- <h3 style="margin: 0 0 20px 0; color: #007bff; font-size: 18px; font-weight: 600; display: flex; align-items: center; gap: 10px;">
319
- <span style="font-size: 22px;">πŸ“</span>
320
  <span>Implementation</span>
321
  </h3>
322
  {code_blocks_html}
@@ -345,85 +457,31 @@ def format_output_html(thinking, explanation, code_blocks, prompt_tokens, comple
345
  </script>
346
  """
347
 
348
- # If no explanation but has code, add a minimal message
349
- if not explanation_html and code_html:
350
- explanation_html = f"""
351
- <div style="{card_base_style} border-left: 4px solid #6c757d;">
352
- <p style="color: #6c757d; font-size: 14px; margin: 0; font-style: italic;">
353
- No explanation provided - see implementation below.
354
- </p>
355
- </div>
356
- """
357
-
358
- html = f"""
359
- <div style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif; max-width: 100%; margin: 0 auto; background: #f8f9fa; padding: 20px; border-radius: 12px;">
360
-
361
- <!-- Stats Card -->
362
- <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 24px; border-radius: 12px; margin-bottom: 20px; color: white; box-shadow: 0 4px 12px rgba(102,126,234,0.3);">
363
- <h3 style="margin: 0 0 16px 0; font-size: 17px; font-weight: 600; opacity: 0.95;">πŸ“Š Generation Stats</h3>
364
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(130px, 1fr)); gap: 12px; font-size: 13px;">
365
- <div style="background: rgba(255,255,255,0.15); padding: 14px; border-radius: 8px; backdrop-filter: blur(10px);">
366
- <div style="opacity: 0.85; font-size: 11px; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px;">Time</div>
367
- <div style="font-size: 22px; font-weight: 700;">{generation_time:.1f}s</div>
368
- </div>
369
- <div style="background: rgba(255,255,255,0.15); padding: 14px; border-radius: 8px; backdrop-filter: blur(10px);">
370
- <div style="opacity: 0.85; font-size: 11px; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px;">Speed</div>
371
- <div style="font-size: 22px; font-weight: 700;">{tokens_per_sec:.1f} t/s</div>
372
- </div>
373
- <div style="background: rgba(255,255,255,0.15); padding: 14px; border-radius: 8px; backdrop-filter: blur(10px);">
374
- <div style="opacity: 0.85; font-size: 11px; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px;">Prompt</div>
375
- <div style="font-size: 22px; font-weight: 700;">{prompt_tokens:,}</div>
376
- </div>
377
- <div style="background: rgba(255,255,255,0.15); padding: 14px; border-radius: 8px; backdrop-filter: blur(10px);">
378
- <div style="opacity: 0.85; font-size: 11px; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px;">Output</div>
379
- <div style="font-size: 22px; font-weight: 700;">{completion_tokens:,}</div>
380
- </div>
381
- <div style="background: rgba(255,255,255,0.15); padding: 14px; border-radius: 8px; backdrop-filter: blur(10px);">
382
- <div style="opacity: 0.85; font-size: 11px; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px;">Thinking</div>
383
- <div style="font-size: 22px; font-weight: 700;">~{int(thinking_tokens_est):,}</div>
384
- </div>
385
- <div style="background: rgba(255,255,255,0.15); padding: 14px; border-radius: 8px; backdrop-filter: blur(10px);">
386
- <div style="opacity: 0.85; font-size: 11px; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px;">Total</div>
387
- <div style="font-size: 22px; font-weight: 700;">{total_tokens:,}</div>
388
- </div>
389
- </div>
390
- </div>
391
-
392
- <!-- Content Sections -->
393
- {thinking_html}
394
- {explanation_html}
395
- {code_html}
396
-
397
- </div>
398
- """
399
  return html
400
 
401
- # Initialize model
402
- print("πŸ”„ Initializing VibeThinker-1.5B...")
403
- vibe_model = VibeThinkerModel()
404
-
405
- def generate_solution(prompt, temperature=0.6, max_tokens=8192, max_thinking_tokens=4096, progress=gr.Progress()):
406
- """Generate and format solution with progress tracking"""
407
  if not prompt.strip():
408
- return "<p style='color: #dc3545; font-size: 16px; padding: 20px;'>⚠️ Please enter a problem to solve.</p>"
 
409
 
410
- progress(0, desc="πŸ”„ Initializing...")
411
- progress(0.2, desc="🧠 Generating solution...")
412
 
413
- response, prompt_tokens, completion_tokens, gen_time, truncated = vibe_model.generate_response(
 
414
  prompt,
415
  temperature=temperature,
416
  max_new_tokens=max_tokens,
417
  max_thinking_tokens=max_thinking_tokens
418
- )
419
-
420
- progress(0.8, desc="πŸ“ Formatting output...")
421
-
422
- thinking, explanation, code_blocks = parse_model_output(response, truncated)
423
- html_output = format_output_html(thinking, explanation, code_blocks, prompt_tokens, completion_tokens, gen_time)
424
-
425
- progress(1.0, desc="βœ… Complete!")
426
- return html_output
427
 
428
  # Create Gradio interface
429
  with gr.Blocks(
@@ -438,6 +496,8 @@ with gr.Blocks(
438
  🎯 **Best for**: Python algorithmic problems with clear input/output specifications
439
 
440
  ⚠️ **Note**: This model is specialized for competitive programming, not general software development
 
 
441
  """)
442
 
443
  with gr.Row():
@@ -468,11 +528,17 @@ with gr.Blocks(
468
  - Higher thinking tokens (4096-8192) for complex reasoning
469
  - Temperature 0.6 balances creativity and accuracy
470
 
471
- **Output Structure:**
472
- - 🧠 **Reasoning** (collapsed) - Model's thinking process
473
- - βœ… **Explanation** - Solution approach without code
474
- - πŸ“ **Implementation** - Clean code with copy/download
475
- - ⚠️ **Truncated Content** (if detected) - View removed repetitions
 
 
 
 
 
 
476
  """)
477
 
478
  generate_btn = gr.Button("πŸš€ Generate Solution", variant="primary", size="lg")
@@ -482,7 +548,7 @@ with gr.Blocks(
482
  output_html = gr.HTML(label="Solution")
483
 
484
  generate_btn.click(
485
- fn=generate_solution,
486
  inputs=[prompt_input, temperature_slider, max_tokens_slider, max_thinking_slider],
487
  outputs=output_html
488
  )
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
4
+ from threading import Thread
5
  import re
6
  import time
7
 
 
36
  print(f"❌ Error loading model: {e}")
37
  raise
38
 
39
+ def generate_response_streaming(self, prompt, temperature=0.6, max_new_tokens=8192, max_thinking_tokens=4096):
40
+ """Generate response with streaming and real-time loop detection"""
41
  if not self.model or not self.tokenizer:
42
+ yield "Model not loaded!", None, False
43
+ return
44
 
45
  try:
46
  start_time = time.time()
 
63
  inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)
64
  prompt_length = inputs.input_ids.shape[1]
65
 
66
+ # Create streamer
67
+ streamer = TextIteratorStreamer(
68
+ self.tokenizer,
69
+ skip_prompt=True,
70
+ skip_special_tokens=True
71
+ )
72
+
73
+ # Generation kwargs
74
+ generation_kwargs = dict(
75
+ **inputs,
76
+ max_new_tokens=max_new_tokens,
77
+ temperature=temperature,
78
+ top_p=0.95,
79
+ top_k=50,
80
+ do_sample=True,
81
+ repetition_penalty=1.1,
82
+ pad_token_id=self.tokenizer.eos_token_id,
83
+ streamer=streamer,
84
+ )
85
 
86
+ # Start generation in background thread
87
+ thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
88
+ thread.start()
89
 
90
+ # Stream tokens with real-time loop detection
91
+ generated_text = ""
92
+ last_loop_check = ""
93
+ loop_detected = False
94
 
95
+ for new_text in streamer:
96
+ if loop_detected:
97
+ break # Stop streaming if loop detected
98
+
99
+ generated_text += new_text
100
+ generation_time = time.time() - start_time
101
+ tokens_generated = len(self.tokenizer.encode(generated_text))
102
+
103
+ # Check for loops every ~50 tokens
104
+ if len(generated_text) - len(last_loop_check) > 200:
105
+ if self._detect_loop_realtime(generated_text):
106
+ loop_detected = True
107
+ # Truncate at loop point
108
+ generated_text = self._truncate_loop(generated_text)
109
+ last_loop_check = generated_text
110
+
111
+ # Yield current state
112
+ yield generated_text, {
113
+ "prompt_tokens": prompt_length,
114
+ "tokens_generated": tokens_generated,
115
+ "generation_time": generation_time,
116
+ "is_complete": False
117
+ }, loop_detected
118
 
119
+ # Wait for thread to complete
120
+ thread.join()
 
 
 
 
 
121
 
122
+ # Final yield
123
+ final_time = time.time() - start_time
124
+ final_tokens = len(self.tokenizer.encode(generated_text))
125
 
126
+ yield generated_text, {
127
+ "prompt_tokens": prompt_length,
128
+ "completion_tokens": final_tokens,
129
+ "generation_time": final_time,
130
+ "tokens_per_sec": final_tokens / final_time if final_time > 0 else 0,
131
+ "is_complete": True
132
+ }, loop_detected
133
 
134
  except Exception as e:
135
+ yield f"Error during generation: {str(e)}", None, False
136
 
137
+ def _detect_loop_realtime(self, text, check_window=200, min_repetitions=5):
138
+ """Quick loop detection for real-time streaming.
139
+
140
+ Real loops repeat SHORT phrases (3-9 words) MANY times (5+ repetitions).
141
+ Example: "Wait, let me reconsider... Wait, let me reconsider... Wait, let me reconsider..."
142
+
143
+ Args:
144
+ text: Full generated text
145
+ check_window: Number of recent words to check (default: 200)
146
+ min_repetitions: Minimum repetitions to consider a loop (default: 5)
147
+
148
+ Returns:
149
+ bool: True if a loop is detected
150
+ """
151
  words = text.split()
152
+ if len(words) < 30:
153
  return False
154
 
155
+ # Check last N words for repetitive patterns
156
+ recent_words = words[-check_window:] if len(words) > check_window else words
157
+ text_to_check = ' '.join(recent_words)
158
+
159
+ # Look for short phrases (3-9 words) repeated multiple times
160
+ for phrase_len in range(3, 10): # 3 to 9 words
161
+ if len(recent_words) < phrase_len * min_repetitions:
162
  continue
163
+
164
+ # Check different starting positions
165
+ for i in range(len(recent_words) - phrase_len):
166
+ phrase = ' '.join(recent_words[i:i+phrase_len])
167
+
168
+ # Count how many times this phrase appears
169
+ count = text_to_check.count(phrase)
170
+
171
+ # If phrase appears 5+ times, it's a loop
172
+ if count >= min_repetitions:
173
  return True
174
+
175
  return False
176
 
177
+ def _truncate_loop(self, text, min_repetitions=5):
178
+ """Truncate text at the point where loop starts.
179
+
180
+ Find where a 3-9 word phrase starts repeating 5+ times and truncate there.
181
+ """
182
  words = text.split()
183
+
184
+ # Check for short phrases (3-9 words) repeated multiple times
185
+ for phrase_len in range(3, 10):
186
+ if len(words) < phrase_len * min_repetitions:
187
  continue
188
+
189
+ # Scan through text to find loop start point
190
+ for i in range(len(words) - phrase_len * min_repetitions):
191
+ phrase = ' '.join(words[i:i+phrase_len])
192
+
193
+ # Count consecutive repetitions starting from position i
194
+ repetition_count = 0
195
+ check_pos = i
196
+
197
+ while check_pos + phrase_len <= len(words):
198
+ check_phrase = ' '.join(words[check_pos:check_pos+phrase_len])
199
+ if check_phrase == phrase:
200
+ repetition_count += 1
201
+ check_pos += phrase_len
202
+ else:
203
+ break
204
+
205
+ # If we found 5+ consecutive repetitions, truncate at loop start
206
+ if repetition_count >= min_repetitions:
207
+ return ' '.join(words[:i])
208
+
209
+ # If no clear loop found, return original
210
  return text
211
 
212
+ # Initialize model
213
+ print("πŸ”„ Initializing VibeThinker-1.5B...")
214
+ vibe_model = VibeThinkerModel()
215
+
216
+ class IntelligentStreamParser:
217
+ """Parse streaming output in real-time into sections"""
218
 
219
+ def __init__(self):
220
+ self.reset()
221
+
222
+ def reset(self):
223
+ self.thinking = ""
224
+ self.explanation = ""
225
+ self.code_blocks = []
226
+ self.current_code_lang = None
227
+ self.current_code_content = ""
228
+ self.in_code_block = False
229
+ self.phase = "thinking" # thinking -> explanation -> code
230
+
231
+ def parse_chunk(self, full_text):
232
+ """Parse text in real-time as it streams"""
233
 
234
+ # Detect code blocks with regex
235
+ code_pattern = r'```(\w+)?\n(.*?)```'
236
+ found_codes = re.findall(code_pattern, full_text, re.DOTALL)
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
+ # Remove code blocks from text for section detection
239
+ text_without_code = re.sub(code_pattern, '###CODE_PLACEHOLDER###', full_text, flags=re.DOTALL)
240
+
241
+ # Try to split thinking and explanation
242
+ thinking_content = ""
243
+ explanation_content = text_without_code
244
+
245
+ # Check for explicit markers
246
+ if "Solution:" in text_without_code or "Explanation:" in text_without_code:
247
+ parts = re.split(r'(?:Solution|Explanation):', text_without_code, maxsplit=1)
248
+ if len(parts) == 2:
249
+ thinking_content = parts[0].strip()
250
+ explanation_content = parts[1].strip()
251
+ elif "```" in text_without_code:
252
+ # Split at first code block
253
+ parts = text_without_code.split("###CODE_PLACEHOLDER###", maxsplit=1)
254
+ if len(parts) == 2 and len(parts[0]) > 100:
255
+ # Check if first part looks like thinking
256
+ first_part_lower = parts[0].lower()
257
+ thinking_keywords = ['approach', 'idea', 'step', 'first', "let's", 'plan', 'strategy']
258
+ if any(kw in first_part_lower for kw in thinking_keywords):
259
+ thinking_content = parts[0].strip()
260
+ explanation_content = parts[1].strip()
261
+
262
+ # Clean up placeholders
263
+ explanation_content = explanation_content.replace('###CODE_PLACEHOLDER###', '').strip()
264
+
265
+ return {
266
+ 'thinking': thinking_content,
267
+ 'explanation': explanation_content,
268
+ 'code_blocks': found_codes
269
+ }
270
+
271
+ parser = IntelligentStreamParser()
272
+
273
+ def format_streaming_html(generated_text, stats, loop_detected, is_generating=True):
274
+ """Format streaming output with intelligent parsing"""
275
+
276
+ # Parse the current text
277
+ parsed = parser.parse_chunk(generated_text)
278
+
279
+ thinking = parsed['thinking']
280
+ explanation = parsed['explanation']
281
+ code_blocks = parsed['code_blocks']
282
+
283
+ # Stats
284
+ if stats:
285
+ prompt_tokens = stats.get('prompt_tokens', 0)
286
+ tokens_generated = stats.get('tokens_generated', 0) or stats.get('completion_tokens', 0)
287
+ generation_time = stats.get('generation_time', 0)
288
+ tokens_per_sec = stats.get('tokens_per_sec', 0) or (tokens_generated / generation_time if generation_time > 0 else 0)
289
+ is_complete = stats.get('is_complete', False)
290
+ else:
291
+ prompt_tokens = tokens_generated = generation_time = tokens_per_sec = 0
292
+ is_complete = False
293
 
294
+ thinking_tokens_est = len(thinking.split()) * 1.3 if thinking else 0
295
+ total_tokens = prompt_tokens + tokens_generated
296
 
297
+ # Card style
298
+ card_base_style = "background: #ffffff; border-radius: 12px; padding: 24px; margin-bottom: 20px; box-shadow: 0 2px 8px rgba(0,0,0,0.08);"
 
299
 
300
+ # Blink cursor CSS
301
+ cursor_style = """
302
+ <style>
303
+ @keyframes blink {
304
+ 0%, 49% { opacity: 1; }
305
+ 50%, 100% { opacity: 0; }
306
+ }
307
+ .cursor {
308
+ display: inline-block;
309
+ width: 2px;
310
+ height: 1em;
311
+ background: #667eea;
312
+ margin-left: 2px;
313
+ animation: blink 0.7s infinite;
314
+ }
315
+ </style>
316
+ """
317
 
318
+ # Status message
319
+ status_emoji = "βœ…" if is_complete else "πŸ”„"
320
+ status_text = "Complete" if is_complete else "Generating..."
 
 
 
321
 
322
+ # Stats card
323
+ html = f"""
324
+ {cursor_style}
325
+ <div style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif; max-width: 100%; margin: 0 auto; background: #f8f9fa; padding: 20px; border-radius: 12px;">
 
 
326
 
327
+ <!-- Stats Card -->
328
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 24px; border-radius: 12px; margin-bottom: 20px; color: white; box-shadow: 0 4px 12px rgba(102,126,234,0.3);">
329
+ <h3 style="margin: 0 0 16px 0; font-size: 17px; font-weight: 600; opacity: 0.95;">{status_emoji} {status_text}</h3>
330
+ <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(130px, 1fr)); gap: 12px; font-size: 13px;">
331
+ <div style="background: rgba(255,255,255,0.15); padding: 14px; border-radius: 8px; backdrop-filter: blur(10px);">
332
+ <div style="opacity: 0.85; font-size: 11px; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px;">Time</div>
333
+ <div style="font-size: 22px; font-weight: 700;">{generation_time:.1f}s</div>
334
+ </div>
335
+ <div style="background: rgba(255,255,255,0.15); padding: 14px; border-radius: 8px; backdrop-filter: blur(10px);">
336
+ <div style="opacity: 0.85; font-size: 11px; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px;">Speed</div>
337
+ <div style="font-size: 22px; font-weight: 700;">{tokens_per_sec:.1f} t/s</div>
338
+ </div>
339
+ <div style="background: rgba(255,255,255,0.15); padding: 14px; border-radius: 8px; backdrop-filter: blur(10px);">
340
+ <div style="opacity: 0.85; font-size: 11px; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px;">Tokens</div>
341
+ <div style="font-size: 22px; font-weight: 700;">{tokens_generated:,}</div>
342
+ </div>
343
+ </div>
344
+ </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  """
 
 
 
346
 
347
+ # Loop warning (if detected)
348
+ if loop_detected:
349
+ html += f"""
350
+ <div style="{card_base_style} border-left: 4px solid #ffc107; background: #fff3cd;">
351
+ <div style="color: #856404; font-weight: 600; display: flex; align-items: center; gap: 8px;">
352
+ <span style="font-size: 20px;">⚠️</span>
353
+ <span>Loop Detected - Generation stopped to prevent repetition</span>
354
+ </div>
355
+ </div>
356
+ """
357
 
358
+ # Thinking section (collapsed if exists)
 
359
  if thinking and len(thinking.strip()) > 0:
360
  thinking_escaped = thinking.replace('<', '&lt;').replace('>', '&gt;')
361
+ cursor_html = '<span class="cursor"></span>' if is_generating and not explanation and not code_blocks else ''
362
+ html += f"""
363
+ <details style="{card_base_style} border-left: 4px solid #8b5cf6;">
364
+ <summary style="cursor: pointer; font-weight: 600; font-size: 16px; color: #7c3aed; user-select: none; display: flex; align-items: center; gap: 10px; padding: 4px 0;">
365
  <span style="font-size: 20px;">🧠</span>
366
  <span>Reasoning Process</span>
367
+ <span style="margin-left: auto; font-size: 13px; color: #8b5cf6; font-weight: normal;">~{int(thinking_tokens_est):,} tokens β€’ Click to expand β–Ό</span>
368
  </summary>
369
  <div style="margin-top: 20px; padding-top: 20px; border-top: 2px solid #e9ecef; color: #495057; line-height: 1.8; white-space: pre-wrap; font-size: 14px; font-family: 'SF Mono', Monaco, 'Courier New', monospace;">
370
+ {thinking_escaped}{cursor_html}
371
  </div>
372
  </details>
373
  """
374
 
375
+ # Explanation section
 
 
376
  if explanation and len(explanation.strip()) > 10:
377
+ explanation_escaped = explanation.replace('<', '&lt;').replace('>', '&gt;')
378
+ cursor_html = '<span class="cursor"></span>' if is_generating and not code_blocks else ''
379
+ html += f"""
380
+ <div style="{card_base_style} border-left: 4px solid #10b981;">
381
+ <h3 style="margin: 0 0 16px 0; color: #10b981; font-size: 18px; font-weight: 600; display: flex; align-items: center; gap: 10px;">
382
+ <span style="font-size: 22px;">πŸ’‘</span>
 
 
 
 
 
 
 
 
 
383
  <span>Solution Explanation</span>
384
  </h3>
385
  <div style="color: #495057; line-height: 1.8; font-size: 15px; white-space: pre-wrap;">
386
+ {explanation_escaped}{cursor_html}
387
  </div>
388
  </div>
389
  """
390
 
391
+ # Code blocks
 
392
  if code_blocks and len(code_blocks) > 0:
393
  code_blocks_html = ""
394
  for idx, (lang, code) in enumerate(code_blocks):
395
  lang_display = lang if lang else "code"
396
+ code_id = f"code_{idx}_{int(time.time()*1000)}"
397
  code_clean = code.strip()
398
 
399
+ # Add cursor to last code block if generating
400
+ cursor_html = '<span class="cursor"></span>' if is_generating and idx == len(code_blocks) - 1 else ''
401
+
402
  code_blocks_html += f"""
403
  <div style="margin-bottom: 16px; background: #1e1e1e; border-radius: 10px; overflow: hidden; box-shadow: 0 2px 8px rgba(0,0,0,0.15);">
404
  <div style="background: #2d2d2d; padding: 12px 20px; color: #e0e0e0; font-weight: 600; font-size: 13px; display: flex; justify-content: space-between; align-items: center;">
 
407
  <span>{lang_display}</span>
408
  </span>
409
  <div style="display: flex; gap: 8px;">
410
+ <button onclick="navigator.clipboard.writeText(document.getElementById('{code_id}').textContent.replace('β–Œ', '')); this.textContent='βœ“ Copied'; setTimeout(() => this.textContent='πŸ“‹ Copy', 2000)"
411
  style="background: #28a745; color: white; border: none; padding: 7px 16px; border-radius: 6px; cursor: pointer; font-size: 12px; font-weight: 500; transition: all 0.2s;"
412
  onmouseover="if(this.textContent==='πŸ“‹ Copy') this.style.background='#218838'"
413
  onmouseout="if(this.textContent==='πŸ“‹ Copy') this.style.background='#28a745'">
414
  πŸ“‹ Copy
415
  </button>
416
+ <button onclick="downloadCode(document.getElementById('{code_id}').textContent.replace('β–Œ', ''), '{lang_display}')"
417
  style="background: #007bff; color: white; border: none; padding: 7px 16px; border-radius: 6px; cursor: pointer; font-size: 12px; font-weight: 500; transition: all 0.2s;"
418
  onmouseover="this.style.background='#0056b3'"
419
  onmouseout="this.style.background='#007bff'">
 
421
  </button>
422
  </div>
423
  </div>
424
+ <pre style="margin: 0; padding: 20px; color: #d4d4d4; overflow-x: auto; font-family: 'SF Mono', Monaco, 'Courier New', monospace; font-size: 14px; line-height: 1.6; background: #1e1e1e;"><code id="{code_id}">{code_clean}{cursor_html}</code></pre>
425
  </div>
426
  """
427
 
428
+ html += f"""
429
+ <div style="{card_base_style} border-left: 4px solid #6b7280;">
430
+ <h3 style="margin: 0 0 20px 0; color: #6b7280; font-size: 18px; font-weight: 600; display: flex; align-items: center; gap: 10px;">
431
+ <span style="font-size: 22px;">πŸ’»</span>
432
  <span>Implementation</span>
433
  </h3>
434
  {code_blocks_html}
 
457
  </script>
458
  """
459
 
460
+ html += "</div>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
  return html
462
 
463
def generate_solution_streaming(prompt, temperature=0.6, max_tokens=8192, max_thinking_tokens=4096):
    """Stream an HTML-rendered solution for *prompt*, yielding one snapshot per model update.

    Parameters mirror the Gradio controls: sampling temperature, overall token
    budget, and the cap on "thinking" tokens. Yields HTML fragments suitable
    for a gr.HTML output; emits a warning fragment and stops immediately when
    the prompt is blank.
    """
    if not prompt.strip():
        # Nothing to solve — show a user-facing warning instead of invoking the model.
        yield "<p style='color: #dc3545; font-size: 16px; padding: 20px;'>⚠️ Please enter a problem to solve.</p>"
        return

    # Clear any section-parser state left over from the previous run.
    parser.reset()

    token_stream = vibe_model.generate_response_streaming(
        prompt,
        temperature=temperature,
        max_new_tokens=max_tokens,
        max_thinking_tokens=max_thinking_tokens
    )
    for generated_text, stats, loop_detected in token_stream:
        if not stats:
            # A falsy stats payload signals an error; the text carries the message.
            yield f"<p style='color: #dc3545;'>Error: {generated_text}</p>"
        else:
            still_streaming = not stats.get('is_complete', False)
            yield format_streaming_html(generated_text, stats, loop_detected, still_streaming)
 
 
485
 
486
  # Create Gradio interface
487
  with gr.Blocks(
 
496
  🎯 **Best for**: Python algorithmic problems with clear input/output specifications
497
 
498
  ⚠️ **Note**: This model is specialized for competitive programming, not general software development
499
+
500
+ ✨ **Features**: Real-time streaming with intelligent section parsing and automatic loop detection
501
  """)
502
 
503
  with gr.Row():
 
528
  - Higher thinking tokens (4096-8192) for complex reasoning
529
  - Temperature 0.6 balances creativity and accuracy
530
 
531
+ **Real-time Features:**
532
+ - πŸ”„ Live token-by-token streaming
533
+ - 🧠 Intelligent section parsing (thinking/explanation/code)
534
+ - ⚠️ Automatic loop detection (stops if repetitive patterns detected)
535
+ - ⚑ Blinking cursors on actively streaming sections
536
+ - πŸ“Š Live statistics (time, speed, tokens)
537
+
538
+ **Loop Detection:**
539
+ - Monitors for 3-9 word phrases repeated 5+ times
540
+ - Automatically stops generation to save tokens
541
+ - Truncates at loop start position
542
  """)
543
 
544
  generate_btn = gr.Button("πŸš€ Generate Solution", variant="primary", size="lg")
 
548
  output_html = gr.HTML(label="Solution")
549
 
550
  generate_btn.click(
551
+ fn=generate_solution_streaming,
552
  inputs=[prompt_input, temperature_slider, max_tokens_slider, max_thinking_slider],
553
  outputs=output_html
554
  )