Vladyslav Humennyy Claude committed on
Commit
a113c8a
·
1 Parent(s): 9203469

Rewrite image handling to match app_chat_vllm.py format

Browse files

- User function now converts images to base64 with image_url format
- Removed complex unused helper functions for message processing
- Bot function properly handles base64 images with processor
- Converts base64 back to PIL images when using processor
- Falls back to tokenizer for text-only messages
- Simplified and cleaner implementation matching app_chat_vllm.py

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

Files changed (1) hide show
  1. app.py +112 -258
app.py CHANGED
@@ -60,44 +60,40 @@ def load_model():
60
  model, tokenizer, processor, device = load_model()
61
 
62
 
63
- def _ensure_image_path(image_data: Any) -> str | None:
64
- """Return a valid file path for the provided image data."""
65
- if image_data is None:
66
- return None
67
-
68
- try:
69
- from PIL import Image
70
- except ImportError: # pragma: no cover - PIL is bundled with Gradio's image component
71
- return None
72
-
73
- # Already a path string
74
- if isinstance(image_data, str) and os.path.exists(image_data):
75
- return image_data
76
-
77
- # PIL Image object - save to temp file
78
- if isinstance(image_data, Image.Image):
79
- fd, tmp_path = tempfile.mkstemp(suffix=".png")
80
- os.close(fd)
81
- image_data.save(tmp_path, format="PNG")
82
- return tmp_path
83
-
84
- return None
85
-
86
-
87
  def user(user_message, image_data, history: list):
88
- user_message = user_message or ""
 
 
 
89
 
 
90
  updated_history = list(history)
91
  has_content = False
92
 
93
  stripped_message = user_message.strip()
94
- if stripped_message:
95
- updated_history.append({"role": "user", "content": stripped_message})
96
- has_content = True
97
 
98
- image_path = _ensure_image_path(image_data)
99
- if image_path is not None:
100
- updated_history.append({"role": "user", "content": {"path": image_path, "alt_text": "User uploaded image"}})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  has_content = True
102
 
103
  if not has_content:
@@ -117,257 +113,116 @@ def append_example_message(x: gr.SelectData, history):
117
  return history
118
 
119
 
120
- def _message_contains_image(message: dict[str, Any]) -> bool:
121
- content = message.get("content")
122
- if isinstance(content, dict):
123
- if "path" in content or "image" in content:
124
- return True
125
- if content.get("type") in {"image", "image_url"}:
126
- return True
127
  if isinstance(content, list):
 
128
  for item in content:
129
- if isinstance(item, dict) and item.get("type") in {"image", "image_url"}:
130
- return True
131
- return False
132
-
133
-
134
- def _content_to_text(content: Any) -> str:
135
- if isinstance(content, dict):
136
- if "text" in content:
137
- return content.get("text", "")
138
- if "path" in content:
139
- alt_text = content.get("alt_text")
140
- placeholder = alt_text or os.path.basename(content["path"]) or "image"
141
- return f"[image: {placeholder}]"
142
- if "image" in content:
143
- return "[image]"
144
- if content.get("type") == "image_url":
145
- image_url = content.get("image_url")
146
- if isinstance(image_url, dict):
147
- image_url = image_url.get("url", "")
148
- return f"[image: {image_url}]"
149
- if content.get("type") == "text":
150
- return content.get("text", "")
151
- return str(content)
152
- if isinstance(content, list):
153
- text_parts: list[str] = []
154
- for item in content:
155
- if isinstance(item, dict):
156
- item_type = item.get("type")
157
- if item_type == "text":
158
- text_parts.append(item.get("text", ""))
159
- elif item_type == "image":
160
- text_parts.append("[image]")
161
- elif item_type == "image_url":
162
- image_url = item.get("image_url")
163
- if isinstance(image_url, dict):
164
- image_url = image_url.get("url", "")
165
- text_parts.append(f"[image: {image_url}]")
166
- else:
167
- text_parts.append(str(item))
168
- else:
169
- text_parts.append(str(item))
170
- filtered = [part for part in text_parts if part]
171
- return "\n".join(filtered) if filtered else "[image]"
172
  return str(content)
173
 
174
 
175
- def _collect_recent_user_contents(history: list[dict[str, Any]]) -> list[Any]:
176
- """Collect the trailing sequence of user messages prior to the assistant reply."""
177
- chunks: list[Any] = []
178
- for message in reversed(history):
179
- if message.get("role") != "user":
180
- break
181
- chunks.append(message.get("content"))
182
- chunks.reverse()
183
- return chunks
184
-
185
-
186
- def _prepare_text_history(history: list[dict[str, Any]]) -> list[dict[str, str]]:
187
- text_history: list[dict[str, str]] = []
188
- for message in history:
189
- role = message.get("role", "user")
190
- content_text = _content_to_text(message.get("content"))
191
- if not content_text:
192
- continue
193
- if text_history and text_history[-1]["role"] == role:
194
- text_history[-1]["content"] = text_history[-1]["content"] + "\n" + content_text
195
- else:
196
- text_history.append({"role": role, "content": content_text})
197
- return text_history
198
-
199
-
200
- def _prepare_processor_history(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
201
- """Prepare history for processor with proper image format."""
202
- processor_history = []
203
-
204
- for message in history:
205
- role = message.get("role", "user")
206
- content = message.get("content")
207
-
208
- # Handle different content formats
209
- if isinstance(content, str):
210
- # Simple text message
211
- processor_history.append({"role": role, "content": content})
212
- elif isinstance(content, list):
213
- # Multi-modal content (text + images)
214
- formatted_content = []
215
- for item in content:
216
- if isinstance(item, dict):
217
- item_type = item.get("type")
218
- if item_type == "text":
219
- formatted_content.append({"type": "text", "text": item.get("text", "")})
220
- elif item_type == "image":
221
- # Extract PIL Image from _pil_image field or load from path
222
- pil_image = item.get("_pil_image")
223
- if pil_image is None and "path" in item:
224
- from PIL import Image
225
- pil_image = Image.open(item["path"])
226
- if pil_image is not None:
227
- formatted_content.append({"type": "image", "image": pil_image})
228
- if formatted_content:
229
- processor_history.append({"role": role, "content": formatted_content})
230
- elif isinstance(content, dict):
231
- # Legacy format or single image
232
- if "image" in content or "_pil_image" in content:
233
- pil_image = content.get("_pil_image") or content.get("image")
234
- if pil_image is None and "path" in content:
235
- from PIL import Image
236
- pil_image = Image.open(content["path"])
237
- if pil_image is not None:
238
- processor_history.append({
239
- "role": role,
240
- "content": [{"type": "image", "image": pil_image}]
241
- })
242
- else:
243
- # Try to extract text
244
- text = _content_to_text(content)
245
- if text:
246
- processor_history.append({"role": role, "content": text})
247
-
248
- return processor_history
249
-
250
-
251
- def _clean_history_for_display(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
252
- """Remove internal metadata fields like _pil_image before displaying in Gradio."""
253
- cleaned = []
254
-
255
- for message in history:
256
- cleaned_message = {"role": message.get("role", "user")}
257
- content = message.get("content")
258
-
259
- if isinstance(content, str):
260
- cleaned_message["content"] = content
261
- elif isinstance(content, list):
262
- cleaned_content = []
263
- for item in content:
264
- if isinstance(item, dict):
265
- # Remove _pil_image and ensure alt_text is string or absent
266
- cleaned_item = {}
267
- for k, v in item.items():
268
- if k == "_pil_image":
269
- continue
270
- if k == "alt_text":
271
- # Ensure alt_text is a string
272
- if isinstance(v, str):
273
- cleaned_item[k] = v
274
- # Skip non-string alt_text values
275
- continue
276
- cleaned_item[k] = v
277
- # Ensure alt_text exists for image type
278
- if cleaned_item.get("type") == "image" and "alt_text" not in cleaned_item:
279
- cleaned_item["alt_text"] = "uploaded image"
280
- cleaned_content.append(cleaned_item)
281
- else:
282
- cleaned_content.append(item)
283
- cleaned_message["content"] = cleaned_content
284
- elif isinstance(content, dict):
285
- # Remove _pil_image and ensure alt_text is string or absent
286
- cleaned_item = {}
287
- for k, v in content.items():
288
- if k == "_pil_image":
289
- continue
290
- if k == "alt_text":
291
- # Ensure alt_text is a string
292
- if isinstance(v, str):
293
- cleaned_item[k] = v
294
- # Skip non-string alt_text values
295
- continue
296
- cleaned_item[k] = v
297
- # Ensure alt_text exists for image content
298
- if "path" in cleaned_item and "alt_text" not in cleaned_item:
299
- cleaned_item["alt_text"] = "uploaded image"
300
- cleaned_message["content"] = cleaned_item
301
- else:
302
- cleaned_message["content"] = content
303
-
304
- cleaned.append(cleaned_message)
305
-
306
- return cleaned
307
-
308
-
309
  @spaces.GPU
310
  def bot(
311
  history: list[dict[str, Any]]
312
- # max_tokens,
313
- # temperature,
314
- # top_p,
315
  ):
316
- user_chunks = _collect_recent_user_contents(history)
317
- if not user_chunks:
318
- user_message_text = ""
319
- else:
320
- user_message_text = "\n".join(filter(None, (_content_to_text(chunk) for chunk in user_chunks)))
321
- print('User message:', user_message_text)
322
- # [{"role": "system", "content": system_message}] +
323
- # Build conversation
324
  max_tokens = 4096
325
  temperature = 0.7
326
  top_p = 0.95
327
 
328
- text_history = _prepare_text_history(history)
329
-
330
- # Handle empty history case
331
- if not text_history:
332
- input_text = ""
333
- else:
334
- input_text: str = tokenizer.apply_chat_template(
335
- text_history,
336
- tokenize=False,
337
- add_generation_prompt=True,
338
- # enable_thinking=True,
339
- )
340
-
341
- if input_text and tokenizer.bos_token:
342
- input_text = input_text.replace(tokenizer.bos_token, "", 1)
343
- print(input_text)
344
- model_inputs = None
345
-
346
  # Early return if no input
347
- if not input_text and not any(_message_contains_image(msg) for msg in history):
348
  return
349
 
350
- if processor is not None and any(_message_contains_image(msg) for msg in history):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  try:
352
- processor_history = _prepare_processor_history(history)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  model_inputs = processor(
354
  messages=processor_history,
355
  return_tensors="pt",
356
  add_generation_prompt=True,
357
  ).to(model.device)
358
- except Exception as exc: # pragma: no cover - diagnostic logging
359
- print(f"Processor failed, using tokenizer pipeline instead: {exc}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
  if model_inputs is None:
362
- model_inputs = tokenizer(input_text, return_tensors="pt").to(model.device) # .to(device)
363
 
364
- decoded_input = tokenizer.decode(model_inputs["input_ids"][0])
365
- print("Decoded input:", decoded_input)
366
- print([{int(token_id.item()): tokenizer.decode([int(token_id.item())])} for token_id in model_inputs["input_ids"][0]])
367
  # Streamer setup
368
- streamer = TextIteratorStreamer(
369
- tokenizer, skip_prompt=True # skip_special_tokens=True # ,
370
- )
371
 
372
  # Run model.generate in background thread
373
  generation_kwargs = dict(
@@ -377,7 +232,6 @@ def bot(
377
  top_p=top_p,
378
  top_k=64,
379
  do_sample=True,
380
- # eos_token_id=tokenizer.eos_token_id,
381
  streamer=streamer,
382
  )
383
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
@@ -387,7 +241,7 @@ def bot(
387
  # Yield tokens as they come in
388
  for new_text in streamer:
389
  history[-1]["content"] += new_text
390
- yield _clean_history_for_display(history)
391
 
392
  assistant_message = history[-1]["content"]
393
  logger.log_interaction(user=user_message_text, answer=assistant_message)
 
60
  model, tokenizer, processor, device = load_model()
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def user(user_message, image_data, history: list):
64
+ """Format user message with optional image (like app_chat_vllm.py)."""
65
+ import base64
66
+ import io
67
+ from PIL import Image
68
 
69
+ user_message = user_message or ""
70
  updated_history = list(history)
71
  has_content = False
72
 
73
  stripped_message = user_message.strip()
 
 
 
74
 
75
+ # Format message with image in base64 format (matching app_chat_vllm.py)
76
+ if image_data is not None:
77
+ # Convert PIL image to base64
78
+ buffered = io.BytesIO()
79
+ image_data.save(buffered, format="JPEG")
80
+ img_base64 = base64.b64encode(buffered.getvalue()).decode()
81
+
82
+ text_content = stripped_message if stripped_message else "Describe this image"
83
+
84
+ updated_history.append({
85
+ "role": "user",
86
+ "content": [
87
+ {"type": "text", "text": text_content},
88
+ {
89
+ "type": "image_url",
90
+ "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
91
+ },
92
+ ],
93
+ })
94
+ has_content = True
95
+ elif stripped_message:
96
+ updated_history.append({"role": "user", "content": stripped_message})
97
  has_content = True
98
 
99
  if not has_content:
 
113
  return history
114
 
115
 
116
+ def _extract_text_from_content(content: Any) -> str:
117
+ """Extract text from message content for logging."""
118
+ if isinstance(content, str):
119
+ return content
 
 
 
120
  if isinstance(content, list):
121
+ text_parts = []
122
  for item in content:
123
+ if isinstance(item, dict) and item.get("type") == "text":
124
+ text_parts.append(item.get("text", ""))
125
+ return " ".join(text_parts) if text_parts else "[Image]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  return str(content)
127
 
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  @spaces.GPU
130
  def bot(
131
  history: list[dict[str, Any]]
 
 
 
132
  ):
133
+ """Generate bot response with support for text and images."""
 
 
 
 
 
 
 
134
  max_tokens = 4096
135
  temperature = 0.7
136
  top_p = 0.95
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  # Early return if no input
139
+ if not history:
140
  return
141
 
142
+ # Extract last user message for logging
143
+ last_user_msg = next((msg for msg in reversed(history) if msg.get("role") == "user"), None)
144
+ user_message_text = _extract_text_from_content(last_user_msg.get("content")) if last_user_msg else ""
145
+ print('User message:', user_message_text)
146
+
147
+ # Check if any message contains images
148
+ has_images = any(
149
+ isinstance(msg.get("content"), list) and
150
+ any(item.get("type") == "image_url" for item in msg.get("content") if isinstance(item, dict))
151
+ for msg in history
152
+ )
153
+
154
+ model_inputs = None
155
+
156
+ # Use processor if images are present
157
+ if processor is not None and has_images:
158
  try:
159
+ # Processor expects messages with PIL images, not base64
160
+ # We need to convert base64 back to PIL for the processor
161
+ from PIL import Image
162
+ import base64
163
+ import io
164
+
165
+ processor_history = []
166
+ for msg in history:
167
+ role = msg.get("role", "user")
168
+ content = msg.get("content")
169
+
170
+ if isinstance(content, str):
171
+ processor_history.append({"role": role, "content": content})
172
+ elif isinstance(content, list):
173
+ formatted_content = []
174
+ for item in content:
175
+ if isinstance(item, dict):
176
+ if item.get("type") == "text":
177
+ formatted_content.append({"type": "text", "text": item.get("text", "")})
178
+ elif item.get("type") == "image_url":
179
+ # Extract base64 and convert to PIL
180
+ img_url = item.get("image_url", {}).get("url", "")
181
+ if img_url.startswith("data:image"):
182
+ base64_data = img_url.split(",")[1]
183
+ img_data = base64.b64decode(base64_data)
184
+ pil_image = Image.open(io.BytesIO(img_data))
185
+ formatted_content.append({"type": "image", "image": pil_image})
186
+ if formatted_content:
187
+ processor_history.append({"role": role, "content": formatted_content})
188
+
189
  model_inputs = processor(
190
  messages=processor_history,
191
  return_tensors="pt",
192
  add_generation_prompt=True,
193
  ).to(model.device)
194
+ print("Using processor for vision input")
195
+ except Exception as exc:
196
+ print(f"Processor failed: {exc}")
197
+ model_inputs = None
198
+
199
+ # Fallback to tokenizer for text-only
200
+ if model_inputs is None:
201
+ # Convert to text-only format for tokenizer
202
+ text_history = []
203
+ for msg in history:
204
+ role = msg.get("role", "user")
205
+ content = msg.get("content")
206
+ text_content = _extract_text_from_content(content)
207
+ if text_content:
208
+ text_history.append({"role": role, "content": text_content})
209
+
210
+ if text_history:
211
+ input_text = tokenizer.apply_chat_template(
212
+ text_history,
213
+ tokenize=False,
214
+ add_generation_prompt=True,
215
+ )
216
+ if input_text and tokenizer.bos_token:
217
+ input_text = input_text.replace(tokenizer.bos_token, "", 1)
218
+ model_inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
219
+ print("Using tokenizer for text-only input")
220
 
221
  if model_inputs is None:
222
+ return
223
 
 
 
 
224
  # Streamer setup
225
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
 
 
226
 
227
  # Run model.generate in background thread
228
  generation_kwargs = dict(
 
232
  top_p=top_p,
233
  top_k=64,
234
  do_sample=True,
 
235
  streamer=streamer,
236
  )
237
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
 
241
  # Yield tokens as they come in
242
  for new_text in streamer:
243
  history[-1]["content"] += new_text
244
+ yield history
245
 
246
  assistant_message = history[-1]["content"]
247
  logger.log_interaction(user=user_message_text, answer=assistant_message)