"""Gradio app: convert an image of a math equation into LaTeX.

Dispatches to either a hosted multimodal chat LLM (Gemini / OpenAI /
Claude, via the local ``llms.LLM`` wrapper) or a self-hosted HF OCR
backend reached over HTTP, based on the selected model name.
"""

import base64
import io
import logging
import os
import re
from typing import Union

import gradio as gr
import requests
from PIL import Image

from llms import LLM

# Endpoint of the self-hosted LaTeX-OCR inference service (Modal deployment).
BACKEND_API_ENDPOINT = os.getenv("BACKEND_API_ENDPOINT")
# Generous timeout: OCR backends may cold-start. Seconds.
REQUEST_TIMEOUT = 300

# Multimodal prompt template (LangChain style); kept for reference.
MULTIMODAL_PROMPT = (
    "You are an expert at recognizing mathematical equations from images. "
    "Given the following image (base64 encoded PNG), extract the LaTeX code for any equation present. "
    "If there is no equation, say 'No equation found.'\n"
    "Image (base64):\n{image_b64}"
)


def image_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG string."""
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def extract_latex(text: str) -> str:
    """Pull the LaTeX payload out of raw LLM output.

    Tries, in order: a ```latex fenced block, any fenced block, an
    inline ``$...$`` span, and finally the stripped text itself.
    """
    code_block = re.search(r"```latex\s*([\s\S]+?)```", text)
    if not code_block:
        code_block = re.search(r"```\s*([\s\S]+?)```", text)
    if code_block:
        return code_block.group(1).strip()
    dollar_block = re.search(r"\$(.+?)\$", text)
    if dollar_block:
        return dollar_block.group(1).strip()
    return text.strip()


def image_to_latex_llm(image: Image.Image, model_name: str) -> str:
    """OCR an equation image with a multimodal chat LLM.

    Returns the extracted LaTeX string, or a user-facing message when
    no image was provided.
    """
    if image is None:
        return "Please upload an image."
    # Reuse the shared helper instead of re-inlining the PNG/base64 encoding.
    img_b64 = image_to_base64(image)
    prompt = [
        {
            "type": "text",
            "text": (
                "You are an expert at recognizing mathematical equations from images. "
                "Given the following image, extract the LaTeX code for any equation present. "
                "If there is no equation, say 'No equation found.' "
                "Return only the LaTeX code, and wrap it in a markdown code block with the word 'latex', like this: ```latex ... ``` ."
            ),
        },
        {
            "type": "image_url",
            "image_url": f"data:image/png;base64,{img_b64}",
        },
    ]
    message = {"role": "user", "content": prompt}
    llm = LLM(model=model_name)
    response = llm.chat_model.invoke([message])
    print(f"[LOG] Raw LLM output: {response.content}")
    latex = extract_latex(response.content)
    print(f"[LOG] Extracted LaTeX: {latex}")
    return latex


def call_latex_ocr_inference(payload: dict) -> Union[dict, str]:
    """Send a POST request to the LaTeX-OCR Modal endpoint with *payload*.

    Returns the decoded JSON response on success; on any request
    failure, logs the error and returns an ``{"error": ...}`` dict
    instead of raising.
    """
    try:
        resp = requests.post(
            BACKEND_API_ENDPOINT,
            json=payload,
            timeout=REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        return resp.json()
    except requests.exceptions.RequestException as e:
        logging.error(f"Error calling API: {str(e)}")
        return {"error": str(e)}


def image_to_latex_hf(image: Image.Image, model_name: str) -> str:
    """OCR an equation image via the self-hosted HF backend."""
    if image is None:
        return "Please upload an image."
    payload = {
        "image_base64": image_to_base64(image),
        "text_input": "",
        "model_name": model_name,
    }
    response = call_latex_ocr_inference(payload)
    print(f"[LOG] Raw response: {response}")
    # The backend helper returns {"error": ...} on failure; surface it
    # to the UI instead of crashing with a KeyError on 'result'.
    if isinstance(response, dict) and "error" in response:
        return f"Error: {response['error']}"
    return response["result"]


def image_to_latex(image: Image.Image, model_name: str):
    """Dispatch to the LLM or HF OCR path based on *model_name*.

    Returns a (raw LaTeX, ``$$``-wrapped LaTeX) pair for the Textbox
    and Markdown outputs respectively.
    """
    print("model_name", model_name)
    if "gemini" in model_name or "openai" in model_name or "claude" in model_name:
        latex_equation = image_to_latex_llm(image, model_name)
    else:
        latex_equation = image_to_latex_hf(image, model_name)
    rendered_latex_equation = "$$" + latex_equation + "$$"
    return latex_equation, rendered_latex_equation


with gr.Blocks() as demo:
    gr.HTML("""
Image to LaTeX
Converter

Upload an image containing a math equation and get the LaTeX code instantly.
""")
    image_input = gr.Image(type="pil", label="Upload Image", elem_id="image-input")
    model = gr.Dropdown(
        [
            "gemini-2.0-flash",
            # "claude-3-5-sonnet",
            # "gpt-4o",
            "qwen/qwen2.5-vl-3b-instruct",
            "google/gemma-3-4b-it",
        ],
        value="gemini-2.0-flash",
        label="Model Options",
        elem_id="model-dropdown",
    )
    btn = gr.Button("Convert to LaTeX", elem_id="convert-button")
    output = gr.Textbox(label="LaTeX Equation", elem_id="output")
    rendered_output = gr.Markdown(label="Rendered Equation")
    btn.click(
        fn=image_to_latex,
        inputs=[image_input, model],
        outputs=[output, rendered_output],
    )
    gr.HTML("""
""")
    gr.HTML("""""")
    gr.Examples(
        examples=[
            ["examples/ex01.jpg"],
            ["examples/ex02.jpg"],
            ["examples/ex03.jpg"],
            ["examples/ex04.jpg"],
        ],
        inputs=[image_input],
        label="Try Examples",
        cache_examples=False,
        elem_id="examples",
    )


demo.launch(share=True)