# image2latex / app.py
# Author: Nam Fam — commit 09c99ba ("update files")
import gradio as gr
from PIL import Image
import base64
import io
import re
from llms import LLM
from typing import Union
# Helper: Convert PIL Image to base64 string
def image_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as the base64 string of its PNG bytes."""
    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        raw_bytes = png_buffer.getvalue()
    return base64.b64encode(raw_bytes).decode("utf-8")
# Helper: Extract LaTeX from LLM output
def extract_latex(text: str) -> str:
    """Extract a LaTeX snippet from raw LLM output.

    Tries, in order:
      1. A fenced code block tagged ``latex`` (```latex ... ```).
      2. Any fenced code block (``` ... ```).
      3. Display math ``$$...$$``, then inline math ``$...$``.
      4. Fallback: the whole text, stripped.

    Returns:
        The first match, stripped of surrounding whitespace.
    """
    # Fenced blocks first: the LLM prompt explicitly asks for ```latex ... ```.
    match = re.search(r"```latex\s*([\s\S]+?)```", text)
    if not match:
        match = re.search(r"```\s*([\s\S]+?)```", text)
    if match:
        return match.group(1).strip()
    # FIX: $$...$$ must be tried before $...$ — the lazy single-dollar pattern
    # would otherwise capture a spurious leading '$' from the '$$' delimiter.
    match = re.search(r"\$\$([\s\S]+?)\$\$", text)
    if not match:
        match = re.search(r"\$(.+?)\$", text)
    if match:
        return match.group(1).strip()
    # No delimiters found: assume the whole response is the LaTeX code.
    return text.strip()
# Multimodal prompt template (LangChain style).
# NOTE(review): not referenced anywhere else in this file — image_to_latex_llm
# builds its prompt inline. Confirm no external module imports this before
# removing it.
MULTIMODAL_PROMPT = (
    "You are an expert at recognizing mathematical equations from images. "
    "Given the following image (base64 encoded PNG), extract the LaTeX code for any equation present. "
    "If there is no equation, say 'No equation found.'\n"
    "Image (base64):\n{image_b64}"
)
def image_to_latex_llm(image: Image.Image, model_name: str):
    """Send the image to a multimodal chat model and return the extracted LaTeX.

    Returns a user-facing message string when no image was provided.
    """
    if image is None:
        return "Please upload an image."
    # Serialize the image to PNG and base64-encode it for the data URL below.
    png_stream = io.BytesIO()
    image.save(png_stream, format="PNG")
    encoded = base64.b64encode(png_stream.getvalue()).decode("utf-8")
    instruction = (
        "You are an expert at recognizing mathematical equations from images. "
        "Given the following image, extract the LaTeX code for any equation present. "
        "If there is no equation, say 'No equation found.' "
        "Return only the LaTeX code, and wrap it in a markdown code block with the word 'latex', like this: ```latex ... ``` ."
    )
    content = [
        {"type": "text", "text": instruction},
        {"type": "image_url", "image_url": f"data:image/png;base64,{encoded}"},
    ]
    llm = LLM(model=model_name)
    response = llm.chat_model.invoke([{"role": "user", "content": content}])
    print(f"[LOG] Raw LLM output: {response.content}")
    # Model replies in prose/markdown; pull out just the LaTeX.
    extracted = extract_latex(response.content)
    print(f"[LOG] Extracted LaTeX: {extracted}")
    return extracted
import requests
import logging
import os
# Backend endpoint for the hosted (non-LLM) OCR models; None when the env var
# is unset — consumers should guard against that before posting.
BACKEND_API_ENDPOINT = os.getenv("BACKEND_API_ENDPOINT")
# Request timeout in seconds; generous to allow backend model cold starts.
REQUEST_TIMEOUT = 300
def call_latex_ocr_inference(payload: dict) -> Union[dict, str]:
"""
Sends a POST request to the Latex OCR Modal endpoint with given payload.
"""
endpoint = BACKEND_API_ENDPOINT
try:
resp = requests.post(
endpoint,
json=payload,
timeout=REQUEST_TIMEOUT
)
resp.raise_for_status()
return resp.json()
except requests.exceptions.RequestException as e:
logging.error(f"Error calling API: {str(e)}")
return {"error": str(e)}
def image_to_latex_hf(image: Image.Image, model_name):
    """Convert an image to LaTeX via the hosted (HF) OCR backend.

    Args:
        image: Uploaded PIL image, or None when nothing was uploaded.
        model_name: Backend model identifier to run.

    Returns:
        The recognized LaTeX string, or a user-facing error message.
    """
    if image is None:
        return "Please upload an image."
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    payload = {
        "image_base64": img_b64,
        "text_input": "",
        "model_name": model_name,
    }
    response = call_latex_ocr_inference(payload)
    print(f"[LOG] Raw response: {response}")
    # FIX: the backend error path returns {"error": ...} with no "result" key;
    # surface the problem to the UI instead of raising KeyError.
    if not isinstance(response, dict) or "result" not in response:
        if isinstance(response, dict):
            detail = response.get("error", "unknown error")
        else:
            detail = str(response)
        return f"Backend error: {detail}"
    return response["result"]
def image_to_latex(image: Image.Image, model_name):
    """Dispatch to the right backend and return (raw LaTeX, rendered markdown)."""
    print("model_name", model_name)
    # Proprietary multimodal LLMs are called directly; every other model goes
    # through the hosted OCR backend.
    uses_llm = any(vendor in model_name for vendor in ("gemini", "openai", "claude"))
    handler = image_to_latex_llm if uses_llm else image_to_latex_hf
    latex_equation = handler(image, model_name)
    return latex_equation, "$$" + latex_equation + "$$"
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    # Page header. The trailing <div> (max-width column) is intentionally left
    # open here and closed by a gr.HTML block emitted after the controls.
    gr.HTML("""
    <div style='text-align: center; margin-top: 40px;'>
        <span style='font-size: 3em; font-family: serif; font-weight: 400;'>Image to LaTeX</span><br>
        <span style='font-size: 5em; font-family: serif; color: #1784d6; font-weight: 600;'>Converter</span><br><br>
        <span style='font-size: 1.2em; font-family: sans-serif;'>
            Upload an image containing a math equation and get the <b>LaTeX</b> code instantly.<br>
            <!-- Powered by <span style='color:#1784d6;'>AI</span> and LLMs. -->
        </span>
    </div>
    <!-- <hr style='margin: 32px 0 32px 0'> -->
    <div style='max-width:420px;margin:0 auto;'>
    """)
    image_input = gr.Image(type="pil", label="Upload Image", elem_id="image-input")
    model = gr.Dropdown(
        [
            "gemini-2.0-flash",
            # "claude-3-5-sonnet",
            # "gpt-4o",
            "qwen/qwen2.5-vl-3b-instruct",
            "google/gemma-3-4b-it",
        ],
        value="gemini-2.0-flash",
        label="Model Options",
        elem_id="model-dropdown",
    )
    btn = gr.Button("Convert to LaTeX", elem_id="convert-button")
    output = gr.Textbox(label="LaTeX Equation", elem_id="output")
    rendered_output = gr.Markdown(label="Rendered Equation")
    btn.click(
        fn=image_to_latex,
        inputs=[image_input, model],
        outputs=[output, rendered_output],
    )
    # Page CSS. FIX: removed a stray '}' that followed the .form rule (the rule
    # was already closed), which could confuse CSS error recovery.
    gr.HTML("""
    </div>
    <style>
        #image-input {
            max-width: 60% !important;
            margin: 10px auto !important;
        }
        #output {
            max-width: 100% !important;
            margin: 10px auto !important;
        }
        #convert-button {
            width: 160px !important;
            margin: 10px auto !important;
        }
        #examples {
            max-width: 600px !important;
            margin: 10px auto !important;
        }
        #examples img {
            height: 50px;
            width: auto;
            object-fit: contain;
            object-position: center;
        }
        .form {
            max-width: 60% !important;
            margin: 10px auto !important;
            background: transparent !important;
            box-shadow: none !important;
        }
    </style>
    """)
    # NOTE(review): this </div> appears redundant — the column div is already
    # closed by the CSS block above. Kept for behavior parity; confirm and drop.
    gr.HTML("""</div>""")
    gr.Examples(
        examples=[
            ["examples/ex01.jpg"],
            ["examples/ex02.jpg"],
            ["examples/ex03.jpg"],
            ["examples/ex04.jpg"],
        ],
        inputs=[image_input],
        label="Try Examples",
        cache_examples=False,
        elem_id="examples",
    )

# share=True exposes a public Gradio tunnel URL in addition to localhost.
demo.launch(share=True)
# def main():
# demo.launch()
# if __name__ == "__main__":
# main()