# image2latex / app.py
# Author: Nam Fam — commit 09c99ba ("update files")
import gradio as gr
from PIL import Image
import base64
import io
import re
from llms import LLM
from typing import Union
# Helper: Convert PIL Image to base64 string
def image_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as the base64 string of its PNG bytes."""
    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        raw_bytes = png_buffer.getvalue()
    return base64.b64encode(raw_bytes).decode("utf-8")
# Helper: Extract LaTeX from LLM output
def extract_latex(text: str) -> str:
    """Extract a LaTeX snippet from raw LLM output.

    Tries, in order:
      1. A fenced code block tagged ``latex`` (```latex ... ```).
      2. Any fenced code block (``` ... ```).
      3. Display math ``$$...$$``, then inline math ``$...$``.
      4. Fallback: the whole text, stripped.

    Returns:
        The first match, stripped of surrounding whitespace.
    """
    # Fenced blocks first: the LLM prompt explicitly asks for ```latex ... ```.
    match = re.search(r"```latex\s*([\s\S]+?)```", text)
    if not match:
        match = re.search(r"```\s*([\s\S]+?)```", text)
    if match:
        return match.group(1).strip()
    # FIX: $$...$$ must be tried before $...$ — the lazy single-dollar pattern
    # would otherwise capture a spurious leading '$' from the '$$' delimiter.
    match = re.search(r"\$\$([\s\S]+?)\$\$", text)
    if not match:
        match = re.search(r"\$(.+?)\$", text)
    if match:
        return match.group(1).strip()
    # No delimiters found: assume the whole response is the LaTeX code.
    return text.strip()
# Multimodal prompt template (LangChain style).
# NOTE(review): not referenced anywhere else in this file — image_to_latex_llm
# builds its prompt inline. Confirm no external module imports this before
# removing it.
MULTIMODAL_PROMPT = (
    "You are an expert at recognizing mathematical equations from images. "
    "Given the following image (base64 encoded PNG), extract the LaTeX code for any equation present. "
    "If there is no equation, say 'No equation found.'\n"
    "Image (base64):\n{image_b64}"
)
def image_to_latex_llm(image: Image.Image, model_name: str):
    """Send the image to a multimodal chat model and return the extracted LaTeX.

    Returns a user-facing message string when no image was provided.
    """
    if image is None:
        return "Please upload an image."
    # Serialize the image to PNG and base64-encode it for the data URL below.
    png_stream = io.BytesIO()
    image.save(png_stream, format="PNG")
    encoded = base64.b64encode(png_stream.getvalue()).decode("utf-8")
    instruction = (
        "You are an expert at recognizing mathematical equations from images. "
        "Given the following image, extract the LaTeX code for any equation present. "
        "If there is no equation, say 'No equation found.' "
        "Return only the LaTeX code, and wrap it in a markdown code block with the word 'latex', like this: ```latex ... ``` ."
    )
    content = [
        {"type": "text", "text": instruction},
        {"type": "image_url", "image_url": f"data:image/png;base64,{encoded}"},
    ]
    llm = LLM(model=model_name)
    response = llm.chat_model.invoke([{"role": "user", "content": content}])
    print(f"[LOG] Raw LLM output: {response.content}")
    # Model replies in prose/markdown; pull out just the LaTeX.
    extracted = extract_latex(response.content)
    print(f"[LOG] Extracted LaTeX: {extracted}")
    return extracted
import requests
import logging
import os
# Backend endpoint for the hosted (non-LLM) OCR models; None when the env var
# is unset — consumers should guard against that before posting.
BACKEND_API_ENDPOINT = os.getenv("BACKEND_API_ENDPOINT")
# Request timeout in seconds; generous to allow backend model cold starts.
REQUEST_TIMEOUT = 300
def call_latex_ocr_inference(payload: dict) -> Union[dict, str]:
"""
Sends a POST request to the Latex OCR Modal endpoint with given payload.
"""
endpoint = BACKEND_API_ENDPOINT
try:
resp = requests.post(
endpoint,
json=payload,
timeout=REQUEST_TIMEOUT
)
resp.raise_for_status()
return resp.json()
except requests.exceptions.RequestException as e:
logging.error(f"Error calling API: {str(e)}")
return {"error": str(e)}
def image_to_latex_hf(image: Image.Image, model_name):
    """Convert an image to LaTeX via the hosted (HF) OCR backend.

    Args:
        image: Uploaded PIL image, or None when nothing was uploaded.
        model_name: Backend model identifier to run.

    Returns:
        The recognized LaTeX string, or a user-facing error message.
    """
    if image is None:
        return "Please upload an image."
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    payload = {
        "image_base64": img_b64,
        "text_input": "",
        "model_name": model_name,
    }
    response = call_latex_ocr_inference(payload)
    print(f"[LOG] Raw response: {response}")
    # FIX: the backend error path returns {"error": ...} with no "result" key;
    # surface the problem to the UI instead of raising KeyError.
    if not isinstance(response, dict) or "result" not in response:
        if isinstance(response, dict):
            detail = response.get("error", "unknown error")
        else:
            detail = str(response)
        return f"Backend error: {detail}"
    return response["result"]
def image_to_latex(image: Image.Image, model_name):
    """Dispatch to the right backend and return (raw LaTeX, rendered markdown)."""
    print("model_name", model_name)
    # Proprietary multimodal LLMs are called directly; every other model goes
    # through the hosted OCR backend.
    uses_llm = any(vendor in model_name for vendor in ("gemini", "openai", "claude"))
    handler = image_to_latex_llm if uses_llm else image_to_latex_hf
    latex_equation = handler(image, model_name)
    return latex_equation, "$$" + latex_equation + "$$"
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    # Page header. The trailing <div> (max-width column) is intentionally left
    # open here and closed by a gr.HTML block emitted after the controls.
    gr.HTML("""
    <div style='text-align: center; margin-top: 40px;'>
        <span style='font-size: 3em; font-family: serif; font-weight: 400;'>Image to LaTeX</span><br>
        <span style='font-size: 5em; font-family: serif; color: #1784d6; font-weight: 600;'>Converter</span><br><br>
        <span style='font-size: 1.2em; font-family: sans-serif;'>
            Upload an image containing a math equation and get the <b>LaTeX</b> code instantly.<br>
            <!-- Powered by <span style='color:#1784d6;'>AI</span> and LLMs. -->
        </span>
    </div>
    <!-- <hr style='margin: 32px 0 32px 0'> -->
    <div style='max-width:420px;margin:0 auto;'>
    """)
    image_input = gr.Image(type="pil", label="Upload Image", elem_id="image-input")
    model = gr.Dropdown(
        [
            "gemini-2.0-flash",
            # "claude-3-5-sonnet",
            # "gpt-4o",
            "qwen/qwen2.5-vl-3b-instruct",
            "google/gemma-3-4b-it",
        ],
        value="gemini-2.0-flash",
        label="Model Options",
        elem_id="model-dropdown",
    )
    btn = gr.Button("Convert to LaTeX", elem_id="convert-button")
    output = gr.Textbox(label="LaTeX Equation", elem_id="output")
    rendered_output = gr.Markdown(label="Rendered Equation")
    btn.click(
        fn=image_to_latex,
        inputs=[image_input, model],
        outputs=[output, rendered_output],
    )
    # Page CSS. FIX: removed a stray '}' that followed the .form rule (the rule
    # was already closed), which could confuse CSS error recovery.
    gr.HTML("""
    </div>
    <style>
        #image-input {
            max-width: 60% !important;
            margin: 10px auto !important;
        }
        #output {
            max-width: 100% !important;
            margin: 10px auto !important;
        }
        #convert-button {
            width: 160px !important;
            margin: 10px auto !important;
        }
        #examples {
            max-width: 600px !important;
            margin: 10px auto !important;
        }
        #examples img {
            height: 50px;
            width: auto;
            object-fit: contain;
            object-position: center;
        }
        .form {
            max-width: 60% !important;
            margin: 10px auto !important;
            background: transparent !important;
            box-shadow: none !important;
        }
    </style>
    """)
    # NOTE(review): this </div> appears redundant — the column div is already
    # closed by the CSS block above. Kept for behavior parity; confirm and drop.
    gr.HTML("""</div>""")
    gr.Examples(
        examples=[
            ["examples/ex01.jpg"],
            ["examples/ex02.jpg"],
            ["examples/ex03.jpg"],
            ["examples/ex04.jpg"],
        ],
        inputs=[image_input],
        label="Try Examples",
        cache_examples=False,
        elem_id="examples",
    )

# share=True exposes a public Gradio tunnel URL in addition to localhost.
demo.launch(share=True)
# def main():
# demo.launch()
# if __name__ == "__main__":
# main()