LightOnOCR

Paused

App Files Files Community

IFMedTechdemo committed 19 days ago

Commit

e22346a

verified ·

1 Parent(s): 5ea9a7f

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -215

app.py CHANGED Viewed

@@ -1,120 +1,21 @@
 #################################################################################################
-import subprocess
-import sys
 import spaces
-import torch
 import gradio as gr
 from PIL import Image
 import numpy as np
 import cv2
-import pypdfium2 as pdfium
-from transformers import (
-    LightOnOCRForConditionalGeneration,
-    LightOnOCRProcessor,
-)
-from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 import re
-device = "cuda" if torch.cuda.is_available() else "cpu"
-if device == "cuda":
-    attn_implementation = "sdpa"
-    dtype = torch.bfloat16
-else:
-    attn_implementation = "eager"
-    dtype = torch.float32
-ocr_model = LightOnOCRForConditionalGeneration.from_pretrained(
-    "lightonai/LightOnOCR-1B-1025",
-    attn_implementation=attn_implementation,
-    torch_dtype=dtype,
-    trust_remote_code=True,
-).to(device).eval()
-processor = LightOnOCRProcessor.from_pretrained(
-    "lightonai/LightOnOCR-1B-1025",
-    trust_remote_code=True,
-)
-ner_tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
-ner_model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
-ner_pipeline = pipeline(
-    "ner",
-    model=ner_model,
-    tokenizer=ner_tokenizer,
-    aggregation_strategy="simple",
-)
-def render_pdf_page(page, max_resolution=1540, scale=2.77):
-    width, height = page.get_size()
-    pixel_width = width * scale
-    pixel_height = height * scale
-    resize_factor = min(1, max_resolution / pixel_width, max_resolution / pixel_height)
-    target_scale = scale * resize_factor
-    return page.render(scale=target_scale, rev_byteorder=True).to_pil()
-def process_pdf(pdf_path, page_num=1):
-    pdf = pdfium.PdfDocument(pdf_path)
-    total_pages = len(pdf)
-    page_idx = min(max(int(page_num) - 1, 0), total_pages - 1)
-    page = pdf[page_idx]
-    img = render_pdf_page(page)
-    pdf.close()
-    return img, total_pages, page_idx + 1
-def clean_output_text(text):
-    markers_to_remove = ["system", "user", "assistant"]
-    lines = text.split('\n')
-    cleaned_lines = []
-    for line in lines:
-        stripped = line.strip()
-        if stripped.lower() not in markers_to_remove:
-            cleaned_lines.append(line)
-    cleaned = '\n'.join(cleaned_lines).strip()
-    if "assistant" in text.lower():
-        parts = text.split("assistant", 1)
-        if len(parts) > 1:
-            cleaned = parts[1].strip()
-    return cleaned
-def preprocess_image_for_ocr(image):
-    image_rgb = image.convert("RGB")
-    img_np = np.array(image_rgb)
-    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
-    adaptive_threshold = cv2.adaptiveThreshold(
-        gray,
-        255,
-        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-        cv2.THRESH_BINARY,
-        85,
-        35,
-    )
-    preprocessed_pil = Image.fromarray(adaptive_threshold)
-    return preprocessed_pil
 def extract_medication_lines(text):
-    """
-    Extracts medication/drug lines from text using regex.
-    Matches lines beginning with tab, tablet, cap, capsule, syrup, syp, oral, inj, injection, ointment, drops, patch, sol, solution, etc.
-    Handles case-insensitivity and abbreviations like T., C., tab., cap. etc.
-    """
-    # "|" means OR. (?:...) is a non-capturing group.
-    pattern = r"""^\s*    # Leading spaces allowed
-    (
-        T\.?|TAB\.?|TABLET      # T., T, TAB, TAB., TABLET
-        |C\.?|CAP\.?|CAPSULE    # C., C, CAP, CAP., CAPSULE
         |SYRUP|SYP
         |ORAL
-        |INJ\.?|INJECTION       # INJ., INJ, INJECTION
         |OINTMENT|DROPS|PATCH|SOL\.?|SOLUTION
-    )
-    \s+[A-Z0-9 \-\(\)/,.]+      # Name/dose/other info (at least one space/letter after the pattern)
-    """
-    # Compile with re.IGNORECASE and re.VERBOSE for readability
     med_regex = re.compile(pattern, re.IGNORECASE | re.VERBOSE)
     meds = []
     for line in text.split('\n'):
@@ -123,30 +24,51 @@ def extract_medication_lines(text):
             meds.append(line)
     return '\n'.join(meds)
-def extract_meds(text, use_ner):
-    """
-    Switches between Clinical NER or regex extraction.
-    Returns medications string.
-    """
     if use_ner:
-        entities = ner_pipeline(text)
-        meds = []
-        for ent in entities:
-            if ent["entity_group"] == "treatment":
-                word = ent["word"]
-                if word.startswith("##") and meds:
-                    meds[-1] += word[2:]
-                else:
-                    meds.append(word)
-        return ", ".join(set(meds)) if meds else "None detected"
-    else:
-        return extract_medication_lines(text) or "None detected"
-@spaces.GPU
-def extract_text_from_image(image, temperature=0.2):
-    """OCR with adaptive thresholding."""
     processed_img = preprocess_image_for_ocr(image)
     chat = [
         {
             "role": "user",
@@ -162,15 +84,13 @@ def extract_text_from_image(image, temperature=0.2):
         return_dict=True,
         return_tensors="pt",
     )
-    # Move inputs to device
     inputs = {
-        k: (
-            v.to(device=device, dtype=dtype)
             if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
             else v.to(device)
             if isinstance(v, torch.Tensor)
-            else v
-        )
         for k, v in inputs.items()
     }
     generation_kwargs = dict(
@@ -182,68 +102,40 @@ def extract_text_from_image(image, temperature=0.2):
     )
     with torch.no_grad():
         outputs = ocr_model.generate(**generation_kwargs)
     output_text = processor.decode(outputs[0], skip_special_tokens=True)
-    cleaned_text = clean_output_text(output_text)
-    yield cleaned_text, output_text, processed_img
 def process_input(file_input, temperature, page_num, extraction_mode):
     if file_input is None:
-        yield "Please upload an image or PDF first.", "", "", "", "No file!", 1
         return
-    image_to_process = None
-    page_info = ""
-    slider_value = page_num
-    file_path = file_input if isinstance(file_input, str) else file_input.name
-    if file_path.lower().endswith(".pdf"):
-        try:
-            image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
-            page_info = f"Processing page {actual_page} of {total_pages}"
-            slider_value = actual_page
-        except Exception as e:
-            msg = f"Error processing PDF: {str(e)}"
-            yield msg, "", msg, "", None, slider_value
-            return
-    else:
-        try:
-            image_to_process = Image.open(file_path)
-            page_info = "Processing image"
-        except Exception as e:
-            msg = f"Error opening image: {str(e)}"
-            yield msg, "", msg, "", None, slider_value
-            return
-    use_ner = extraction_mode == "Regex"   #"Clinical NER"
-    try:
-        for cleaned_text, raw_md, processed_img in extract_text_from_image(
-            image_to_process, temperature
-        ):
-            meds_out = extract_meds(cleaned_text, use_ner)
-            yield cleaned_text, meds_out, raw_md, page_info, processed_img, slider_value
-    except Exception as e:
-        error_msg = f"Error during text extraction: {str(e)}"
-        yield error_msg, "", error_msg, page_info, image_to_process, slider_value
-def update_slider(file_input):
-    if file_input is None:
-        return gr.update(maximum=20, value=1)
-    file_path = file_input if isinstance(file_input, str) else file_input.name
-    if file_path.lower().endswith('.pdf'):
-        try:
-            pdf = pdfium.PdfDocument(file_path)
-            total_pages = len(pdf)
-            pdf.close()
-            return gr.update(maximum=total_pages, value=1)
-        except:
-            return gr.update(maximum=20, value=1)
-    else:
-        return gr.update(maximum=1, value=1)
 with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo:
     file_input = gr.File(
-        label="🖼️ Upload Image or PDF",
-        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
         type="filepath"
     )
     temperature = gr.Slider(
@@ -253,24 +145,12 @@ with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo
         step=0.05,
         label="Temperature"
     )
-    page_slider = gr.Slider(
-        minimum=1, maximum=20, value=1, step=1,
-        label="Page Number (PDF only)",
-        interactive=True
-    )
     extraction_mode = gr.Radio(
         choices=["Clinical NER", "Regex"],
         value="Regex",
         label="Extraction Method",
         info="Clinical NER uses ML, Regex uses rules"
     )
-    output_text = gr.Textbox(
-        label="📝 Extracted Text",
-        lines=4,
-        max_lines=10,
-        interactive=False,
-        show_copy_button=True
-    )
     medicines_output = gr.Textbox(
         label="💊 Extracted Medicines/Drugs",
         placeholder="Medicine/drug names will appear here...",
@@ -279,34 +159,16 @@ with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo
         interactive=False,
         show_copy_button=True
     )
-    raw_output = gr.Textbox(
-        label="Raw Model Output",
-        lines=2,
-        max_lines=5,
-        interactive=False
-    )
-    page_info = gr.Markdown(
-        value=""  # Info of PDF page
-    )
     rendered_image = gr.Image(
-        label="Processed Image (Thresholded for OCR)",
         interactive=False
     )
-    num_pages = gr.Number(
-        value=1, label="Current Page (slider)", visible=False
-    )
     submit_btn = gr.Button("Extract Medicines", variant="primary")
     submit_btn.click(
         fn=process_input,
-        inputs=[file_input, temperature, page_slider, extraction_mode],
-        outputs=[output_text, medicines_output, raw_output, page_info, rendered_image, num_pages]
-    )
-    file_input.change(
-        fn=update_slider,
-        inputs=[file_input],
-        outputs=[page_slider]
     )
 if __name__ == "__main__":

 #################################################################################################
 import spaces
 import gradio as gr
 from PIL import Image
 import numpy as np
 import cv2
 import re
 def extract_medication_lines(text):
+    pattern = r"""^\s*(
+        T\.?|TAB\.?|TABLET
+        |C\.?|CAP\.?|CAPSULE
         |SYRUP|SYP
         |ORAL
+        |INJ\.?|INJECTION
         |OINTMENT|DROPS|PATCH|SOL\.?|SOLUTION
+    )\s+[A-Z0-9 \-\(\)/,.]+"""
     med_regex = re.compile(pattern, re.IGNORECASE | re.VERBOSE)
     meds = []
     for line in text.split('\n'):
             meds.append(line)
     return '\n'.join(meds)
+def preprocess_image_for_ocr(image):
+    image_rgb = image.convert("RGB")
+    img_np = np.array(image_rgb)
+    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
+    adaptive_threshold = cv2.adaptiveThreshold(
+        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 85,35,
+    )
+    preprocessed_pil = Image.fromarray(adaptive_threshold)
+    return preprocessed_pil
+@spaces.GPU
+def extract_text_from_image(image, temperature=0.2, use_ner=False):
+    # Import and load within GPU context!
+    import torch
+    from transformers import (
+        LightOnOCRForConditionalGeneration,
+        LightOnOCRProcessor,
+        AutoTokenizer, AutoModelForTokenClassification, pipeline,
+    )
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    attn_implementation = "sdpa" if device == "cuda" else "eager"
+    dtype = torch.bfloat16 if device == "cuda" else torch.float32
+    ocr_model = LightOnOCRForConditionalGeneration.from_pretrained(
+        "lightonai/LightOnOCR-1B-1025",
+        attn_implementation=attn_implementation,
+        torch_dtype=dtype,
+        trust_remote_code=True,
+    ).to(device).eval()
+    processor = LightOnOCRProcessor.from_pretrained(
+        "lightonai/LightOnOCR-1B-1025",
+        trust_remote_code=True,
+    )
+    # NER only if requested
     if use_ner:
+        ner_tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+        ner_model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+        ner_pipeline = pipeline(
+            "ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple"
+        )
     processed_img = preprocess_image_for_ocr(image)
     chat = [
         {
             "role": "user",
         return_dict=True,
         return_tensors="pt",
     )
     inputs = {
+        k: (v.to(device=device, dtype=dtype)
             if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
             else v.to(device)
             if isinstance(v, torch.Tensor)
+            else v)
         for k, v in inputs.items()
     }
     generation_kwargs = dict(
     )
     with torch.no_grad():
         outputs = ocr_model.generate(**generation_kwargs)
     output_text = processor.decode(outputs[0], skip_special_tokens=True)
+    cleaned_text = output_text.strip()
+    # Extract medicines
+    if use_ner:
+        entities = ner_pipeline(cleaned_text)
+        meds = []
+        for ent in entities:
+            if ent["entity_group"] == "treatment":
+                word = ent["word"]
+                if word.startswith("##") and meds:
+                    meds[-1] += word[2:]
+                else:
+                    meds.append(word)
+        result_meds = ", ".join(set(meds)) if meds else "None detected"
+    else:
+        result_meds = extract_medication_lines(cleaned_text) or "None detected"
+    yield result_meds, processed_img  # Only medicines and processed image
 def process_input(file_input, temperature, page_num, extraction_mode):
     if file_input is None:
+        yield "Please upload an image or PDF first.", None
         return
+    image_to_process = Image.open(file_input) if not str(file_input).lower().endswith(".pdf") else None  # simplify to image only
+    use_ner = extraction_mode == "Clinical NER"
+    for meds_out, processed_img in extract_text_from_image(image_to_process, temperature, use_ner):
+        yield meds_out, processed_img
 with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo:
     file_input = gr.File(
+        label="🖼️ Upload Image",
+        file_types=[".png", ".jpg", ".jpeg"],
         type="filepath"
     )
     temperature = gr.Slider(
         step=0.05,
         label="Temperature"
     )
     extraction_mode = gr.Radio(
         choices=["Clinical NER", "Regex"],
         value="Regex",
         label="Extraction Method",
         info="Clinical NER uses ML, Regex uses rules"
     )
     medicines_output = gr.Textbox(
         label="💊 Extracted Medicines/Drugs",
         placeholder="Medicine/drug names will appear here...",
         interactive=False,
         show_copy_button=True
     )
     rendered_image = gr.Image(
+        label="Processed Image (Adaptive Thresholded for OCR)",
         interactive=False
     )
     submit_btn = gr.Button("Extract Medicines", variant="primary")
     submit_btn.click(
         fn=process_input,
+        inputs=[file_input, temperature, 1, extraction_mode],  # page_num not used for image, set to 1
+        outputs=[medicines_output, rendered_image]
     )
 if __name__ == "__main__":