LightOnOCR

Paused

IFMedTechdemo committed 27 days ago

Commit

9b81dbc

verified ·

1 Parent(s): a7d8613

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -19,25 +19,37 @@ def preprocess_image_for_ocr(image):
     return preprocessed_pil
 def extract_medication_lines(text):
-    form_pattern = r"(TAB(L?ET)?|CAP(SULE)?|SYRUP|SYP|DROP(S)?|INJ(CTION)?|OINTMENT|CREAM|GEL|PATCH|SOL(UTION)?|ORAL)"
-    name_pattern = r"([A-Z0-9\-/]+(?:\s+[A-Z0-9\-/]+){0,4})"
-    dose_pattern = r"(\d{1,4}\s*(mg|ml|mcg|g|kg|units|IU)|\d{1,2}\s*%(\s*w\/w|\s*w\/v|\s*v\/v)?)"
-    main_pattern = (
-        r"(?<!\w)(" + form_pattern + r")[\s\-]+"
-        r"" + name_pattern + r""  # name after form
-        r"[^|,\n]{0,50}?"
-        r"" + dose_pattern + r""  # dose somewhere after name
     )
-    med_regex = re.compile(main_pattern, re.IGNORECASE)
-    meds = []
-    for line in text.split('\n'):
-        line_stripped = line.strip()
-        match = med_regex.search(line_stripped)
-        if match:
-            # Ignore group indices, instead join non-None groups or use match.group(0)
-            meds.append(match.group(0).strip())
-    return '\n'.join(meds)

     return preprocessed_pil
 def extract_medication_lines(text):
     """Extract medication entries from prescription text.

     Each line is scanned for ``<form> <name> [<form>] [<dose>] [%]``:
     a dosage-form keyword (TAB, CAP, SYRUP, INJ/INJECTION, ...), a name of
     one to five tokens made of letters, digits, ``-`` or ``/``, an optional
     repeated form, an optional number (or ``n/n``) with an optional unit,
     and an optional trailing percent sign. Matching is case-insensitive.

     Args:
         text: Text to scan, one candidate entry per line. ``None`` or an
             empty string yields an empty result.

     Returns:
         The distinct matches, upper-cased with whitespace runs collapsed to
         single spaces, joined by newlines in order of first appearance.
     """
     if not text:
         return ""
     # Non-capturing groups throughout: only the whole match is ever used.
     # "E?CTION" accepts the correct spelling INJECTION while still
     # tolerating the previously accepted "INJCTION".
     form = (
         r"(?:TAB(?:L?ET)?|CAP(?:SULE)?|SYRUP|SYP|DROPS?|INJ(?:E?CTION)?"
         r"|OINTMENT|CREAM|GEL|PATCH|SOL(?:UTION)?|ORAL)"
     )
     name = r"[A-Z0-9\-/]+(?:\s+[A-Z0-9\-/]+){0,4}"
     opt_form = rf"(?:\s+{form})?"  # form may be repeated after the name
     opt_dose = r"(?:\s*\d{1,4}(?:/\d{1,4})?\s*(?:mg|ml|mcg|g|kg|units|IU)?)?"
     # A match may end on "%" (a word boundary can never follow "%" at the
     # end of a token, so a bare \b would always drop it) or on a boundary.
     end = r"(?:\s*%|\b)"
     pat = re.compile(
         rf"\b{form}\s+{name}{opt_form}{opt_dose}{end}",
         re.IGNORECASE,
     )
     # A dict keeps insertion order, giving de-duplication with a stable,
     # first-seen output order (a set would make the order vary per run).
     seen = {}
     for line in text.split("\n"):
         for m in pat.finditer(line.strip()):
             entry = re.sub(r"\s+", " ", m.group(0)).strip()  # normalize spaces
             seen.setdefault(entry.upper(), None)
     return "\n".join(seen)