IFMedTechdemo committed on
Commit
9b81dbc
·
verified ·
1 Parent(s): a7d8613

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -17
app.py CHANGED
@@ -19,25 +19,37 @@ def preprocess_image_for_ocr(image):
19
  return preprocessed_pil
20
 
21
 
 
22
  def extract_medication_lines(text):
23
- form_pattern = r"(TAB(L?ET)?|CAP(SULE)?|SYRUP|SYP|DROP(S)?|INJ(CTION)?|OINTMENT|CREAM|GEL|PATCH|SOL(UTION)?|ORAL)"
24
- name_pattern = r"([A-Z0-9\-/]+(?:\s+[A-Z0-9\-/]+){0,4})"
25
- dose_pattern = r"(\d{1,4}\s*(mg|ml|mcg|g|kg|units|IU)|\d{1,2}\s*%(\s*w\/w|\s*w\/v|\s*v\/v)?)"
26
- main_pattern = (
27
- r"(?<!\w)(" + form_pattern + r")[\s\-]+"
28
- r"" + name_pattern + r"" # name after form
29
- r"[^|,\n]{0,50}?"
30
- r"" + dose_pattern + r"" # dose somewhere after name
 
 
 
 
 
 
 
 
 
31
  )
32
- med_regex = re.compile(main_pattern, re.IGNORECASE)
33
- meds = []
34
- for line in text.split('\n'):
35
- line_stripped = line.strip()
36
- match = med_regex.search(line_stripped)
37
- if match:
38
- # Ignore group indices, instead join non-None groups or use match.group(0)
39
- meds.append(match.group(0).strip())
40
- return '\n'.join(meds)
 
 
41
 
42
 
43
 
 
19
  return preprocessed_pil
20
 
21
 
22
+
23
  def extract_medication_lines(text):
24
+ """
25
+ Extracts Rx lines with:
26
+ - form + name (+ repeated form)
27
+ - optional numeric + unit or number/slash as dose
28
+ """
29
+
30
+ # Forms and name up to 4 tokens, allow form at start or end (or both)
31
+ form = r"(TAB(L?ET)?|CAP(SULE)?|SYRUP|SYP|DROP(S)?|INJ(CTION)?|OINTMENT|CREAM|GEL|PATCH|SOL(UTION)?|ORAL)"
32
+ name = r"([A-Z0-9\-/]+(?:\s+[A-Z0-9\-/]+){0,4})"
33
+ opt_form = fr"(?:\s+{form})?" # form can repeat at end
34
+ # Dose: optional
35
+ opt_dose = r"(?:\s*\d{1,4}(?:\/\d{1,4})?\s*(mg|ml|mcg|g|kg|units|IU|%|))?"
36
+
37
+ # Compile pattern to match: form name [form] [dose/conc]
38
+ pat = re.compile(
39
+ fr"\b{form}\s+{name}{opt_form}{opt_dose}\b",
40
+ re.IGNORECASE
41
  )
42
+
43
+ lines = text.split('\n')
44
+ matches = set()
45
+ for line in lines:
46
+ line = line.strip()
47
+ for m in pat.finditer(line):
48
+ out = m.group(0)
49
+ out = re.sub(r"\s+", " ", out).strip() # normalize spaces
50
+ matches.add(out.upper())
51
+ return '\n'.join(matches)
52
+
53
 
54
 
55