757 lines
23 KiB
Python
757 lines
23 KiB
Python
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import unicodedata
|
|
|
|
|
|
def normalize(text):
    """Return *text* uppercased, accent-stripped, and whitespace-collapsed."""
    decomposed = unicodedata.normalize("NFD", text or "")
    # Drop combining marks so accented and plain letters compare equal.
    no_accents = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    collapsed = re.sub(r"\s+", " ", no_accents).strip()
    return collapsed.upper()
|
|
|
|
|
|
def extract_digits_from_text(text, min_len=6, max_len=20):
    """Return the first run of ``min_len``..``max_len`` digits found in *text*.

    First tries a contiguous digit run; then falls back to digit groups
    separated by spaces/hyphens (e.g. "12 34 56 78"), whose digits are
    concatenated and length-checked.

    Returns None when *text* is falsy or no qualifying run exists.
    """
    if not text:
        return None
    # BUG FIX: the quantifier read "{min_len,42,086}" (a stray thousands-
    # formatted number pasted over {max_len}). "{6,42,086}" is not a valid
    # repetition, so re treated the braces as literal text and the pattern
    # could never match; max_len was silently ignored.
    match = re.search(rf"\b\d{{{min_len},{max_len}}}\b", text)
    if match:
        return match.group(0)
    # Fallback: digits interleaved with spaces/hyphens ("12 34-56 78").
    groups = re.findall(rf"(?:\d[\s\-]*){{{min_len},{max_len}}}", text)
    for group in groups:
        digits = re.sub(r"\D", "", group)
        if min_len <= len(digits) <= max_len:
            return digits
    return None
|
|
|
|
|
|
def extract_date_from_text(text):
    """Return the first date-like token (d/m/y or y/m/d, '/' or '-') in *text*."""
    if not text:
        return None
    pattern = r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2})\b"
    found = re.search(pattern, text)
    return found.group(1) if found else None
|
|
|
|
|
|
def extract_time_from_text(text):
    """Return the first HH:MM token found in *text*, or None."""
    if not text:
        return None
    found = re.search(r"\b(\d{1,2}:\d{2})\b", text)
    return found.group(1) if found else None
|
|
|
|
|
|
def extract_text_pdfplumber(path):
    """Extract text from every page of the PDF at *path* using pdfplumber.

    Returns "" when pdfplumber is not importable.
    """
    try:
        import pdfplumber  # type: ignore
    except Exception:
        return ""

    with pdfplumber.open(path) as pdf:
        pages_text = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages_text)
|
|
|
|
|
|
def extract_text_fitz(path):
    """Extract text from the PDF at *path* via PyMuPDF (fitz).

    Returns "" when fitz is not importable.
    """
    try:
        import fitz  # type: ignore
    except Exception:
        return ""

    collected = []
    doc = fitz.open(path)
    try:
        collected.extend(page.get_text() or "" for page in doc)
    finally:
        doc.close()
    return "\n".join(collected)
|
|
|
|
|
|
def configure_tesseract():
    """Point pytesseract at a tesseract binary if one can be located.

    Checks the TESSERACT_PATH environment variable first, then the usual
    Windows install locations. Returns the path configured, or None when
    pytesseract is unavailable or no binary is found.
    """
    try:
        import pytesseract  # type: ignore
    except Exception:
        return None

    search_paths = []
    override = os.environ.get("TESSERACT_PATH")
    if override:
        search_paths.append(override)
    search_paths += [
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
    ]

    for candidate in search_paths:
        if candidate and os.path.isfile(candidate):
            pytesseract.pytesseract.tesseract_cmd = candidate
            return candidate
    return None
|
|
|
|
|
|
def is_tesseract_available():
    """Return True when pytesseract can reach a working tesseract binary."""
    try:
        import pytesseract  # type: ignore
    except Exception:
        return False

    configure_tesseract()
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        return False
    return True
|
|
|
|
|
|
def extract_text_ocr(path, max_pages=2):
    """OCR up to *max_pages* pages of the PDF at *path*.

    Returns a (text, error_tag) tuple; error_tag is "" on full success,
    or one of "ocr_modules_missing", "tesseract_not_found",
    "tesseract_lang_missing", "tesseract_lang_missing_spa".
    """
    try:
        import fitz  # type: ignore
        import pytesseract  # type: ignore
        from PIL import Image  # type: ignore
    except Exception:
        return "", "ocr_modules_missing"

    configure_tesseract()
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        return "", "tesseract_not_found"

    collected = []
    error_tag = ""
    doc = fitz.open(path)
    try:
        for page_no in range(min(max_pages, doc.page_count)):
            pix = doc.load_page(page_no).get_pixmap(dpi=200)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            # Prefer Spanish; fall back to English when the language pack
            # is missing, remembering that spa was unavailable.
            try:
                page_text = pytesseract.image_to_string(image, lang="spa") or ""
            except Exception:
                try:
                    page_text = pytesseract.image_to_string(image, lang="eng") or ""
                except Exception:
                    return "", "tesseract_lang_missing"
                if not error_tag:
                    error_tag = "tesseract_lang_missing_spa"
            collected.append(page_text)
    finally:
        doc.close()

    return "\n".join(collected), error_tag
|
|
|
|
|
|
def extract_name(lines, norm_lines):
    """Locate the patient name below a 'DATOS DEL ...' header.

    Falls back to a "PACIENTE: NAME - ..." label anywhere in the document.
    Returns the raw name line, or None.
    """
    header_idx = -1
    for header in ("DATOS DEL USUARIO", "DATOS DEL PACIENTE"):
        for pos, nline in enumerate(norm_lines):
            if header in nline:
                header_idx = pos
                break
        if header_idx != -1:
            break

    if header_idx == -1:
        # No section header: look for an inline "PACIENTE:" label.
        for raw in lines:
            if not normalize(raw).startswith("PACIENTE:"):
                continue
            pieces = raw.split(":", 1)
            if len(pieces) == 2:
                candidate = pieces[1].split("-")[0].strip()
                if candidate:
                    return candidate
        return None

    # Lines that are form-field labels, not the actual name.
    skip_tokens = (
        "1ER APELLIDO",
        "2DO APELLIDO",
        "1ER NOMBRE",
        "2DO NOMBRE",
        "TIPO DOCUMENTO",
        "DOCUMENTO DE IDENTIFICACION",
        "REGISTRO CIVIL",
        "TARJETA",
        "CEDULA",
        "NUIP",
    )

    for pos in range(header_idx + 1, min(header_idx + 6, len(lines))):
        if any(token in norm_lines[pos] for token in skip_tokens):
            continue
        # A plausible name has at least two words.
        if len(lines[pos].split()) >= 2:
            return lines[pos]

    return None
|
|
|
|
|
|
def extract_document(lines, norm_lines):
    """Find the patient's identification number near a document-type label.

    Scans below an explicit "TIPO/NUMERO DOCUMENTO" anchor first, then
    any line mentioning a document keyword. Returns the digit string or None.
    """
    anchor_tokens = (
        "TIPO DOCUMENTO",
        "TIPO DE DOCUMENTO",
        "NUMERO DOCUMENTO",
        "DOCUMENTO DE IDENTIFICACION",
    )
    anchor = -1
    for pos, nline in enumerate(norm_lines):
        if any(token in nline for token in anchor_tokens):
            anchor = pos
            break

    if anchor != -1:
        for pos in range(anchor, min(anchor + 8, len(lines))):
            digits = extract_digits_from_text(lines[pos])
            if digits:
                return digits

    # Looser second pass over any document-related keyword.
    loose_tokens = ("CEDULA", "DOCUMENTO", "NUIP", "PASAPORTE")
    for pos, nline in enumerate(norm_lines):
        if any(token in nline for token in loose_tokens):
            digits = extract_digits_from_text(lines[pos])
            if digits:
                return digits

    return None
|
|
|
|
|
|
def extract_cups_code(text):
    """Pull a CUPS-like code (4-10 alphanumerics with at least one digit)."""
    if not text:
        return None
    found = re.search(
        r"\b(?=[A-Z0-9]{4,10}\b)[A-Z]*\d[A-Z0-9]*\b", text, re.IGNORECASE
    )
    if found:
        return found.group(0)
    # Fallback: any plausible digit run of CUPS-code length.
    digits = extract_digits_from_text(text, min_len=4, max_len=10)
    return digits if digits else None
|
|
|
|
|
|
def extract_cups_list(lines, norm_lines):
    """Collect (code, description) CUPS pairs from the procedures table.

    Two passes: first the rows below a "CUPS ... CODIGO" table header
    (stopping at the next section), then — only if that found nothing —
    any line mentioning CUPS (including spaced-out "C.U.P.S" variants).
    Duplicated codes are kept once, first description wins.
    """
    cups = []
    header_idx = -1
    # Pass 1 anchor: a table header containing both CUPS and CODIGO.
    for i, nline in enumerate(norm_lines):
        if "CUPS" in nline and "CODIGO" in nline:
            header_idx = i
            break

    # Section headings that mark the end of the procedures table.
    stop_tokens = [
        "JUSTIFICACION",
        "IMPRESION",
        "DIAGNOSTICO",
        "INFORMACION",
        "NOMBRE",
        "RESPONSABLE",
        "SOLICITA",
        "SOLICITANTE",
        "FIRMA",
    ]

    def add_cup(code, desc):
        # Append (code, desc) unless the code was already collected.
        if not code:
            return
        for existing, _ in cups:
            if existing == code:
                return
        cups.append((code, desc or None))

    if header_idx != -1:
        for j in range(header_idx + 1, min(header_idx + 20, len(lines))):
            nline = norm_lines[j]
            if any(token in nline for token in stop_tokens):
                break
            code = extract_cups_code(lines[j])
            if not code:
                continue
            raw = lines[j]
            desc = ""
            if code in raw:
                # Description is whatever follows the code on the row;
                # strip leading quantity/price columns (e.g. "1,00 ").
                desc = raw.split(code, 1)[-1].strip(" -:")
                desc = re.sub(r"^\d+[.,]\d{1,2}(?:\s+|$)", "", desc)
                desc = re.sub(r"^\d+(?:\s+|$)", "", desc)
            if not desc and j + 1 < len(lines):
                # Description may wrap onto the next line.
                desc = lines[j + 1].strip()
            add_cup(code, desc)

    if not cups:
        # Pass 2: any CUPS mention, including OCR-spaced "C U P S" forms.
        for i, line in enumerate(norm_lines):
            if (
                "CUPS" in line
                or re.search(
                    r"C\s*[\.\-]?\s*U\s*[\.\-]?\s*P\s*[\.\-]?\s*S",
                    line,
                    re.IGNORECASE,
                )
                or re.search(r"\bCUP\b", line, re.IGNORECASE)
            ):
                for j in range(i, min(i + 8, len(lines))):
                    code = extract_cups_code(lines[j])
                    if not code:
                        continue
                    raw = lines[j]
                    desc = ""
                    if code in raw:
                        desc = raw.split(code, 1)[-1].strip(" -:")
                    if not desc and j + 1 < len(lines):
                        desc = lines[j + 1].strip()
                    add_cup(code, desc)
                if cups:
                    break

    return cups
|
|
|
|
|
|
def extract_cups(lines, norm_lines):
    """Return the first (code, description) CUPS pair, or (None, None)."""
    found = extract_cups_list(lines, norm_lines)
    return found[0] if found else (None, None)
|
|
|
|
|
|
def normalize_cie10_code(code):
    """Uppercase *code* and strip whitespace, dots, and other punctuation."""
    if not code:
        return None
    compact = re.sub(r"[\s\.]", "", code).upper()
    compact = re.sub(r"[^A-Z0-9]", "", compact)
    return compact if compact else None
|
|
|
|
|
|
def clean_diagnosis_desc(desc):
    """Strip diagnosis-header boilerplate from *desc*; None when empty after."""
    if not desc:
        return None
    # Applied in order; each pattern removes one flavor of label text.
    boilerplate = [
        r"(?i)impresion diagnostica",
        r"(?i)diagnostico(s)?( principal| de egreso| egreso| principal)?",
        r"(?i)dx( principal| de egreso| egreso| secundaria| secundario)?",
        r"(?i)cie[-\s]*10",
    ]
    for pattern in boilerplate:
        desc = re.sub(pattern, "", desc)
    desc = desc.strip(" -:")
    return desc if desc else None
|
|
|
|
|
|
def extract_cie10_codes_from_line(line):
    """Return all distinct normalized CIE-10 codes appearing in *line*."""
    if not line:
        return []
    raw_matches = re.findall(
        r"\b([A-Z]\s*\d{2,4}(?:\s*\.\s*\d{1,2})?[A-Z0-9]?)\b",
        line,
        re.IGNORECASE,
    )
    unique = []
    for raw in raw_matches:
        normalized = normalize_cie10_code(raw)
        if normalized and normalized not in unique:
            unique.append(normalized)
    return unique
|
|
|
|
|
|
def parse_cie10_from_line(line):
    """Split *line* into a (cie10_code, description) pair when a code exists.

    Prefers the "... DESCRIPTION (CODE)" parenthesized form; otherwise takes
    the first loose code and uses the surrounding text as the description.
    Returns (None, None) when no code is found.
    """
    if not line:
        return None, None
    paren = re.search(
        r"\(([A-Z][0-9]{2,4}(?:\.[0-9]{1,2})?[A-Z0-9]?)\)",
        line,
        re.IGNORECASE,
    )
    if paren:
        code = normalize_cie10_code(paren.group(1))
        desc = clean_diagnosis_desc(line[: paren.start()].strip(" -:"))
        return code, desc.title() if desc else None
    loose = re.search(
        r"\b([A-Z]\s*\d{2,4}(?:\s*\.\s*\d{1,2})?[A-Z0-9]?)\b",
        line,
        re.IGNORECASE,
    )
    if loose:
        code = normalize_cie10_code(loose.group(1))
        # Description usually trails the code; fall back to text before it.
        remainder = line[loose.end():].strip(" -:")
        if not remainder:
            remainder = line[: loose.start()].strip(" -:")
        desc = clean_diagnosis_desc(remainder)
        return code, desc.title() if desc else None
    return None, None
|
|
|
|
|
|
def extract_cups_hint(lines, norm_lines):
    """Return a free-text procedure name usable as a CUPS search hint.

    Looks for an "EXAMENES Y PROCEDIMIENTOS..." section and returns the first
    plausible multi-word line under it, with parenthetical and "AUTOMATIZADO"
    suffixes removed. Returns None when no section or candidate exists.
    """
    section_keys = (
        "EXAMENES Y PROCEDIMIENTOS ORDENADOS",
        "EXAMENES Y PROCEDIMIENTOS",
        "PROCEDIMIENTOS ORDENADOS",
        "PROCEDIMIENTOS",
    )
    section_idx = -1
    for key in section_keys:
        for pos, nline in enumerate(norm_lines):
            if key in nline:
                section_idx = pos
                break
        if section_idx != -1:
            break

    if section_idx == -1:
        return None

    for pos in range(section_idx + 1, min(section_idx + 10, len(lines))):
        raw = lines[pos].strip(" -*")
        nline = norm_lines[pos]
        # Skip order-number lines and anything too short to be a name.
        if not raw or "ORDEN NRO" in nline or "ORDEN NO" in nline:
            continue
        if len(raw.split()) < 2:
            continue
        hint = raw.split("(")[0].strip(" -")
        hint = re.sub(r"(?i)AUTOMATIZADO", "", hint).strip(" -")
        if hint:
            return hint
    return None
|
|
|
|
|
|
def extract_cie10_list(lines, norm_lines):
    """Collect (code, description) CIE-10 pairs found near diagnosis headers.

    For each line mentioning a diagnosis keyword, scans up to 12 lines ahead,
    taking the parsed primary code plus any extra codes on the same line.
    Order of first appearance is preserved; duplicates are dropped.
    """
    diag_tokens = (
        "DIAGNOSTICO",
        "DIAGNOSTICOS",
        "IMPRESION DIAGNOSTICA",
        "CIE10",
        "CIE-10",
        "DX",
        "DX PRINCIPAL",
        "DX EGRESO",
        "DIAGNOSTICO PRINCIPAL",
        "DIAGNOSTICO DE EGRESO",
        "DIAGNOSTICO EGRESO",
    )
    found = []
    seen_codes = set()
    for start, nline in enumerate(norm_lines):
        if not any(token in nline for token in diag_tokens):
            continue
        for offset in range(start, min(start + 12, len(lines))):
            code, desc = parse_cie10_from_line(lines[offset])
            if code and code not in seen_codes:
                seen_codes.add(code)
                found.append((code, desc))
            # Secondary codes on the same line come without descriptions.
            for extra in extract_cie10_codes_from_line(lines[offset]):
                if extra and extra not in seen_codes:
                    seen_codes.add(extra)
                    found.append((extra, None))
    return found
|
|
|
|
|
|
def extract_cie10(lines, norm_lines):
    """Return the first (code, description) CIE-10 pair, or (None, None)."""
    found = extract_cie10_list(lines, norm_lines)
    return found[0] if found else (None, None)
|
|
|
|
|
|
def extract_fecha_ingreso_urgencias(lines, norm_lines):
    """Find the emergency-admission date (optionally "date HH:MM").

    Prefers an inline "INGRESO/INGRESA ... date" match on the normalized line;
    otherwise takes any date on the keyword line or the one after it.
    Returns None when nothing is found.
    """
    trigger_keys = (
        "INGRESO A URGENCIAS",
        "INGRESO URGENCIAS",
        "FECHA DE INGRESO",
        "FECHA INGRESO",
        "INGRESA",
        "INGRESO",
    )
    for pos, nline in enumerate(norm_lines):
        if not any(key in nline for key in trigger_keys):
            continue
        inline = re.search(
            r"INGRES[AO][^0-9]{0,20}(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})",
            nline,
        )
        if inline:
            return inline.group(1)
        date = extract_date_from_text(lines[pos])
        if not date and pos + 1 < len(lines):
            date = extract_date_from_text(lines[pos + 1])
        if not date:
            continue
        time = extract_time_from_text(lines[pos])
        if not time and pos + 1 < len(lines):
            time = extract_time_from_text(lines[pos + 1])
        return f"{date} {time}" if time else date
    return None
|
|
|
|
|
|
def extract_fecha_egreso(lines, norm_lines):
    """Find the discharge date, optionally with time; latest occurrence wins.

    For every line mentioning a discharge keyword, tries an inline
    "EGRESO/EGRESA/ALTA ... date" match first, then any date on the line
    or the one after it. Returns the last candidate found, or None.
    """
    trigger_keys = (
        "FECHA DE EGRESO",
        "FECHA EGRESO",
        "EGRESO",
        "EGRESA",
        "ALTA MEDICA",
        "ALTA HOSPITALARIA",
        "FECHA DE ALTA",
        "FECHA ALTA",
    )
    found_dates = []
    for pos, nline in enumerate(norm_lines):
        if not any(key in nline for key in trigger_keys):
            continue
        inline = re.search(
            r"(EGRESO|EGRESA|ALTA)[^0-9]{0,40}(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})",
            nline,
        )
        date = inline.group(2) if inline else None
        if not date:
            date = extract_date_from_text(lines[pos])
        if not date and pos + 1 < len(lines):
            date = extract_date_from_text(lines[pos + 1])
        if not date:
            continue
        time = extract_time_from_text(lines[pos])
        if not time and pos + 1 < len(lines):
            time = extract_time_from_text(lines[pos + 1])
        found_dates.append(f"{date} {time}" if time else date)
    return found_dates[-1] if found_dates else None
|
|
|
|
|
|
def clean_ips_name(value):
    """Trim an IPS/provider name, cutting at CC/NUMERO/NIT/CODIGO markers."""
    if not value:
        return ""
    name = value.strip()
    # Cut in this order so identifiers after the name never survive.
    for marker in (r"\bCC\b", r"\bNUMERO\b", r"\bNIT\b", r"\bCODIGO\b"):
        name = re.split(marker, name, maxsplit=1, flags=re.IGNORECASE)[0]
    return name.strip(" -:")
|
|
|
|
|
|
def extract_ips(lines, norm_lines):
    """Extract the provider (IPS) name and NIT from the document.

    Scans the lines below an "INFORMACION DEL PRESTADOR" header for a NIT
    number and a name (either a "NOMBRE: ..." label or a plain multi-word
    line); falls back to any facility-keyword line near the top of the
    document. Returns a (nombre, nit) tuple, each possibly None.
    """
    nombre = None
    nit = None
    idx = -1
    for i, line in enumerate(norm_lines):
        if "INFORMACION DEL PRESTADOR" in line:
            idx = i
            break

    if idx != -1:
        for j in range(idx + 1, min(idx + 8, len(lines))):
            if not nit and "NIT" in norm_lines[j]:
                # NITs are 6-15 digits; reuse the generic digit extractor.
                nit = extract_digits_from_text(lines[j], min_len=6, max_len=15)
            if not nombre:
                # Labeled form: "NOMBRE: <provider>" (but not a NIT line).
                if "NOMBRE" in norm_lines[j] and ":" in lines[j] and "NIT" not in norm_lines[j]:
                    candidate = lines[j].split(":", 1)[1].strip()
                    candidate = clean_ips_name(candidate)
                    if len(candidate.split()) >= 2:
                        nombre = candidate
                    continue
                # Unlabeled form: a plain line with no field keywords.
                if (
                    "NOMBRE" not in norm_lines[j]
                    and "NIT" not in norm_lines[j]
                    and "CODIGO" not in norm_lines[j]
                ):
                    candidate = clean_ips_name(lines[j])
                    if len(candidate.split()) >= 2:
                        nombre = candidate
            if nombre and nit:
                break

    if not nombre:
        # Fallback: facility keywords in the document header area.
        for line in lines[:10]:
            if re.search(r"\b(HOSPITAL|CLINICA|ESE|IPS|CENTRO MEDICO)\b", line, re.IGNORECASE):
                candidate = clean_ips_name(line)
                if len(candidate.split()) >= 2:
                    nombre = candidate
                    break

    return nombre or None, nit or None
|
|
|
|
|
|
def detect_format(norm_text, norm_lines):
    """Classify the document as ANEXO_TECNICO, ANEXO_URGENCIAS, or DESCONOCIDO."""
    if "ANEXO TECNICO" in norm_text or "SOLICITUD DE AUTORIZACION" in norm_text:
        return "ANEXO_TECNICO"
    if any("ATENCION INICIAL DE URGENCIAS" in nline for nline in norm_lines):
        return "ANEXO_URGENCIAS"
    return "DESCONOCIDO"
|
|
|
|
|
|
def merge_unique(base_list, extra_list):
    """Concatenate two (possibly None) lists, keeping first-seen order, no dupes."""
    merged = list(base_list or [])
    for candidate in extra_list or []:
        if candidate not in merged:
            merged.append(candidate)
    return merged
|
|
|
|
|
|
def merge_response(base, extra):
    """Merge an OCR-derived response (*extra*) into a text-derived one (*base*).

    Base values win; extra only fills gaps. CUPS lists from the base are kept
    as-is when non-empty (text extraction is trusted over OCR); CIE-10 lists
    are always merged. Warnings that no longer apply after the merge are
    removed. Returns a new dict; neither input is mutated.
    """
    result = dict(base)

    if base.get("cups_codigos"):
        # Trust the base's CUPS table; OCR rows would only add noise.
        result["cups_codigos"] = list(base.get("cups_codigos") or [])
        result["cups_descripciones"] = list(base.get("cups_descripciones") or [])
    else:
        result["cups_codigos"] = merge_unique(
            base.get("cups_codigos"), extra.get("cups_codigos")
        )
        result["cups_descripciones"] = merge_unique(
            base.get("cups_descripciones"), extra.get("cups_descripciones")
        )
    result["cie10_codigos"] = merge_unique(
        base.get("cie10_codigos"), extra.get("cie10_codigos")
    )
    result["cie10_descripciones"] = merge_unique(
        base.get("cie10_descripciones"), extra.get("cie10_descripciones")
    )

    # Scalar fields: fill from extra only when the base has nothing.
    if not result.get("cup_codigo") and extra.get("cup_codigo"):
        result["cup_codigo"] = extra.get("cup_codigo")
        result["cup_descripcion"] = extra.get("cup_descripcion")

    if not result.get("cie10_codigo") and extra.get("cie10_codigo"):
        result["cie10_codigo"] = extra.get("cie10_codigo")
        result["cie10_descripcion"] = extra.get("cie10_descripcion")

    if not result.get("ips_nombre") and extra.get("ips_nombre"):
        result["ips_nombre"] = extra.get("ips_nombre")
    if not result.get("ips_nit") and extra.get("ips_nit"):
        result["ips_nit"] = extra.get("ips_nit")
    if not result.get("fecha_ingreso_urgencias") and extra.get(
        "fecha_ingreso_urgencias"
    ):
        result["fecha_ingreso_urgencias"] = extra.get("fecha_ingreso_urgencias")
    if not result.get("fecha_egreso") and extra.get("fecha_egreso"):
        result["fecha_egreso"] = extra.get("fecha_egreso")
    if not result.get("cups_busqueda") and extra.get("cups_busqueda"):
        result["cups_busqueda"] = extra.get("cups_busqueda")
    if result.get("formato") == "DESCONOCIDO" and extra.get("formato"):
        result["formato"] = extra.get("formato")

    # Re-derive scalar summaries from the merged lists when still missing.
    if not result.get("cup_codigo") and result.get("cups_codigos"):
        result["cup_codigo"] = result["cups_codigos"][0]
    if not result.get("cie10_codigo") and result.get("cie10_codigos"):
        result["cie10_codigo"] = ", ".join(result["cie10_codigos"])
    if not result.get("cie10_descripcion") and result.get("cie10_descripciones"):
        result["cie10_descripcion"] = ", ".join(
            [d for d in result["cie10_descripciones"] if d]
        )

    # Drop not-found warnings that the merge has resolved.
    warnings = list(result.get("warnings") or [])
    if result.get("cup_codigo") or result.get("cups_codigos"):
        warnings = [w for w in warnings if w != "cups_not_found"]
    if result.get("cie10_codigo") or result.get("cie10_codigos"):
        warnings = [w for w in warnings if w != "cie10_not_found"]
    if result.get("ips_nombre") or result.get("ips_nit"):
        warnings = [w for w in warnings if w != "ips_not_found"]
    result["warnings"] = warnings

    return result
|
|
|
|
|
|
def build_response(text, ocr_used, ocr_available, ocr_error):
    """Parse extracted document *text* into the JSON-ready response dict.

    Runs every field extractor over the split/normalized lines and collects
    patient, CUPS, CIE-10, IPS, and date fields plus a list of warnings for
    anything not found.

    Parameters: text is the raw extracted text (may be None); ocr_used /
    ocr_available / ocr_error are passed through into the response.
    """
    # Keep raw lines for values and normalized lines for keyword matching.
    lines = [line.strip() for line in (text or "").split("\n") if line.strip()]
    norm_lines = [normalize(line) for line in lines]
    norm_text = normalize(text)

    nombre = extract_name(lines, norm_lines)
    documento = extract_document(lines, norm_lines)
    cups_list = extract_cups_list(lines, norm_lines)
    cup_codigo, cup_desc = (cups_list[0] if cups_list else (None, None))
    cups_codigos = [item[0] for item in cups_list]
    cups_descripciones = [item[1] for item in cups_list]
    cups_busqueda = extract_cups_hint(lines, norm_lines)
    cie_list = extract_cie10_list(lines, norm_lines)
    cie_codigos = [item[0] for item in cie_list]
    cie_descs = [item[1] for item in cie_list]
    # Flatten multi-diagnosis lists into comma-separated scalar fields.
    cie_codigo = ", ".join(cie_codigos) if cie_codigos else None
    cie_desc = ", ".join([d for d in cie_descs if d]) if any(cie_descs) else None
    ips_nombre, ips_nit = extract_ips(lines, norm_lines)
    fecha_ingreso = extract_fecha_ingreso_urgencias(lines, norm_lines)
    fecha_egreso = extract_fecha_egreso(lines, norm_lines)
    formato = detect_format(norm_text, norm_lines)

    # Warnings let the caller decide whether an OCR retry is worthwhile.
    warnings = []
    if not text:
        warnings.append("no_text_extracted")
    if not cup_codigo and not cups_codigos:
        warnings.append("cups_not_found")
    if not cie_codigo and not cie_codigos:
        warnings.append("cie10_not_found")
    if not ips_nombre and not ips_nit:
        warnings.append("ips_not_found")

    return {
        "ok": True,
        "text_length": len(norm_text),
        "ocr_usado": ocr_used,
        "ocr_disponible": ocr_available,
        "ocr_error": ocr_error or None,
        "formato": formato,
        "nombre_paciente": nombre,
        "numero_documento": documento,
        "cup_codigo": cup_codigo,
        "cup_descripcion": cup_desc,
        "cups_codigos": cups_codigos,
        "cups_descripciones": cups_descripciones,
        "cups_busqueda": cups_busqueda,
        "cie10_codigo": cie_codigo,
        "cie10_descripcion": cie_desc,
        "cie10_codigos": cie_codigos,
        "cie10_descripciones": cie_descs,
        "ips_nombre": ips_nombre,
        "ips_nit": ips_nit,
        "fecha_ingreso_urgencias": fecha_ingreso,
        "fecha_egreso": fecha_egreso,
        "warnings": warnings,
    }
|
|
|
|
|
|
def main():
    """CLI entry point: parse the PDF at argv[1] and print a JSON summary.

    Extraction strategy: pdfplumber first, PyMuPDF as fallback; if CUPS or
    CIE-10 codes are still missing and tesseract is available, retries via
    OCR and merges the two results. Always prints a single JSON object.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "missing_file"}, ensure_ascii=True))
        return

    path = sys.argv[1]

    # Prefer embedded text layers; fall back to PyMuPDF's extractor.
    text = extract_text_pdfplumber(path)
    if not text:
        text = extract_text_fitz(path)

    ocr_used = False
    ocr_error = ""
    ocr_available = is_tesseract_available()

    response = build_response(text, False, ocr_available, None)
    # OCR is expensive — only retry when key codes are still missing.
    needs_ocr = ocr_available and (
        not response.get("cup_codigo") or not response.get("cie10_codigo")
    )

    if needs_ocr:
        ocr_text, ocr_error = extract_text_ocr(path)
        if ocr_text:
            ocr_used = True
            ocr_response = build_response(ocr_text, True, ocr_available, ocr_error)
            response = merge_response(response, ocr_response)
        if ocr_error:
            response["ocr_error"] = ocr_error

    # Ensure OCR status fields reflect what actually happened.
    response["ocr_usado"] = bool(ocr_used)
    response["ocr_disponible"] = bool(ocr_available)
    if "ocr_error" not in response:
        response["ocr_error"] = ocr_error or None

    print(json.dumps(response, ensure_ascii=True))
|
|
|
|
|
|
# Run as a CLI tool: parse the PDF named in argv[1] and emit JSON on stdout.
if __name__ == "__main__":
    main()
|