# rebuild_catalog.py (2.6 KB)
import json
import unicodedata

def normalize(text):
    """Return a simplified, upper-case form of *text* used as a matching key.

    The text is upper-cased, decomposed with NFD so that combining marks
    (Unicode category ``Mn``, i.e. accents) can be dropped, and then every
    occurrence of a fixed set of common terms is removed before the result
    is stripped of surrounding whitespace.

    A falsy *text* (``None`` or the empty string) yields ``""``.
    """
    if not text:
        return ""

    # Decompose so that accented letters become base letter + combining mark,
    # then keep only the characters that are not combining marks.
    decomposed = unicodedata.normalize('NFD', text.upper())
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    cleaned = ''.join(kept_chars)

    # Remove common terms. This is a plain substring replacement applied in
    # the listed order, so a term is removed wherever it occurs in the text.
    common_terms = (
        'MUNICIPALIDAD DE ',
        'CIUDAD ',
        'VILLA ',
        'SAN ',
        'SANTA ',
        'DOCTOR ',
        'DR. ',
    )
    for term in common_terms:
        cleaned = cleaned.replace(term, '')

    return cleaned.strip()

def levenshtein(s1, s2):
    """Return the Levenshtein edit distance between *s1* and *s2*.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.
    """
    # Iterate over the longer sequence so each row has the shorter one's width.
    if len(s1) < len(s2):
        longer, shorter = s2, s1
    else:
        longer, shorter = s1, s2

    if len(shorter) == 0:
        return len(longer)

    # Row 0: distance from the empty prefix to every prefix of `shorter`.
    prev_row = list(range(len(shorter) + 1))
    for row_number, long_item in enumerate(longer, start=1):
        # First cell: distance from a prefix of `longer` to the empty prefix.
        row = [row_number]
        for col, short_item in enumerate(shorter):
            cost_insert = prev_row[col + 1] + 1
            cost_delete = row[col] + 1
            cost_substitute = prev_row[col] + (long_item != short_item)
            row.append(min(cost_insert, cost_delete, cost_substitute))
        prev_row = row

    return prev_row[-1]

# Locations of the two inputs and of the generated SQL script.
ENTIDADES_FILE = "/yvyape/proyectos/sigem-gis/sigem_entidades.txt"
JSON_FILE = "/yvyape/proyectos/sigem-gis/snc_ly_dist.json"
OUTPUT_FILE = "/yvyape/proyectos/sigem-gis/reconstruccion_maestra_268.sql"

# Load the SIGEM entities. Each usable line is pipe-separated: the first field
# is stored as the entity id and the second field is the entity name.
entidades = {}
with open(ENTIDADES_FILE, encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('|')
        if len(fields) < 2:
            # Lines without at least two fields carry nothing to index.
            continue
        sigem_id, raw_name = fields[0], fields[1]
        # Keyed by normalized name; a later line that normalizes to the same
        # key replaces the earlier entry.
        entidades[normalize(raw_name)] = sigem_id

# Parse the SNC JSON document.
with open(JSON_FILE, encoding='utf-8') as f:
    data = json.load(f)

def _sql_literal(value):
    """Return *value* rendered as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that a quote inside the value
    cannot terminate the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


# Build one "(entidad_id, dpto_snc, dist_snc)" VALUES tuple per SNC feature.
sql_lines = []
for feature in data['features']:
    props = feature['properties']
    dpto = props.get('cod_dpto')
    dist = props.get('cod_dist')
    nombre = normalize(props.get('nom_dist', ''))

    # dist_snc is emitted as a bare numeric literal, so it must be a real
    # integer; a missing code would otherwise be written as the token `None`
    # and break the whole INSERT statement.
    try:
        dist_sql = int(str(dist).strip())
    except ValueError:
        dist_sql = None
    if dpto is None or dist_sql is None:
        print(
            "Aviso: distrito omitido por códigos SNC inválidos "
            f"(cod_dpto={dpto!r}, cod_dist={dist!r})."
        )
        continue

    # Attempt 1: exact match on the normalized name.
    match_id = entidades.get(nombre)

    # Attempt 2: fuzzy match, accepting the closest entity at edit distance
    # below 3 (the first one found wins on ties). An empty name is skipped:
    # it would sit within distance 3 of any entity name shorter than three
    # characters and produce a meaningless match.
    # NOTE(review): a fixed threshold of 3 is loose for very short names
    # (e.g. a 3-letter name vs a 5-letter one) — confirm this is intended.
    if not match_id and nombre:
        best_score = 3  # exclusive upper bound on an acceptable distance
        for sigem_name, sigem_id in entidades.items():
            score = levenshtein(nombre, sigem_name)
            if score < best_score:
                best_score = score
                match_id = sigem_id

    # Fallback: synthetic id built from the raw SNC codes.
    if not match_id:
        match_id = f"99{dpto}{dist}"

    sql_lines.append(
        f"({_sql_literal(match_id)}, {_sql_literal(dpto)}, {dist_sql})"
    )

# Write the SQL script.
# With no rows the INSERT would be syntactically invalid while the DELETE
# would still run and empty the table, so stop before touching the output file.
if not sql_lines:
    raise SystemExit(
        "Sin registros para escribir: no se genera el SQL para evitar "
        "vaciar public.snc_catalog_mapping."
    )

with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    # DELETE and INSERT share one transaction so that a failing INSERT rolls
    # the DELETE back instead of leaving the mapping table empty.
    f.write("BEGIN;\n")
    f.write("DELETE FROM public.snc_catalog_mapping;\n")
    f.write("INSERT INTO public.snc_catalog_mapping (entidad_id, dpto_snc, dist_snc) VALUES \n")
    f.write(",\n".join(sql_lines))
    f.write(";\n")
    f.write("COMMIT;\n")

print(f"Reconstrucción finalizada: {len(sql_lines)} registros con lógica Fuzzy Match.")
# GitLab Appliance - Powered by TurnKey Linux