# rebuild_catalog.py -- generates the SQL script that rebuilds public.snc_catalog_mapping.
import json
import re
import unicodedata
# Filler terms dropped before two names are compared.  A term is removed only
# where it starts a word (start of text, or preceded by a non-word character),
# so a name that merely contains one as a substring -- e.g. "MARAVILLA " -- is
# left intact instead of being mangled to "MARA".
_NOISE_TERMS = re.compile(
    r"(?<!\w)(?:MUNICIPALIDAD DE |CIUDAD |VILLA |SAN |SANTA |DOCTOR |DR\. )"
)


def normalize(text):
    """Return a canonical, comparison-friendly form of a place name.

    The name is upper-cased, combining marks (accents, tildes) are stripped,
    the filler terms listed in ``_NOISE_TERMS`` are removed, and surrounding
    whitespace is trimmed.

    Args:
        text: The raw name.  Any falsy value (``None``, ``""``) is accepted.

    Returns:
        The normalized name, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    text = text.upper()
    # Strip accents: decompose (NFD) and drop the combining marks (category Mn).
    text = "".join(
        c for c in unicodedata.normalize("NFD", text)
        if unicodedata.category(c) != "Mn"
    )
    # Remove the common filler terms, respecting word boundaries.
    return _NOISE_TERMS.sub("", text).strip()
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at any time.
    """
    # Walk the longer sequence in the outer loop so each row has the length
    # of the shorter one.
    if len(s1) < len(s2):
        longer, shorter = s2, s1
    else:
        longer, shorter = s1, s2

    if len(shorter) == 0:
        return len(longer)

    prev = list(range(len(shorter) + 1))
    for row, ch_long in enumerate(longer, start=1):
        curr = [row]
        for col, ch_short in enumerate(shorter, start=1):
            cost = int(ch_long != ch_short)
            curr.append(
                min(
                    prev[col] + 1,         # insertion
                    curr[col - 1] + 1,     # deletion
                    prev[col - 1] + cost,  # substitution (free on a match)
                )
            )
        prev = curr
    return prev[-1]
# Absolute paths of the two inputs and of the SQL script this run generates.
ENTIDADES_FILE = "/yvyape/proyectos/sigem-gis/sigem_entidades.txt"
JSON_FILE = "/yvyape/proyectos/sigem-gis/snc_ly_dist.json"
OUTPUT_FILE = "/yvyape/proyectos/sigem-gis/reconstruccion_maestra_268.sql"

# A fuzzy candidate is accepted only when its edit distance is below this.
# NOTE(review): an absolute bound lets very short names match unrelated
# entities; consider scaling it with the length of the name.
_FUZZY_MAX_DISTANCE = 3


def _sql_literal(value):
    """Return *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled so the value cannot terminate the
    literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


def _is_integer_like(value):
    """Return True when ``int(value)`` succeeds (so it is safe to emit unquoted)."""
    try:
        int(value)
    except (TypeError, ValueError):
        return False
    return True


# Load the SIGEM entities: one "id|name" record per line, indexed by the
# normalized name.  Lines with fewer than two fields are ignored.
entidades = {}
with open(ENTIDADES_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('|')
        if len(parts) < 2:
            continue
        raw_name = parts[1]
        key = normalize(raw_name)
        previous_id = entidades.get(key)
        # Two entities collapsing to the same key would otherwise overwrite
        # each other silently; the last one still wins, but it is reported.
        if previous_id is not None and previous_id != parts[0]:
            print(
                f"ADVERTENCIA: nombre normalizado duplicado '{key}': "
                f"{previous_id} se reemplaza por {parts[0]}"
            )
        entidades[key] = parts[0]

# Process the SNC GeoJSON: one mapping row per feature.
with open(JSON_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

sql_lines = []
for feature in data['features']:
    props = feature['properties']
    dpto = props.get('cod_dpto')
    dist = props.get('cod_dist')

    # `dist` is written unquoted, so it must be numeric; a missing code would
    # otherwise end up in the script as the bare word None.
    if dpto is None or not _is_integer_like(dist):
        print(
            "ADVERTENCIA: distrito omitido por códigos SNC inválidos "
            f"(cod_dpto={dpto!r}, cod_dist={dist!r}, "
            f"nom_dist={props.get('nom_dist')!r})"
        )
        continue

    nombre = normalize(props.get('nom_dist', ''))

    # Attempt 1: exact match on the normalized name.
    match_id = entidades.get(nombre)

    # Attempt 2: fuzzy match (Levenshtein).  The first entity with the
    # smallest distance below the threshold wins.  Skipped for an empty name,
    # which would otherwise match any entity whose name is 1-2 characters.
    if not match_id and nombre:
        best_score = _FUZZY_MAX_DISTANCE
        for sigem_name, sigem_id in entidades.items():
            dist_val = levenshtein(nombre, sigem_name)
            if dist_val < best_score:
                best_score = dist_val
                match_id = sigem_id

    # Fallback: synthetic id built from the SNC codes with a "99" prefix.
    if not match_id:
        match_id = f"99{dpto}{dist}"

    sql_lines.append(f"({_sql_literal(match_id)}, {_sql_literal(dpto)}, {dist})")

# Without rows the script would consist of the DELETE plus an INSERT with an
# empty VALUES list: it would wipe the table and then fail.  Stop instead.
if not sql_lines:
    raise SystemExit(
        f"No se generó ningún registro; no se escribe {OUTPUT_FILE}."
    )

# Write the SQL script.
# NOTE(review): the DELETE and the INSERT are not wrapped in a transaction;
# if the target is PostgreSQL, consider running the file with
# `psql --single-transaction` so a failed INSERT does not leave the table empty.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    f.write("DELETE FROM public.snc_catalog_mapping;\n")
    f.write("INSERT INTO public.snc_catalog_mapping (entidad_id, dpto_snc, dist_snc) VALUES \n")
    f.write(",\n".join(sql_lines))
    f.write(";\n")

print(f"Reconstrucción finalizada: {len(sql_lines)} registros con lógica Fuzzy Match.")