-
Notifications
You must be signed in to change notification settings - Fork 5
lixo, vias de curitiba
Peter edited this page Nov 12, 2018
·
8 revisions
---- Curitiba from planet_osm:
ALTER TABLE curitiba_lotes201807 ADD COLUMN street_lexlabel text;
UPDATE curitiba_lotes201807 SET street_lexlabel=stable.name2lex(street);
drop VIEW vw_curitiba_vias;
CREATE VIEW vw_curitiba_vias AS
SELECT osm_id, osm_id2, 'road' as tp,
tags->>'name' as name, stable.name2lex(tags->>'name') lexlabel,
way
FROM planet_osm_roads
WHERE osm_id IN (
select osm_id from stable.city_test_inside
where city_id=297514
) and tags->>'name' > ''
ORDER BY 1,3,2,4
;
-- nomes de via no OSM:
-- SELECT name, array_agg(DISTINCT osm_id order by osm_id) osm_ids
-- FROM vw_curitiba_vias group by 1 order by 1
CREATE VIEW vw_curitiba_vias_lexlabel_equiparadas AS
SELECT vianome_osm, vianome_lote, c.street_lexlabel, array_agg_cat(v.osm_ids) osm_ids
FROM (
SELECT name as vianome_osm, stable.name2lex(name) as lexlabel, count(*), array_agg(osm_id) as osm_ids
FROM vw_curitiba_vias GROUP BY 1,2
) v INNER JOIN (
SELECT DISTINCT street as vianome_lote, street_lexlabel
FROM curitiba_lotes201807
) c
ON v.lexlabel=c.street_lexlabel -- destacar ~40 equiparações com AND vianome_osm!=vianome_lote
GROUP BY 1,2,3
; -- ~300 vias
CREATE VIEW vw_curitiba_lotes201807_vizinhos AS
SELECT street_lexlabel, st_geohash(geom) as geohash
FROM curitiba_lotes201807
;
Auxiliares reusadas, a simplificar:
-----------
CREATE FUNCTION stable.normalizeterm(
--
-- Converts string into standard sequence of lower-case words.
-- Para uso nas URNs, NAO USAR para normalização de texto em geral TXT.
--
text, -- 1. input string (many words separed by spaces or punctuation)
text DEFAULT ' ', -- 2. output separator
int DEFAULT 0, -- 3. max lenght of the result (system limit). 0=full.
p_sep2 text DEFAULT ' , ' -- 4. output separator between terms
) RETURNS text AS $f$
SELECT substring(
LOWER(TRIM( regexp_replace( -- for review: regex(regex()) for ` , , ` remove
trim(regexp_replace($1,E'[\\n\\r \\+/,;:\\(\\)\\{\\}\\[\\]="\\s ]*[\\+/,;:\\(\\)\\{\\}\\[\\]="]+[\\+/,;:\\(\\)\\{\\}\\[\\]="\\s ]*|[\\s ]+[–\\-][\\s ]+',
p_sep2, 'g'),' ,'), -- s*ps*|s-s
E'[\\s ;\\|"]+[\\.\'][\\s ;\\|"]+|[\\s ;\\|"]+', -- s.s|s
$2,
'g'
), $2 )),
1,
CASE WHEN $3<=0 OR $3 IS NULL THEN char_length($1) ELSE $3 END
);
$f$ LANGUAGE SQL IMMUTABLE;
CREATE FUNCTION stable.normalizeterm2(
text,
boolean DEFAULT true -- cut
) RETURNS text AS $f$
SELECT ( stable.normalizeterm(
CASE WHEN $2 THEN substring($1 from '^[^\(\)\/;]+' ) ELSE $1 END,
' ',
255,
' / '
));
$f$ LANGUAGE SQL IMMUTABLE;
CREATE or replace FUNCTION stable.name2lex(
-- usar unaccent(x) e [^\w], antes convertendo D'Xx para Xx e preservando data-iso com _x_
p_name text
,p_normalize boolean DEFAULT true
,p_cut boolean DEFAULT true
,p_flag boolean DEFAULT true -- unaccent flag
) RETURNS text AS $f$
SELECT CASE WHEN p_flag THEN urn ELSE urn END
FROM (
SELECT trim(replace(
regexp_replace(
CASE
WHEN p_normalize AND p_flag THEN stable.normalizeterm2(unaccent($1),p_cut)
WHEN p_normalize THEN stable.normalizeterm2($1,p_cut)
ELSE $1
END,
E' d[aeo] | d[oa]s | com | para |^d[aeo] | / .+| [aeo]s | [aeo] |[\-\' ]',
'.',
'g'
),
'..',
'.'
),'.')
) t(urn)
$f$ LANGUAGE SQL IMMUTABLE;
Algoritmo para vincular via com lotes de nome semelhante.
CREATE MATERIALIZED VIEW vw_curitiba_lotes201807_merged1 AS
SELECT * --, lado_medio_m::float/lado_medio_deg metric_factor
FROM (
SELECT street_lexlabel, substr(st_geohash(geom),1,6) as geohash6,
count(*) n
,st_union(array_agg(st_buffer(geom,0.00001))) merged_geom -- dissolving limits
FROM curitiba_lotes201807
where street_lexlabel>''
group by 2,1 order by 2,1
) t
WHERE length(geohash6)=6 and n>3
; -- 11574
CREATE MATERIALIZED VIEW vw_curitiba_vias_candidatas1 AS
SELECT v.osm_id, ST_Distance(v.way,m.merged_geom) as dist, -- avg=0.013 graus
v.lexlabel, m.street_lexlabel
FROM vw_curitiba_vias v INNER JOIN vw_curitiba_lotes201807_merged1 m
ON ST_DWithin(m.merged_geom,v.way,0.06) -- distância em graus: 0.1 a 0.0001, centro em 0.013
AND levenshtein(v.lexlabel, m.street_lexlabel) < 5 -- nomes similares: 2 a 10
-- perc= levenshtein(v.lexlabel, m.street_lexlabel)::float / greatest(length(v.lexlabel), length(m.street_lexlabel)) < 0.05
WHERE v.osm_id NOT IN (
select distinct unnest(osm_ids) osm_id from vw_curitiba_vias_lexlabel_equiparadas
)
; -- ~500 conforme parametros
-- avaliando
SELECT lexlabel, street_lexlabel ,
count(*) n,
array[round(min(dist),4), round(avg(dist),4), round(max(dist),4)] dists
FROM vw_curitiba_vias_candidatas1 group by 1,2
having count(*)>1
;
-- relatorio
COPY (
SELECT lexlabel, street_lexlabel ,
count(*) n,
array[round(min(dist),4), round(avg(dist),4), round(max(dist),4)] dists, array_agg(osm_id) osm_ids
FROM vw_curitiba_vias_candidatas1 group by 1,2
) TO '/tmp/curitiba_vias_candidatas.csv' CSV HEADER;
COPY (
select * from vw_curitiba_vias_lexlabel_equiparadas
) to '/tmp/curitiba_vias_equiparadas.csv' CSV HEADER;
-- 314
-- usar st_buffer(via,0.002 a 0.008) nas vias que cruzam lotes de vw_curitiba_vias_lexlabel_equiparadas
Com essa estratégia resta eleger duas palavras menos frequentes em ordem alfabetica para determinar semelhança por proximidade: listar distâncias para firmar buffer.