Skip to content

lixo, vias de curitiba

Peter edited this page Nov 12, 2018 · 8 revisions
---- Curitiba from planet_osm:

ALTER TABLE curitiba_lotes201807 ADD COLUMN street_lexlabel text;
UPDATE curitiba_lotes201807 SET street_lexlabel=stable.name2lex(street);
drop VIEW vw_curitiba_vias;
CREATE VIEW vw_curitiba_vias AS
  SELECT osm_id, osm_id2, 'road' as tp, 
         tags->>'name' as name, stable.name2lex(tags->>'name') lexlabel,
         way
  FROM  planet_osm_roads 
  WHERE osm_id IN (
    select osm_id from  stable.city_test_inside 
    where city_id=297514
  ) and tags->>'name' > ''
  ORDER BY 1,3,2,4
;

-- nomes de via no OSM:
-- SELECT name, array_agg(DISTINCT osm_id order by osm_id) osm_ids 
-- FROM vw_curitiba_vias group by 1 order by 1

CREATE VIEW vw_curitiba_vias_lexlabel_equiparadas AS
 SELECT vianome_osm, vianome_lote, c.street_lexlabel, array_agg_cat(v.osm_ids) osm_ids
 FROM (
  SELECT  name as vianome_osm, stable.name2lex(name) as lexlabel, count(*), array_agg(osm_id) as osm_ids 
  FROM vw_curitiba_vias GROUP BY 1,2
 ) v INNER JOIN (
   SELECT  DISTINCT street as vianome_lote, street_lexlabel 
   FROM curitiba_lotes201807
 ) c
 ON v.lexlabel=c.street_lexlabel  -- destacar ~40 equiparações com AND vianome_osm!=vianome_lote
 GROUP BY 1,2,3
; -- ~300 vias

CREATE VIEW vw_curitiba_lotes201807_vizinhos AS
   SELECT street_lexlabel, st_geohash(geom) as geohash
   FROM curitiba_lotes201807
;

Auxiliares reusadas, a simplificar:

-----------

CREATE FUNCTION stable.normalizeterm(
	--
	-- Converts string into standard sequence of lower-case words.
  -- Para uso nas URNs, NAO USAR para normalização de texto em geral TXT.
	--
	text,       		-- 1. input string (many words separed by spaces or punctuation)
	text DEFAULT ' ', 	-- 2. output separator
	int DEFAULT 0,	-- 3. max lenght of the result (system limit). 0=full.
	p_sep2 text DEFAULT ' , ' -- 4. output separator between terms
) RETURNS text AS $f$
  SELECT  substring(
	LOWER(TRIM( regexp_replace(  -- for review: regex(regex()) for ` , , ` remove
		trim(regexp_replace($1,E'[\\n\\r \\+/,;:\\(\\)\\{\\}\\[\\]="\\s ]*[\\+/,;:\\(\\)\\{\\}\\[\\]="]+[\\+/,;:\\(\\)\\{\\}\\[\\]="\\s ]*|[\\s ]+[–\\-][\\s ]+',
				   p_sep2, 'g'),' ,'),   -- s*ps*|s-s
		E'[\\s ;\\|"]+[\\.\'][\\s ;\\|"]+|[\\s ;\\|"]+',    -- s.s|s
		$2,
		'g'
	), $2 )),
  1,
	CASE WHEN $3<=0 OR $3 IS NULL THEN char_length($1) ELSE $3 END
  );
$f$ LANGUAGE SQL IMMUTABLE;


CREATE FUNCTION stable.normalizeterm2(
	text,
	boolean DEFAULT true  -- cut
) RETURNS text AS $f$
   SELECT (  stable.normalizeterm(
          CASE WHEN $2 THEN substring($1 from '^[^\(\)\/;]+' ) ELSE $1 END,
	  ' ',
	  255,
          ' / '
   ));
$f$ LANGUAGE SQL IMMUTABLE;


CREATE or replace FUNCTION stable.name2lex(
	-- usar unaccent(x) e [^\w], antes convertendo D'Xx para Xx  e preservando data-iso com _x_
  p_name text
  ,p_normalize boolean DEFAULT true
  ,p_cut boolean DEFAULT true
	,p_flag boolean DEFAULT true -- unaccent flag
) RETURNS text AS $f$
	 SELECT CASE WHEN p_flag THEN urn ELSE urn END
	 FROM (
	   SELECT trim(replace(
		   regexp_replace(
			     CASE 
			      WHEN p_normalize AND p_flag THEN stable.normalizeterm2(unaccent($1),p_cut) 
			      WHEN p_normalize THEN stable.normalizeterm2($1,p_cut) 
			      ELSE $1 
			     END,
			     E' d[aeo] | d[oa]s | com | para |^d[aeo] | / .+| [aeo]s | [aeo] |[\-\' ]',
			     '.',
			     'g'
			   ),
			   '..',
		     '.'
		    ),'.')
		) t(urn)
$f$ LANGUAGE SQL IMMUTABLE;

Notas

Algoritmo para vincular via com lotes de nome semelhante.

CREATE MATERIALIZED VIEW vw_curitiba_lotes201807_merged1 AS
  SELECT * --, lado_medio_m::float/lado_medio_deg metric_factor 
  FROM (
    SELECT street_lexlabel, substr(st_geohash(geom),1,6) as geohash6, 
           count(*) n
           ,st_union(array_agg(st_buffer(geom,0.00001))) merged_geom -- dissolving limits
    FROM curitiba_lotes201807
    where street_lexlabel>'' 
    group by 2,1 order by 2,1
  ) t
  WHERE length(geohash6)=6 and n>3
; --  11574
CREATE MATERIALIZED VIEW vw_curitiba_vias_candidatas1 AS
  SELECT v.osm_id, ST_Distance(v.way,m.merged_geom) as dist, -- avg=0.013 graus
         v.lexlabel, m.street_lexlabel
    FROM vw_curitiba_vias v INNER JOIN vw_curitiba_lotes201807_merged1 m
      ON  ST_DWithin(m.merged_geom,v.way,0.06)   -- distância em graus: 0.1 a 0.0001, centro em 0.013
          AND levenshtein(v.lexlabel, m.street_lexlabel) < 5  -- nomes similares: 2 a 10
           -- perc= levenshtein(v.lexlabel, m.street_lexlabel)::float / greatest(length(v.lexlabel), length(m.street_lexlabel)) < 0.05
    WHERE v.osm_id NOT IN (
        select distinct unnest(osm_ids) osm_id  from vw_curitiba_vias_lexlabel_equiparadas
    )
; -- ~500 conforme parametros

-- avaliando
SELECT lexlabel, street_lexlabel , 
       count(*) n, 
       array[round(min(dist),4), round(avg(dist),4), round(max(dist),4)] dists 
FROM vw_curitiba_vias_candidatas1 group by 1,2
having count(*)>1
;
-- relatorio
COPY (
 SELECT lexlabel, street_lexlabel , 
       count(*) n, 
       array[round(min(dist),4), round(avg(dist),4), round(max(dist),4)] dists, array_agg(osm_id) osm_ids 
 FROM vw_curitiba_vias_candidatas1 group by 1,2
) TO '/tmp/curitiba_vias_candidatas.csv'  CSV HEADER;


COPY (
  select * from vw_curitiba_vias_lexlabel_equiparadas
) to '/tmp/curitiba_vias_equiparadas.csv' CSV HEADER;
--  314

-- usar st_buffer(via,0.002 a 0.008) nas vias que cruzam lotes de vw_curitiba_vias_lexlabel_equiparadas

Com essa estratégia resta eleger duas palavras menos frequentes em ordem alfabetica para determinar semelhança por proximidade: listar distâncias para firmar buffer.