|
| 1 | +## PostgreSQL汉字转拼音或拼音首字母的应用 |
| 2 | + |
| 3 | +### 作者 |
| 4 | +digoal |
| 5 | + |
| 6 | +### 日期 |
| 7 | +2012-05-15 |
| 8 | + |
| 9 | +### 标签 |
| 10 | +PostgreSQL , 拼音 , 汉字转拼音 |
| 11 | + |
| 12 | +---- |
| 13 | + |
| 14 | +## 背景 |
| 15 | +能不能把在ORACLE里实现的汉字转拼音首字母的功能在PG里面实现以下。 |
| 16 | + |
| 17 | +比如股票软件,五粮液这个股,输入wly就可以出来了。 |
| 18 | + |
| 19 | +其实用到的就是汉字的拼音首字母,提高了用户体验。 |
| 20 | + |
| 21 | +在手机上打字比电脑上打字更慢,如果能实现这种类似的缩写可以大大提高用户体验,比如在搜索人名或者歌曲名,城市名时使用可以比较广泛。当然一般程序上就可以有类似的库可以调用,实现起来也比数据库效率要高。 |
| 22 | + |
| 23 | +我这里只是举个数据库实现的例子。 |
| 24 | + |
| 25 | +首先要找到汉字对应的拼音的码表,这个网上很多,我以gbk为例, 共计6727个汉字. 其他的编码如18030汉字更多,但是常用的GBK其实也够了. |
| 26 | + |
| 27 | +1、汉字和拼音以及拼音首字母的对照表 |
| 28 | + |
| 29 | +``` |
| 30 | +create table pinyin (hz varchar(1),py varchar(6),zm varchar(1)); |
| 31 | +``` |
| 32 | + |
| 33 | +2、索引以及唯一约束 |
| 34 | + |
| 35 | +``` |
| 36 | +create index idx_pinyin_hz on pinyin(hz); |
| 37 | + |
| 38 | +create unique index idx_pinyin_hz_py on pinyin(hz,py); |
| 39 | + |
| 40 | +-- create unique index idx_pinyin_hz_zm on pinyin(hz,zm); -- 这个可能不能唯一,例如多音字,可能首字母重复。 |
| 41 | +``` |
| 42 | + |
| 43 | +3、未收录汉字将插入以下表 |
| 44 | + |
| 45 | +``` |
| 46 | +create table new_discover (hz varchar(1) primary key,py varchar(6),zm varchar(1)); |
| 47 | +``` |
| 48 | + |
| 49 | +4、码表的SQL详见附件 |
| 50 | + |
| 51 | +接下来创建字符串转拼音和拼音首字母的函数, 注意有些是多音字, 所以这个函数以数组的形式输出, 组合了所有的多音字. |
| 52 | + |
| 53 | +仅支持UTF8编码,因为ascii函数转超过255的码时仅支持UTF8. |
| 54 | + |
| 55 | +如果要通用,可以把ascii的判断去掉.把ascii码表写入pinyin表即可. |
| 56 | + |
| 57 | +5、转换函数如下 |
| 58 | + |
| 59 | +``` |
| 60 | +CREATE OR REPLACE FUNCTION get_py_zm(i_hz text) returns setof record AS $$ |
| 61 | +DECLARE |
| 62 | +v_hz text; |
| 63 | +i int; |
| 64 | +v_sql1 text; |
| 65 | +v_sql2 text; |
| 66 | +v_sql3 text; |
| 67 | +v_sql4 text; |
| 68 | +v_sql text; |
| 69 | +v_max_id int; |
| 70 | +v_id int; |
| 71 | +BEGIN |
| 72 | +-- 存储字符串中每个汉字的拼音和拼音首字母的临时表 |
| 73 | +CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS; |
| 74 | +-- 如果要在一个事务中多次调用这个函数, 需要先清除临时表的数据. |
| 75 | +truncate table tmp_get_py_zm; |
| 76 | +-- 把字符串拆分成一个个字符,并转成拼音和拼音首字母存储到临时表中. |
| 77 | +i := 0; |
| 78 | +for v_hz in select regexp_split_to_table(i_hz,'') loop |
| 79 | + if ascii(v_hz) > 255 then |
| 80 | + insert into tmp_get_py_zm select i,py,zm from pinyin where hz=v_hz; |
| 81 | + else |
| 82 | + insert into tmp_get_py_zm values(i,v_hz,v_hz); |
| 83 | + end if; |
| 84 | + if not found then |
| 85 | + perform 1 from new_discover where hz = v_hz; |
| 86 | + if not found then |
| 87 | + insert into new_discover(hz) values(v_hz); |
| 88 | + end if; |
| 89 | + return; |
| 90 | + end if; |
| 91 | + i := i+1; |
| 92 | +end loop; |
| 93 | +-- 拼装输出数组的SQL |
| 94 | +select max(id) into v_max_id from tmp_get_py_zm; |
| 95 | +if v_max_id > 0 then |
| 96 | + v_sql1 := ''; |
| 97 | + v_sql3 := ''; |
| 98 | + v_sql4 := ''; |
| 99 | + v_id := 0; |
| 100 | + for v_id in select generate_series(0,v_max_id) loop |
| 101 | + if v_id <> v_max_id then |
| 102 | + v_sql1 := v_sql1||'(select py,zm from tmp_get_py_zm where id='||v_id||') as t'||v_id||','; |
| 103 | + v_sql3 := v_sql3||'t'||v_id||'.py::text||'; |
| 104 | + v_sql4 := v_sql4||'t'||v_id||'.zm::text||'; |
| 105 | + else |
| 106 | + v_sql1 := v_sql1||'(select py,zm from tmp_get_py_zm where id='||v_id||') as t'||v_id; |
| 107 | + v_sql3 := v_sql3||'t'||v_id||'.py::text'; |
| 108 | + v_sql4 := v_sql4||'t'||v_id||'.zm::text'; |
| 109 | + v_sql := 'select array_agg('||v_sql3||'),array_agg('||v_sql4||') from '||v_sql1; |
| 110 | + end if; |
| 111 | + end loop; |
| 112 | +else |
| 113 | + v_sql := 'select array_agg(py::text),array_agg(zm::text) from tmp_get_py_zm'; |
| 114 | +end if; |
| 115 | +-- 输出数组 |
| 116 | +return query execute v_sql; |
| 117 | +return; |
| 118 | +END; |
| 119 | +$$ language plpgsql; |
| 120 | +``` |
| 121 | + |
| 122 | +测试, 长是多音字, 所以输出了两个元素的数组 |
| 123 | + |
| 124 | +``` |
| 125 | +digoal=# select * from get_py_zm('你好a 长安, 我很好!') as (c1 text[],c2 text[]); |
| 126 | +NOTICE: relation "tmp_get_py_zm" already exists, skipping |
| 127 | +CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS" |
| 128 | +PL/pgSQL function "get_py_zm" line 13 at SQL statement |
| 129 | + c1 | c2 |
| 130 | +-----------------------------------------------------------+--------------------------------- |
| 131 | + {"nihaoa changan, wohenhao!","nihaoa zhangan, wohenhao!"} | {"nha ca, whh!","nha za, whh!"} |
| 132 | +``` |
| 133 | + |
| 134 | +测试无多音字的情况 |
| 135 | + |
| 136 | +``` |
| 137 | +digoal=# select * from get_py_zm('刘德华') as (c1 text[],c2 text[]); |
| 138 | +NOTICE: relation "tmp_get_py_zm" already exists, skipping |
| 139 | +CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS" |
| 140 | +PL/pgSQL function "get_py_zm" line 13 at SQL statement |
| 141 | + c1 | c2 |
| 142 | +------------+------- |
| 143 | + {liudehua} | {ldh} |
| 144 | + |
| 145 | +digoal=# select * from get_py_zm('五粮液') as (c1 text[],c2 text[]); |
| 146 | +NOTICE: relation "tmp_get_py_zm" already exists, skipping |
| 147 | +CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS" |
| 148 | +PL/pgSQL function "get_py_zm" line 13 at SQL statement |
| 149 | + c1 | c2 |
| 150 | +-------------+------- |
| 151 | + {wuliangye} | {wly} |
| 152 | + |
| 153 | +digoal=# select * from get_py_zm('五粮液 茅台') as (c1 text[],c2 text[]); |
| 154 | +NOTICE: relation "tmp_get_py_zm" already exists, skipping |
| 155 | +CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS" |
| 156 | +PL/pgSQL function "get_py_zm" line 13 at SQL statement |
| 157 | + c1 | c2 |
| 158 | +----------------------+------------ |
| 159 | + {"wuliangye maotai"} | {"wly mt"} |
| 160 | +``` |
| 161 | + |
| 162 | +输入一个码表中不存在的汉字 |
| 163 | + |
| 164 | +``` |
| 165 | +digoal=# select * from get_py_zm('爲') as (c1 text[],c2 text[]); |
| 166 | +NOTICE: relation "tmp_get_py_zm" already exists, skipping |
| 167 | +CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS" |
| 168 | +PL/pgSQL function "get_py_zm" line 13 at SQL statement |
| 169 | + c1 | c2 |
| 170 | +----+---- |
| 171 | +(0 rows) |
| 172 | +``` |
| 173 | + |
| 174 | +在码表中不存在的汉字将记录到new_discover 表 |
| 175 | + |
| 176 | +``` |
| 177 | +digoal=# select * from new_discover ; |
| 178 | + hz | py | zm |
| 179 | +----+----+---- |
| 180 | + 爲 | | |
| 181 | +``` |
| 182 | + |
| 183 | +码表中存在和不存在的汉字混合使用时也无输出 |
| 184 | + |
| 185 | +``` |
| 186 | +digoal=# select * from get_py_zm('这是繁體') as (c1 text[],c2 text[]); |
| 187 | +NOTICE: relation "tmp_get_py_zm" already exists, skipping |
| 188 | +CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS" |
| 189 | +PL/pgSQL function "get_py_zm" line 13 at SQL statement |
| 190 | + c1 | c2 |
| 191 | +----+---- |
| 192 | +(0 rows) |
| 193 | + |
| 194 | +digoal=# select * from new_discover ; |
| 195 | + hz | py | zm |
| 196 | +----+----+---- |
| 197 | + 爲 | | |
| 198 | + 體 | | |
| 199 | +``` |
| 200 | + |
| 201 | +所以在发现new_discover中有记录后应该尽快完善到码表中. |
| 202 | + |
| 203 | +例如 |
| 204 | + |
| 205 | +``` |
| 206 | +digoal=# begin; |
| 207 | +BEGIN |
| 208 | +digoal=# insert into pinyin values('爲','wei','w'); |
| 209 | +INSERT 0 1 |
| 210 | +digoal=# insert into pinyin values('體','ti','t'); |
| 211 | +INSERT 0 1 |
| 212 | +digoal=# delete from new_discover where hz in ('體','爲'); |
| 213 | +DELETE 2 |
| 214 | +digoal=# end; |
| 215 | +COMMIT |
| 216 | +``` |
| 217 | + |
| 218 | +再次执行以上查询时有返回结果 |
| 219 | + |
| 220 | +``` |
| 221 | +digoal=# select * from get_py_zm('这是繁體') as (c1 text[],c2 text[]); |
| 222 | +NOTICE: relation "tmp_get_py_zm" already exists, skipping |
| 223 | +CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS" |
| 224 | +PL/pgSQL function "get_py_zm" line 13 at SQL statement |
| 225 | + c1 | c2 |
| 226 | +---------------+-------- |
| 227 | + {zheshifanti} | {zsft} |
| 228 | +``` |
| 229 | + |
| 230 | +由于汉字在码表中不存在时, 将无转换输出, 所以需要改进一下. |
| 231 | + |
| 232 | +6、改进, 码表中没有的使用问号替代. |
| 233 | + |
| 234 | +创建返回类型以及改进函数, 由于函数输入参数类型一致, 返回结果不一致, 需要将原函数删除再创建 |
| 235 | + |
| 236 | +``` |
| 237 | +CREATE TYPE t_py_zm as (c1 text[],c2 text[]); |
| 238 | +CREATE OR REPLACE FUNCTION get_py_zm(i_hz text) returns setof t_py_zm AS $$ |
| 239 | +DECLARE |
| 240 | +v_hz text; |
| 241 | +i int; |
| 242 | +v_sql1 text; |
| 243 | +v_sql2 text; |
| 244 | +v_sql3 text; |
| 245 | +v_sql4 text; |
| 246 | +v_sql text; |
| 247 | +v_max_id int; |
| 248 | +v_id int; |
| 249 | +BEGIN |
| 250 | +set client_min_messages = warning; |
| 251 | +CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS; |
| 252 | +truncate table tmp_get_py_zm; |
| 253 | +i := 0; |
| 254 | +for v_hz in select regexp_split_to_table(i_hz,'') loop |
| 255 | + if ascii(v_hz) > 255 then |
| 256 | + insert into tmp_get_py_zm select i,py,zm from pinyin where hz=v_hz; |
| 257 | + else |
| 258 | + insert into tmp_get_py_zm values(i,v_hz,v_hz); |
| 259 | + end if; |
| 260 | + if not found then |
| 261 | + perform 1 from new_discover where hz = v_hz; |
| 262 | + if not found then |
| 263 | + insert into new_discover(hz) values(v_hz); |
| 264 | + end if; |
| 265 | + insert into tmp_get_py_zm values(i,'?','?'); |
| 266 | + end if; |
| 267 | + i := i+1; |
| 268 | +end loop; |
| 269 | + |
| 270 | +select max(id) into v_max_id from tmp_get_py_zm; |
| 271 | +if v_max_id > 0 then |
| 272 | + v_sql1 := ''; |
| 273 | + v_sql3 := ''; |
| 274 | + v_sql4 := ''; |
| 275 | + v_id := 0; |
| 276 | + for v_id in select generate_series(0,v_max_id) loop |
| 277 | + if v_id <> v_max_id then |
| 278 | + v_sql1 := v_sql1||'(select py,zm from tmp_get_py_zm where id='||v_id||') as t'||v_id||','; |
| 279 | + v_sql3 := v_sql3||'t'||v_id||'.py::text||'; |
| 280 | + v_sql4 := v_sql4||'t'||v_id||'.zm::text||'; |
| 281 | + else |
| 282 | + v_sql1 := v_sql1||'(select py,zm from tmp_get_py_zm where id='||v_id||') as t'||v_id; |
| 283 | + v_sql3 := v_sql3||'t'||v_id||'.py::text'; |
| 284 | + v_sql4 := v_sql4||'t'||v_id||'.zm::text'; |
| 285 | + v_sql := 'select array_agg('||v_sql3||'),array_agg('||v_sql4||') from '||v_sql1; |
| 286 | + end if; |
| 287 | + end loop; |
| 288 | +else |
| 289 | + v_sql := 'select array_agg(py::text),array_agg(zm::text) from tmp_get_py_zm'; |
| 290 | +end if; |
| 291 | + |
| 292 | +return query execute v_sql; |
| 293 | +return; |
| 294 | +END; |
| 295 | +$$ language plpgsql; |
| 296 | +``` |
| 297 | + |
| 298 | + |
| 299 | +修改后的测试, 點選两个汉字在码表中不存在. 将以问号代替. |
| 300 | + |
| 301 | +``` |
| 302 | +digoal=# select * from get_py_zm('这是繁體,點選'); |
| 303 | + c1 | c2 |
| 304 | +--------------------+------------- |
| 305 | + {"zheshifanti,??"} | {"zsft,??"} |
| 306 | +``` |
| 307 | + |
| 308 | +以问号代替不存在的汉字并插入new_discover表 |
| 309 | + |
| 310 | +``` |
| 311 | +digoal=# select * from new_discover ; |
| 312 | + hz | py | zm |
| 313 | +----+----+---- |
| 314 | + 點 | | |
| 315 | + 選 | | |
| 316 | +``` |
| 317 | + |
| 318 | +接下来测试可能使用的场景 |
| 319 | + |
| 320 | +创建包含歌手,歌曲信息的表. |
| 321 | + |
| 322 | +``` |
| 323 | +create table music (id serial primary key, song text, singer text, songpy text[], songzm text[], singerpy text[], singerzm text[]); |
| 324 | +``` |
| 325 | + |
| 326 | +插入一条测试数据 |
| 327 | + |
| 328 | +``` |
| 329 | +insert into music(song,singer,songpy,songzm,singerpy,singerzm) select '刘德华','中国人',t1.c1,t1.c2,t2.c1,t2.c2 from get_py_zm('刘德华') t1,get_py_zm('中国人') t2; |
| 330 | +``` |
| 331 | + |
| 332 | +查看转码是否成功 |
| 333 | + |
| 334 | +``` |
| 335 | +digoal=# select * from music; |
| 336 | + id | song | singer | songpy | songzm | singerpy | singerzm |
| 337 | +----+--------+--------+------------+--------+---------------+---------- |
| 338 | + 1 | 刘德华 | 中国人 | {liudehua} | {ldh} | {zhongguoren} | {zgr} |
| 339 | +(1 row) |
| 340 | +``` |
| 341 | + |
| 342 | +插入10W条测试记录 |
| 343 | + |
| 344 | +``` |
| 345 | +digoal=# insert into music(song,singer,songpy,songzm,singerpy,singerzm) |
| 346 | +select t.info, t.info, (t.t1).c1, (t.t1).c2, (t.t2).c1, (t.t2).c2 |
| 347 | +from |
| 348 | + (select t.info,get_py_zm(t.info) t1,get_py_zm(t.info) t2 |
| 349 | + from |
| 350 | + (select '刘德华'||generate_series(1,100000) as info) as t |
| 351 | + ) |
| 352 | +as t; |
| 353 | +``` |
| 354 | + |
| 355 | +创建索引, 加快按字母检索的速度 |
| 356 | + |
| 357 | +``` |
| 358 | +digoal=# create index idx_music_songpy on music using gin (songpy); |
| 359 | +CREATE INDEX |
| 360 | +digoal=# create index idx_music_songzm on music using gin (songzm); |
| 361 | +CREATE INDEX |
| 362 | +digoal=# create index idx_music_singerpy on music using gin (singerpy); |
| 363 | +CREATE INDEX |
| 364 | +digoal=# create index idx_music_singerzm on music using gin (singerzm); |
| 365 | +CREATE INDEX |
| 366 | +``` |
| 367 | + |
| 368 | +检索测试 |
| 369 | + |
| 370 | +``` |
| 371 | +digoal=# explain analyze select * from music where singerzm @> array['ldh1']; |
| 372 | + QUERY PLAN |
| 373 | +----------------------------------------------------------------------------------------------------------------------------- |
| 374 | + Bitmap Heap Scan on music (cost=8.78..188.78 rows=100 width=176) (actual time=0.024..0.024 rows=1 loops=1) |
| 375 | + Recheck Cond: (singerzm @> '{ldh1}'::text[]) |
| 376 | + -> Bitmap Index Scan on idx_music_singerzm (cost=0.00..8.75 rows=100 width=0) (actual time=0.018..0.018 rows=1 loops=1) |
| 377 | + Index Cond: (singerzm @> '{ldh1}'::text[]) |
| 378 | + Total runtime: 0.050 ms |
| 379 | +(5 rows) |
| 380 | + |
| 381 | +Time: 0.396 ms |
| 382 | +digoal=# select * from music where singerzm @> array['ldh1']; |
| 383 | + id | song | singer | songpy | songzm | singerpy | singerzm |
| 384 | +-------+---------+---------+-------------+--------+-------------+---------- |
| 385 | + 40947 | 刘德华1 | 刘德华1 | {liudehua1} | {ldh1} | {liudehua1} | {ldh1} |
| 386 | +(1 row) |
| 387 | + |
| 388 | +Time: 0.308 ms |
| 389 | +``` |
| 390 | + |
| 391 | +速度还不错. |
| 392 | + |
| 393 | +[汉字拼音对照文件](20120515_01_rar_001.rar) |
0 commit comments