Skip to content

Commit 2078210

Browse files
committed
new doc
1 parent 21aaf21 commit 2078210

File tree

6 files changed

+563
-0
lines changed

6 files changed

+563
-0
lines changed

201205/20120515_01.md

Lines changed: 393 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,393 @@
1+
## PostgreSQL汉字转拼音或拼音首字母的应用
2+
3+
### 作者
4+
digoal
5+
6+
### 日期
7+
2012-05-15
8+
9+
### 标签
10+
PostgreSQL , 拼音 , 汉字转拼音
11+
12+
----
13+
14+
## 背景
15+
能不能把在ORACLE里实现的汉字转拼音首字母的功能在PG里面实现以下。
16+
17+
比如股票软件,五粮液这个股,输入wly就可以出来了。
18+
19+
其实用到的就是汉字的拼音首字母,提高了用户体验。
20+
21+
在手机上打字比电脑上打字更慢,如果能实现这种类似的缩写可以大大提高用户体验,比如在搜索人名或者歌曲名,城市名时使用可以比较广泛。当然一般程序上就可以有类似的库可以调用,实现起来也比数据库效率要高。
22+
23+
我这里只是举个数据库实现的例子。
24+
25+
首先要找到汉字对应的拼音的码表,这个网上很多,我以gbk为例, 共计6727个汉字. 其他的编码如18030汉字更多,但是常用的GBK其实也够了.
26+
27+
1、汉字和拼音以及拼音首字母的对照表
28+
29+
```
30+
create table pinyin (hz varchar(1),py varchar(6),zm varchar(1));
31+
```
32+
33+
2、索引以及唯一约束
34+
35+
```
36+
create index idx_pinyin_hz on pinyin(hz);
37+
38+
create unique index idx_pinyin_hz_py on pinyin(hz,py);
39+
40+
-- create unique index idx_pinyin_hz_zm on pinyin(hz,zm); -- 这个可能不能唯一,例如多音字,可能首字母重复。
41+
```
42+
43+
3、未收录汉字将插入以下表
44+
45+
```
46+
create table new_discover (hz varchar(1) primary key,py varchar(6),zm varchar(1));
47+
```
48+
49+
4、码表的SQL详见附件
50+
51+
接下来创建字符串转拼音和拼音首字母的函数, 注意有些是多音字, 所以这个函数以数组的形式输出, 组合了所有的多音字.
52+
53+
仅支持UTF8编码,因为ascii函数转超过255的码时仅支持UTF8.
54+
55+
如果要通用,可以把ascii的判断去掉.把ascii码表写入pinyin表即可.
56+
57+
5、转换函数如下
58+
59+
```
60+
CREATE OR REPLACE FUNCTION get_py_zm(i_hz text) returns setof record AS $$
61+
DECLARE
62+
v_hz text;
63+
i int;
64+
v_sql1 text;
65+
v_sql2 text;
66+
v_sql3 text;
67+
v_sql4 text;
68+
v_sql text;
69+
v_max_id int;
70+
v_id int;
71+
BEGIN
72+
-- 存储字符串中每个汉字的拼音和拼音首字母的临时表
73+
CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS;
74+
-- 如果要在一个事务中多次调用这个函数, 需要先清除临时表的数据.
75+
truncate table tmp_get_py_zm;
76+
-- 把字符串拆分成一个个字符,并转成拼音和拼音首字母存储到临时表中.
77+
i := 0;
78+
for v_hz in select regexp_split_to_table(i_hz,'') loop
79+
if ascii(v_hz) > 255 then
80+
insert into tmp_get_py_zm select i,py,zm from pinyin where hz=v_hz;
81+
else
82+
insert into tmp_get_py_zm values(i,v_hz,v_hz);
83+
end if;
84+
if not found then
85+
perform 1 from new_discover where hz = v_hz;
86+
if not found then
87+
insert into new_discover(hz) values(v_hz);
88+
end if;
89+
return;
90+
end if;
91+
i := i+1;
92+
end loop;
93+
-- 拼装输出数组的SQL
94+
select max(id) into v_max_id from tmp_get_py_zm;
95+
if v_max_id > 0 then
96+
v_sql1 := '';
97+
v_sql3 := '';
98+
v_sql4 := '';
99+
v_id := 0;
100+
for v_id in select generate_series(0,v_max_id) loop
101+
if v_id <> v_max_id then
102+
v_sql1 := v_sql1||'(select py,zm from tmp_get_py_zm where id='||v_id||') as t'||v_id||',';
103+
v_sql3 := v_sql3||'t'||v_id||'.py::text||';
104+
v_sql4 := v_sql4||'t'||v_id||'.zm::text||';
105+
else
106+
v_sql1 := v_sql1||'(select py,zm from tmp_get_py_zm where id='||v_id||') as t'||v_id;
107+
v_sql3 := v_sql3||'t'||v_id||'.py::text';
108+
v_sql4 := v_sql4||'t'||v_id||'.zm::text';
109+
v_sql := 'select array_agg('||v_sql3||'),array_agg('||v_sql4||') from '||v_sql1;
110+
end if;
111+
end loop;
112+
else
113+
v_sql := 'select array_agg(py::text),array_agg(zm::text) from tmp_get_py_zm';
114+
end if;
115+
-- 输出数组
116+
return query execute v_sql;
117+
return;
118+
END;
119+
$$ language plpgsql;
120+
```
121+
122+
测试, 长是多音字, 所以输出了两个元素的数组
123+
124+
```
125+
digoal=# select * from get_py_zm('你好a 长安, 我很好!') as (c1 text[],c2 text[]);
126+
NOTICE: relation "tmp_get_py_zm" already exists, skipping
127+
CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS"
128+
PL/pgSQL function "get_py_zm" line 13 at SQL statement
129+
c1 | c2
130+
-----------------------------------------------------------+---------------------------------
131+
{"nihaoa changan, wohenhao!","nihaoa zhangan, wohenhao!"} | {"nha ca, whh!","nha za, whh!"}
132+
```
133+
134+
测试无多音字的情况
135+
136+
```
137+
digoal=# select * from get_py_zm('刘德华') as (c1 text[],c2 text[]);
138+
NOTICE: relation "tmp_get_py_zm" already exists, skipping
139+
CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS"
140+
PL/pgSQL function "get_py_zm" line 13 at SQL statement
141+
c1 | c2
142+
------------+-------
143+
{liudehua} | {ldh}
144+
145+
digoal=# select * from get_py_zm('五粮液') as (c1 text[],c2 text[]);
146+
NOTICE: relation "tmp_get_py_zm" already exists, skipping
147+
CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS"
148+
PL/pgSQL function "get_py_zm" line 13 at SQL statement
149+
c1 | c2
150+
-------------+-------
151+
{wuliangye} | {wly}
152+
153+
digoal=# select * from get_py_zm('五粮液 茅台') as (c1 text[],c2 text[]);
154+
NOTICE: relation "tmp_get_py_zm" already exists, skipping
155+
CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS"
156+
PL/pgSQL function "get_py_zm" line 13 at SQL statement
157+
c1 | c2
158+
----------------------+------------
159+
{"wuliangye maotai"} | {"wly mt"}
160+
```
161+
162+
输入一个码表中不存在的汉字
163+
164+
```
165+
digoal=# select * from get_py_zm('爲') as (c1 text[],c2 text[]);
166+
NOTICE: relation "tmp_get_py_zm" already exists, skipping
167+
CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS"
168+
PL/pgSQL function "get_py_zm" line 13 at SQL statement
169+
c1 | c2
170+
----+----
171+
(0 rows)
172+
```
173+
174+
在码表中不存在的汉字将记录到new_discover 表
175+
176+
```
177+
digoal=# select * from new_discover ;
178+
hz | py | zm
179+
----+----+----
180+
爲 | |
181+
```
182+
183+
码表中存在和不存在的汉字混合使用时也无输出
184+
185+
```
186+
digoal=# select * from get_py_zm('这是繁體') as (c1 text[],c2 text[]);
187+
NOTICE: relation "tmp_get_py_zm" already exists, skipping
188+
CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS"
189+
PL/pgSQL function "get_py_zm" line 13 at SQL statement
190+
c1 | c2
191+
----+----
192+
(0 rows)
193+
194+
digoal=# select * from new_discover ;
195+
hz | py | zm
196+
----+----+----
197+
爲 | |
198+
體 | |
199+
```
200+
201+
所以在发现new_discover中有记录后应该尽快完善到码表中.
202+
203+
例如
204+
205+
```
206+
digoal=# begin;
207+
BEGIN
208+
digoal=# insert into pinyin values('爲','wei','w');
209+
INSERT 0 1
210+
digoal=# insert into pinyin values('體','ti','t');
211+
INSERT 0 1
212+
digoal=# delete from new_discover where hz in ('體','爲');
213+
DELETE 2
214+
digoal=# end;
215+
COMMIT
216+
```
217+
218+
再次执行以上查询时有返回结果
219+
220+
```
221+
digoal=# select * from get_py_zm('这是繁體') as (c1 text[],c2 text[]);
222+
NOTICE: relation "tmp_get_py_zm" already exists, skipping
223+
CONTEXT: SQL statement "CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS"
224+
PL/pgSQL function "get_py_zm" line 13 at SQL statement
225+
c1 | c2
226+
---------------+--------
227+
{zheshifanti} | {zsft}
228+
```
229+
230+
由于汉字在码表中不存在时, 将无转换输出, 所以需要改进一下.
231+
232+
6、改进, 码表中没有的使用问号替代.
233+
234+
创建返回类型以及改进函数, 由于函数输入参数类型一致, 返回结果不一致, 需要将原函数删除再创建
235+
236+
```
237+
CREATE TYPE t_py_zm as (c1 text[],c2 text[]);
238+
CREATE OR REPLACE FUNCTION get_py_zm(i_hz text) returns setof t_py_zm AS $$
239+
DECLARE
240+
v_hz text;
241+
i int;
242+
v_sql1 text;
243+
v_sql2 text;
244+
v_sql3 text;
245+
v_sql4 text;
246+
v_sql text;
247+
v_max_id int;
248+
v_id int;
249+
BEGIN
250+
set client_min_messages = warning;
251+
CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm (id int,py varchar(6),zm varchar(1)) ON COMMIT DELETE ROWS;
252+
truncate table tmp_get_py_zm;
253+
i := 0;
254+
for v_hz in select regexp_split_to_table(i_hz,'') loop
255+
if ascii(v_hz) > 255 then
256+
insert into tmp_get_py_zm select i,py,zm from pinyin where hz=v_hz;
257+
else
258+
insert into tmp_get_py_zm values(i,v_hz,v_hz);
259+
end if;
260+
if not found then
261+
perform 1 from new_discover where hz = v_hz;
262+
if not found then
263+
insert into new_discover(hz) values(v_hz);
264+
end if;
265+
insert into tmp_get_py_zm values(i,'?','?');
266+
end if;
267+
i := i+1;
268+
end loop;
269+
270+
select max(id) into v_max_id from tmp_get_py_zm;
271+
if v_max_id > 0 then
272+
v_sql1 := '';
273+
v_sql3 := '';
274+
v_sql4 := '';
275+
v_id := 0;
276+
for v_id in select generate_series(0,v_max_id) loop
277+
if v_id <> v_max_id then
278+
v_sql1 := v_sql1||'(select py,zm from tmp_get_py_zm where id='||v_id||') as t'||v_id||',';
279+
v_sql3 := v_sql3||'t'||v_id||'.py::text||';
280+
v_sql4 := v_sql4||'t'||v_id||'.zm::text||';
281+
else
282+
v_sql1 := v_sql1||'(select py,zm from tmp_get_py_zm where id='||v_id||') as t'||v_id;
283+
v_sql3 := v_sql3||'t'||v_id||'.py::text';
284+
v_sql4 := v_sql4||'t'||v_id||'.zm::text';
285+
v_sql := 'select array_agg('||v_sql3||'),array_agg('||v_sql4||') from '||v_sql1;
286+
end if;
287+
end loop;
288+
else
289+
v_sql := 'select array_agg(py::text),array_agg(zm::text) from tmp_get_py_zm';
290+
end if;
291+
292+
return query execute v_sql;
293+
return;
294+
END;
295+
$$ language plpgsql;
296+
```
297+
298+
299+
修改后的测试, 點選两个汉字在码表中不存在. 将以问号代替.
300+
301+
```
302+
digoal=# select * from get_py_zm('这是繁體,點選');
303+
c1 | c2
304+
--------------------+-------------
305+
{"zheshifanti,??"} | {"zsft,??"}
306+
```
307+
308+
以问号代替不存在的汉字并插入new_discover表
309+
310+
```
311+
digoal=# select * from new_discover ;
312+
hz | py | zm
313+
----+----+----
314+
點 | |
315+
選 | |
316+
```
317+
318+
接下来测试可能使用的场景
319+
320+
创建包含歌手,歌曲信息的表.
321+
322+
```
323+
create table music (id serial primary key, song text, singer text, songpy text[], songzm text[], singerpy text[], singerzm text[]);
324+
```
325+
326+
插入一条测试数据
327+
328+
```
329+
insert into music(song,singer,songpy,songzm,singerpy,singerzm) select '刘德华','中国人',t1.c1,t1.c2,t2.c1,t2.c2 from get_py_zm('刘德华') t1,get_py_zm('中国人') t2;
330+
```
331+
332+
查看转码是否成功
333+
334+
```
335+
digoal=# select * from music;
336+
id | song | singer | songpy | songzm | singerpy | singerzm
337+
----+--------+--------+------------+--------+---------------+----------
338+
1 | 刘德华 | 中国人 | {liudehua} | {ldh} | {zhongguoren} | {zgr}
339+
(1 row)
340+
```
341+
342+
插入10W条测试记录
343+
344+
```
345+
digoal=# insert into music(song,singer,songpy,songzm,singerpy,singerzm)
346+
select t.info, t.info, (t.t1).c1, (t.t1).c2, (t.t2).c1, (t.t2).c2
347+
from
348+
(select t.info,get_py_zm(t.info) t1,get_py_zm(t.info) t2
349+
from
350+
(select '刘德华'||generate_series(1,100000) as info) as t
351+
)
352+
as t;
353+
```
354+
355+
创建索引, 加快按字母检索的速度
356+
357+
```
358+
digoal=# create index idx_music_songpy on music using gin (songpy);
359+
CREATE INDEX
360+
digoal=# create index idx_music_songzm on music using gin (songzm);
361+
CREATE INDEX
362+
digoal=# create index idx_music_singerpy on music using gin (singerpy);
363+
CREATE INDEX
364+
digoal=# create index idx_music_singerzm on music using gin (singerzm);
365+
CREATE INDEX
366+
```
367+
368+
检索测试
369+
370+
```
371+
digoal=# explain analyze select * from music where singerzm @> array['ldh1'];
372+
QUERY PLAN
373+
-----------------------------------------------------------------------------------------------------------------------------
374+
Bitmap Heap Scan on music (cost=8.78..188.78 rows=100 width=176) (actual time=0.024..0.024 rows=1 loops=1)
375+
Recheck Cond: (singerzm @> '{ldh1}'::text[])
376+
-> Bitmap Index Scan on idx_music_singerzm (cost=0.00..8.75 rows=100 width=0) (actual time=0.018..0.018 rows=1 loops=1)
377+
Index Cond: (singerzm @> '{ldh1}'::text[])
378+
Total runtime: 0.050 ms
379+
(5 rows)
380+
381+
Time: 0.396 ms
382+
digoal=# select * from music where singerzm @> array['ldh1'];
383+
id | song | singer | songpy | songzm | singerpy | singerzm
384+
-------+---------+---------+-------------+--------+-------------+----------
385+
40947 | 刘德华1 | 刘德华1 | {liudehua1} | {ldh1} | {liudehua1} | {ldh1}
386+
(1 row)
387+
388+
Time: 0.308 ms
389+
```
390+
391+
速度还不错.
392+
393+
[汉字拼音对照文件](20120515_01_rar_001.rar)

201205/20120515_01_rar_001.rar

114 KB
Binary file not shown.

201205/readme.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@
88
##### 20120518_01.md [《PostgreSQL 9.2 add array elements statistics》](20120518_01.md)
99
##### 20120517_01.md [《PostgreSQL 黑科技 range 类型及 gist index 助力物联网(IoT)》](20120517_01.md)
1010
##### 20120516_01.md [《PostgreSQL 共享事务快照功能 - PostgreSQL 9.2 can share snapshot between multi transactions》](20120516_01.md)
11+
##### 20120515_01.md [《PostgreSQL汉字转拼音或拼音首字母的应用》](20120515_01.md)
1112
##### 20120511_01.md [《PostgreSQL 逻辑备份一致性讲解 - Why pg_dump backup a database in consistent status》](20120511_01.md)

0 commit comments

Comments
 (0)