Skip to content

Commit f63c423

Browse files
Revamp age csv loader (#2044)
* Allow 0 as entry_id - No regression tests were impacted by this change. * Use batch inserts to improve performance - Changed heap_insert to heap_multi_insert since it is faster than calling heap_insert() in a loop. When multiple tuples can be inserted on a single page, a single WAL record covers all of them, and the page only needs to be locked/unlocked once. - BATCH_SIZE is set to 1000, which is the number of tuples to insert in a single batch. This number was chosen after some experimentation. - Change some of the field names to avoid confusion. * Use sequence for generating ids for edge and vertex - The sequence is not used if id_field_exists is true in the load_labels_from_file function, since the entry id is present in the csv. * Add function to create temporary table for ids - Create a temporary table and populate it with already generated vertex ids. A unique index is created on the id column to ensure that new ids generated (using the entry id from the csv) are unique. * Insert generated ids in the temporary table to enforce uniqueness - Insert ids in the temporary table and also update the index to enforce uniqueness. - If the entry id provided in the CSV is greater than the current sequence value, the sequence value is updated to match the entry ID. For example: Suppose the current sequence value is 1, and the CSV entry ID is 2. If we use 2 but do not update the sequence to 2, the next time the CREATE clause is used, 2 will be returned by the sequence as an entry id, resulting in a duplicate. - Update batch functions * Add functions to create graph and label automatically - These functions will check the existence of the graph and label, and create them if they don't exist. * Add regression tests
1 parent 57b67c0 commit f63c423

File tree

13 files changed

+749
-110
lines changed

13 files changed

+749
-110
lines changed

regress/expected/age_load.out

+108-36
Original file line numberDiff line numberDiff line change
@@ -19,41 +19,87 @@
1919
\! cp -r regress/age_load/data regress/instance/data/age_load
2020
LOAD 'age';
2121
SET search_path TO ag_catalog;
22+
-- Create a country using CREATE clause
2223
SELECT create_graph('agload_test_graph');
2324
NOTICE: graph "agload_test_graph" has been created
2425
create_graph
2526
--------------
2627

2728
(1 row)
2829

29-
SELECT create_vlabel('agload_test_graph','Country');
30-
NOTICE: VLabel "Country" has been created
31-
create_vlabel
32-
---------------
33-
30+
SELECT * FROM cypher('agload_test_graph', $$CREATE (n:Country {__id__:1}) RETURN n$$) as (n agtype);
31+
n
32+
----------------------------------------------------------------------------------
33+
{"id": 844424930131969, "label": "Country", "properties": {"__id__": 1}}::vertex
3434
(1 row)
3535

36+
--
37+
-- Load countries with id
38+
--
3639
SELECT load_labels_from_file('agload_test_graph', 'Country',
37-
'age_load/countries.csv');
40+
'age_load/countries.csv', true);
3841
load_labels_from_file
3942
-----------------------
4043

4144
(1 row)
4245

43-
SELECT create_vlabel('agload_test_graph','City');
44-
NOTICE: VLabel "City" has been created
45-
create_vlabel
46-
---------------
47-
46+
-- A temporary table should have been created with 54 ids; 1 from CREATE and 53 from file
47+
SELECT COUNT(*)=54 FROM "_agload_test_graph_ag_vertex_ids";
48+
?column?
49+
----------
50+
t
51+
(1 row)
52+
53+
-- Sequence should be equal to max entry id i.e. 248
54+
SELECT currval('agload_test_graph."Country_id_seq"')=248;
55+
?column?
56+
----------
57+
t
4858
(1 row)
4959

60+
-- Should error out on loading the same file again due to duplicate id
61+
SELECT load_labels_from_file('agload_test_graph', 'Country',
62+
'age_load/countries.csv', true);
63+
ERROR: Cannot insert duplicate vertex id: 844424930131970
64+
HINT: Entry id 2 is already used
65+
--
66+
-- Load cities with id
67+
--
68+
-- Should create City label automatically and load cities
5069
SELECT load_labels_from_file('agload_test_graph', 'City',
51-
'age_load/cities.csv');
70+
'age_load/cities.csv', true);
71+
NOTICE: VLabel "City" has been created
5272
load_labels_from_file
5373
-----------------------
5474

5575
(1 row)
5676

77+
-- Temporary table should have 54+72485 rows now
78+
SELECT COUNT(*)=54+72485 FROM "_agload_test_graph_ag_vertex_ids";
79+
?column?
80+
----------
81+
t
82+
(1 row)
83+
84+
-- Sequence should be equal to max entry id i.e. 146941
85+
SELECT currval('agload_test_graph."City_id_seq"')=146941;
86+
?column?
87+
----------
88+
t
89+
(1 row)
90+
91+
-- Should error out on loading the same file again due to duplicate id
92+
SELECT load_labels_from_file('agload_test_graph', 'City',
93+
'age_load/cities.csv', true);
94+
ERROR: Cannot insert duplicate vertex id: 1125899906842777
95+
HINT: Entry id 153 is already used
96+
--
97+
-- Load edges -- Connects cities to countries
98+
--
99+
-- Should error out for using vertex label
100+
SELECT load_edges_from_file('agload_test_graph', 'Country',
101+
'age_load/edges.csv');
102+
ERROR: label "Country" already exists as edge label
57103
SELECT create_elabel('agload_test_graph','has_city');
58104
NOTICE: ELabel "has_city" has been created
59105
create_elabel
@@ -68,6 +114,17 @@ SELECT load_edges_from_file('agload_test_graph', 'has_city',
68114

69115
(1 row)
70116

117+
-- Sequence should be equal to number of edges loaded i.e. 72485
118+
SELECT currval('agload_test_graph."has_city_id_seq"')=72485;
119+
?column?
120+
----------
121+
t
122+
(1 row)
123+
124+
-- Should error out for using edge label
125+
SELECT load_labels_from_file('agload_test_graph', 'has_city',
126+
'age_load/cities.csv');
127+
ERROR: label "has_city" already exists as vertex label
71128
SELECT table_catalog, table_schema, lower(table_name) as table_name, table_type
72129
FROM information_schema.tables
73130
WHERE table_schema = 'agload_test_graph' ORDER BY table_name ASC;
@@ -83,7 +140,7 @@ WHERE table_schema = 'agload_test_graph' ORDER BY table_name ASC;
83140
SELECT COUNT(*) FROM agload_test_graph."Country";
84141
count
85142
-------
86-
53
143+
54
87144
(1 row)
88145

89146
SELECT COUNT(*) FROM agload_test_graph."City";
@@ -101,7 +158,7 @@ SELECT COUNT(*) FROM agload_test_graph."has_city";
101158
SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH(n) RETURN n$$) as (n agtype);
102159
count
103160
-------
104-
72538
161+
72539
105162
(1 row)
106163

107164
SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH (a)-[e]->(b) RETURN e$$) as (n agtype);
@@ -110,6 +167,17 @@ SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH (a)-[e]->(b) RETURN e$$
110167
72485
111168
(1 row)
112169

170+
--
171+
-- Load countries and cities without id
172+
--
173+
-- Should load countries in Country label without error since it should use sequence now
174+
SELECT load_labels_from_file('agload_test_graph', 'Country',
175+
'age_load/countries.csv', false);
176+
load_labels_from_file
177+
-----------------------
178+
179+
(1 row)
180+
113181
SELECT create_vlabel('agload_test_graph','Country2');
114182
NOTICE: VLabel "Country2" has been created
115183
create_vlabel
@@ -153,6 +221,7 @@ SELECT COUNT(*) FROM agload_test_graph."City2";
153221
SELECT id FROM agload_test_graph."Country" LIMIT 10;
154222
id
155223
-----------------
224+
844424930131969
156225
844424930131970
157226
844424930131971
158227
844424930131974
@@ -162,7 +231,6 @@ SELECT id FROM agload_test_graph."Country" LIMIT 10;
162231
844424930131996
163232
844424930132002
164233
844424930132023
165-
844424930132025
166234
(10 rows)
167235

168236
SELECT id FROM agload_test_graph."Country2" LIMIT 10;
@@ -180,42 +248,57 @@ SELECT id FROM agload_test_graph."Country2" LIMIT 10;
180248
1688849860263946
181249
(10 rows)
182250

251+
-- Should return 2 rows for Country with same properties, but different ids
183252
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country {iso2 : 'BE'})
184253
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
185254
id(n) | n.name | n.iso2
186255
-----------------+-----------+--------
187256
844424930131990 | "Belgium" | "BE"
188-
(1 row)
257+
844424930132223 | "Belgium" | "BE"
258+
(2 rows)
189259

260+
-- Should return 1 row
190261
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country2 {iso2 : 'BE'})
191262
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
192263
id(n) | n.name | n.iso2
193264
------------------+-----------+--------
194265
1688849860263942 | "Belgium" | "BE"
195266
(1 row)
196267

268+
-- Should return 2 rows for Country with same properties, but different ids
197269
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country {iso2 : 'AT'})
198270
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
199271
id(n) | n.name | n.iso2
200272
-----------------+-----------+--------
201273
844424930131983 | "Austria" | "AT"
202-
(1 row)
274+
844424930132221 | "Austria" | "AT"
275+
(2 rows)
203276

277+
-- Should return 1 row
204278
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country2 {iso2 : 'AT'})
205279
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
206280
id(n) | n.name | n.iso2
207281
------------------+-----------+--------
208282
1688849860263940 | "Austria" | "AT"
209283
(1 row)
210284

285+
-- Should return 2 rows for Country with same properties, but different ids
211286
SELECT * FROM cypher('agload_test_graph', $$
212287
MATCH (u:Country {region : "Europe"})
213288
WHERE u.name =~ 'Cro.*'
214-
RETURN u.name, u.region
215-
$$) AS (result_1 agtype, result_2 agtype);
216-
result_1 | result_2
217-
-----------+----------
218-
"Croatia" | "Europe"
289+
RETURN id(u), u.name, u.region
290+
$$) AS ("id(u)" agtype, result_1 agtype, result_2 agtype);
291+
id(u) | result_1 | result_2
292+
-----------------+-----------+----------
293+
844424930132023 | "Croatia" | "Europe"
294+
844424930132226 | "Croatia" | "Europe"
295+
(2 rows)
296+
297+
-- There shouldn't be any duplicates
298+
SELECT * FROM cypher('agload_test_graph', $$return graph_stats('agload_test_graph')$$) as (a agtype);
299+
a
300+
------------------------------------------------------------------------------------------
301+
{"graph": "agload_test_graph", "num_loaded_edges": 72485, "num_loaded_vertices": 145130}
219302
(1 row)
220303

221304
SELECT drop_graph('agload_test_graph', true);
@@ -236,22 +319,11 @@ NOTICE: graph "agload_test_graph" has been dropped
236319
--
237320
-- Test property type conversion
238321
--
239-
SELECT create_graph('agload_conversion');
240-
NOTICE: graph "agload_conversion" has been created
241-
create_graph
242-
--------------
243-
244-
(1 row)
245-
246322
-- vertex: load as agtype
247-
SELECT create_vlabel('agload_conversion','Person1');
248-
NOTICE: VLabel "Person1" has been created
249-
create_vlabel
250-
---------------
251-
252-
(1 row)
253-
323+
-- Should create graph and label automatically
254324
SELECT load_labels_from_file('agload_conversion', 'Person1', 'age_load/conversion_vertices.csv', true, true);
325+
NOTICE: graph "agload_conversion" has been created
326+
NOTICE: VLabel "Person1" has been created
255327
load_labels_from_file
256328
-----------------------
257329

regress/sql/age_load.sql

+69-8
Original file line numberDiff line numberDiff line change
@@ -22,20 +22,65 @@
2222
LOAD 'age';
2323

2424
SET search_path TO ag_catalog;
25+
26+
-- Create a country using CREATE clause
2527
SELECT create_graph('agload_test_graph');
2628

27-
SELECT create_vlabel('agload_test_graph','Country');
29+
SELECT * FROM cypher('agload_test_graph', $$CREATE (n:Country {__id__:1}) RETURN n$$) as (n agtype);
30+
31+
--
32+
-- Load countries with id
33+
--
34+
SELECT load_labels_from_file('agload_test_graph', 'Country',
35+
'age_load/countries.csv', true);
36+
37+
-- A temporary table should have been created with 54 ids; 1 from CREATE and 53 from file
38+
SELECT COUNT(*)=54 FROM "_agload_test_graph_ag_vertex_ids";
39+
40+
-- Sequence should be equal to max entry id i.e. 248
41+
SELECT currval('agload_test_graph."Country_id_seq"')=248;
42+
43+
-- Should error out on loading the same file again due to duplicate id
2844
SELECT load_labels_from_file('agload_test_graph', 'Country',
29-
'age_load/countries.csv');
45+
'age_load/countries.csv', true);
46+
47+
--
48+
-- Load cities with id
49+
--
50+
51+
-- Should create City label automatically and load cities
52+
SELECT load_labels_from_file('agload_test_graph', 'City',
53+
'age_load/cities.csv', true);
54+
55+
-- Temporary table should have 54+72485 rows now
56+
SELECT COUNT(*)=54+72485 FROM "_agload_test_graph_ag_vertex_ids";
3057

31-
SELECT create_vlabel('agload_test_graph','City');
58+
-- Sequence should be equal to max entry id i.e. 146941
59+
SELECT currval('agload_test_graph."City_id_seq"')=146941;
60+
61+
-- Should error out on loading the same file again due to duplicate id
3262
SELECT load_labels_from_file('agload_test_graph', 'City',
33-
'age_load/cities.csv');
63+
'age_load/cities.csv', true);
64+
65+
--
66+
-- Load edges -- Connects cities to countries
67+
--
68+
69+
-- Should error out for using vertex label
70+
SELECT load_edges_from_file('agload_test_graph', 'Country',
71+
'age_load/edges.csv');
3472

3573
SELECT create_elabel('agload_test_graph','has_city');
3674
SELECT load_edges_from_file('agload_test_graph', 'has_city',
3775
'age_load/edges.csv');
3876

77+
-- Sequence should be equal to number of edges loaded i.e. 72485
78+
SELECT currval('agload_test_graph."has_city_id_seq"')=72485;
79+
80+
-- Should error out for using edge label
81+
SELECT load_labels_from_file('agload_test_graph', 'has_city',
82+
'age_load/cities.csv');
83+
3984
SELECT table_catalog, table_schema, lower(table_name) as table_name, table_type
4085
FROM information_schema.tables
4186
WHERE table_schema = 'agload_test_graph' ORDER BY table_name ASC;
@@ -48,6 +93,14 @@ SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH(n) RETURN n$$) as (n ag
4893

4994
SELECT COUNT(*) FROM cypher('agload_test_graph', $$MATCH (a)-[e]->(b) RETURN e$$) as (n agtype);
5095

96+
--
97+
-- Load countries and cities without id
98+
--
99+
100+
-- Should load countries in Country label without error since it should use sequence now
101+
SELECT load_labels_from_file('agload_test_graph', 'Country',
102+
'age_load/countries.csv', false);
103+
51104
SELECT create_vlabel('agload_test_graph','Country2');
52105
SELECT load_labels_from_file('agload_test_graph', 'Country2',
53106
'age_load/countries.csv', false);
@@ -62,31 +115,39 @@ SELECT COUNT(*) FROM agload_test_graph."City2";
62115
SELECT id FROM agload_test_graph."Country" LIMIT 10;
63116
SELECT id FROM agload_test_graph."Country2" LIMIT 10;
64117

118+
-- Should return 2 rows for Country with same properties, but different ids
65119
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country {iso2 : 'BE'})
66120
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
121+
-- Should return 1 row
67122
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country2 {iso2 : 'BE'})
68123
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
69124

125+
-- Should return 2 rows for Country with same properties, but different ids
70126
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country {iso2 : 'AT'})
71127
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
128+
-- Should return 1 row
72129
SELECT * FROM cypher('agload_test_graph', $$MATCH(n:Country2 {iso2 : 'AT'})
73130
RETURN id(n), n.name, n.iso2 $$) as ("id(n)" agtype, "n.name" agtype, "n.iso2" agtype);
74131

132+
-- Should return 2 rows for Country with same properties, but different ids
75133
SELECT * FROM cypher('agload_test_graph', $$
76134
MATCH (u:Country {region : "Europe"})
77135
WHERE u.name =~ 'Cro.*'
78-
RETURN u.name, u.region
79-
$$) AS (result_1 agtype, result_2 agtype);
136+
RETURN id(u), u.name, u.region
137+
$$) AS ("id(u)" agtype, result_1 agtype, result_2 agtype);
138+
139+
-- There shouldn't be any duplicates
140+
SELECT * FROM cypher('agload_test_graph', $$return graph_stats('agload_test_graph')$$) as (a agtype);
80141

81142
SELECT drop_graph('agload_test_graph', true);
82143

83144
--
84145
-- Test property type conversion
85146
--
86-
SELECT create_graph('agload_conversion');
87147

88148
-- vertex: load as agtype
89-
SELECT create_vlabel('agload_conversion','Person1');
149+
150+
-- Should create graph and label automatically
90151
SELECT load_labels_from_file('agload_conversion', 'Person1', 'age_load/conversion_vertices.csv', true, true);
91152
SELECT * FROM cypher('agload_conversion', $$ MATCH (n:Person1) RETURN properties(n) $$) as (a agtype);
92153

0 commit comments

Comments
 (0)