diff --git a/.gitignore b/.gitignore
index 0640ec27..53852d66 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,4 +25,5 @@
 sf*/
 sf*.tar
 sf*.tar.gz
+paramgen/__pycache__/
 tools/paramgen/__pycache__/
\ No newline at end of file
diff --git a/tools/paramgen/parameter_curation.py b/paramgen/parameter_curation.py
similarity index 96%
rename from tools/paramgen/parameter_curation.py
rename to paramgen/parameter_curation.py
index 6a4de363..baa29fda 100644
--- a/tools/paramgen/parameter_curation.py
+++ b/paramgen/parameter_curation.py
@@ -24,7 +24,7 @@
 THRESH_HOLD = 0
 THRESH_HOLD_6 = 0
 
-TRUNCATION_LIMIT = 10000
+TRUNCATION_LIMIT = 500
 BATCH_SIZE = 5000
 
 TIME_TRUNCATE = True
@@ -345,7 +345,7 @@ def process_iter_queries(query_id):
         else:
             amount_bucket_path = os.path.join(table_dir, 'trans_withdraw_bucket')
             time_bucket_path = os.path.join(table_dir, 'trans_withdraw_month')
-        output_path = os.path.join(out_dir, 'tcr8.txt')
+        output_path = os.path.join(out_dir, 'complex_8_param.csv')
         steps = 2
 
     elif query_id == 1:
@@ -356,7 +356,7 @@ def process_iter_queries(query_id):
         else:
             amount_bucket_path = os.path.join(table_dir, 'transfer_out_bucket')
             time_bucket_path = os.path.join(table_dir, 'transfer_out_month')
-        output_path = os.path.join(out_dir, 'tcr1.txt')
+        output_path = os.path.join(out_dir, 'complex_1_param.csv')
         steps = 2
 
     elif query_id == 5:
@@ -378,7 +378,7 @@ def process_iter_queries(query_id):
         else:
             amount_bucket_path = os.path.join(table_dir, 'transfer_in_bucket')
             time_bucket_path = os.path.join(table_dir, 'transfer_in_month')
-        output_path = os.path.join(out_dir, 'tcr2.txt')
+        output_path = os.path.join(out_dir, 'complex_2_param.csv')
         steps = 3
 
     elif query_id == 3:
@@ -394,7 +394,7 @@ def process_iter_queries(query_id):
         account_account_path = os.path.join(table_dir, 'person_guarantee_list')
         amount_bucket_path = os.path.join(table_dir, 'person_guarantee_count')
         time_bucket_path = os.path.join(table_dir, 'person_guarantee_month')
-        output_path = os.path.join(out_dir, 'tcr11.txt')
+        output_path = os.path.join(out_dir, 'complex_11_param.csv')
         steps = 3
 
     first_account_df = process_csv(first_account_path)
@@ -474,14 +474,14 @@ def process_iter_queries(query_id):
     elif query_id == 5:
         csvWriter_5 = CSVSerializer()
-        csvWriter_5.setOutputFile(output_path + 'tcr5.txt')
+        csvWriter_5.setOutputFile(output_path + 'complex_5_param.csv')
         csvWriter_5.registerHandler(hendleIdParam, final_first_items, "id")
         csvWriter_5.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime")
         csvWriter_5.registerHandler(handleTruncateLimitParam, truncate_limit_list, "truncationLimit")
         csvWriter_5.registerHandler(handleTruncateOrderParam, truncate_order_list, "truncationOrder")
 
         csvWriter_12 = CSVSerializer()
-        csvWriter_12.setOutputFile(output_path + 'tcr12.txt')
+        csvWriter_12.setOutputFile(output_path + 'complex_12_param.csv')
         csvWriter_12.registerHandler(hendleIdParam, final_first_items, "id")
         csvWriter_12.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime")
         csvWriter_12.registerHandler(handleTruncateLimitParam, truncate_limit_list, "truncationLimit")
@@ -519,7 +519,7 @@ def process_iter_queries(query_id):
             final_second_items_4.append(final_first_items[k])
 
     csvWriter_3 = CSVSerializer()
-    csvWriter_3.setOutputFile(output_path + 'tcr3.txt')
+    csvWriter_3.setOutputFile(output_path + 'complex_3_param.csv')
     csvWriter_3.registerHandler(hendleIdParam, final_first_items, "id1")
    csvWriter_3.registerHandler(hendleIdParam, final_second_items_3, "id2")
     csvWriter_3.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime")
@@ -527,7 +527,7 @@ def process_iter_queries(query_id):
     csvWriter_3.registerHandler(handleTruncateOrderParam, truncate_order_list, "truncationOrder")
 
     csvWriter_4 = CSVSerializer()
-    csvWriter_4.setOutputFile(output_path + 'tcr4.txt')
+    csvWriter_4.setOutputFile(output_path + 'complex_4_param.csv')
     csvWriter_4.registerHandler(hendleIdParam, final_first_items, "id1")
     csvWriter_4.registerHandler(hendleIdParam, final_second_items_4, "id2")
     csvWriter_4.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime")
@@ -559,7 +559,7 @@ def process_1_hop_query(query_id):
     elif query_id == 10:
         first_count_path = os.path.join(table_dir, 'person_invest_company')
         time_bucket_path = os.path.join(table_dir, 'invest_month')
-        output_path = os.path.join(out_dir, 'tcr10.txt')
+        output_path = os.path.join(out_dir, 'complex_10_param.csv')
 
     first_count_df = process_csv(first_count_path)
     time_bucket_df = process_csv(time_bucket_path)
@@ -579,7 +579,7 @@ def process_1_hop_query(query_id):
     if query_id == 7:
         csvWriter_7 = CSVSerializer()
-        csvWriter_7.setOutputFile(output_path + 'tcr7.txt')
+        csvWriter_7.setOutputFile(output_path + 'complex_7_param.csv')
         csvWriter_7.registerHandler(hendleIdParam, final_first_items, "id")
         csvWriter_7.registerHandler(handleThresholdParam, thresh_list, "threshold")
         csvWriter_7.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime")
@@ -587,7 +587,7 @@ def process_1_hop_query(query_id):
         csvWriter_7.registerHandler(handleTruncateOrderParam, truncate_order_list, "truncationOrder")
 
         csvWriter_9 = CSVSerializer()
-        csvWriter_9.setOutputFile(output_path + 'tcr9.txt')
+        csvWriter_9.setOutputFile(output_path + 'complex_9_param.csv')
         csvWriter_9.registerHandler(hendleIdParam, final_first_items, "id")
         csvWriter_9.registerHandler(handleThresholdParam, thresh_list, "threshold")
         csvWriter_9.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime")
@@ -627,7 +627,7 @@ def process_withdraw_query():
     else:
         withdraw_bucket_path = os.path.join(table_dir, 'withdraw_in_bucket')
         transfer_bucket_path = os.path.join(table_dir, 'transfer_in_bucket')
-    output_path = os.path.join(out_dir, 'tcr6.txt')
+    output_path = os.path.join(out_dir, 'complex_6_param.csv')
 
     first_account_df = process_csv(first_account_path)
     time_bucket_df = process_csv(time_bucket_path)
diff --git a/tools/paramgen/search_params.py b/paramgen/search_params.py
similarity index 100%
rename from tools/paramgen/search_params.py
rename to paramgen/search_params.py
diff --git a/tools/paramgen/time_select.py b/paramgen/time_select.py
similarity index 100%
rename from tools/paramgen/time_select.py
rename to paramgen/time_select.py
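The handlers registered above also define each file's header: CSVSerializer is given column names such as "id", "startTime|endTime", "truncationLimit" and "truncationOrder", so the renamed complex_*_param.csv files read naturally as '|'-separated tables. A minimal reader sketch, assuming a '|' delimiter, a header row built from those registered names, and the out/sf3/ output directory used elsewhere in this change; the load_params helper is hypothetical, not part of the repo:

import csv

def load_params(path="out/sf3/complex_1_param.csv"):
    # Assumed layout: one header row with the registered column names
    # (id, startTime, endTime, truncationLimit, truncationOrder), then
    # one '|'-separated row per generated parameter tuple.
    with open(path, newline="") as f:
        return list(csv.DictReader(f, delimiter="|"))

if __name__ == "__main__":
    for row in load_params()[:3]:
        print(row["id"], row["startTime"], row["endTime"])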
"startTime|endTime") @@ -527,7 +527,7 @@ def process_iter_queries(query_id): csvWriter_3.registerHandler(handleTruncateOrderParam, truncate_order_list, "truncationOrder") csvWriter_4 = CSVSerializer() - csvWriter_4.setOutputFile(output_path + 'tcr4.txt') + csvWriter_4.setOutputFile(output_path + 'complex_4_param.csv') csvWriter_4.registerHandler(hendleIdParam, final_first_items, "id1") csvWriter_4.registerHandler(hendleIdParam, final_second_items_4, "id2") csvWriter_4.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime") @@ -559,7 +559,7 @@ def process_1_hop_query(query_id): elif query_id == 10: first_count_path = os.path.join(table_dir, 'person_invest_company') time_bucket_path = os.path.join(table_dir, 'invest_month') - output_path = os.path.join(out_dir, 'tcr10.txt') + output_path = os.path.join(out_dir, 'complex_10_param.csv') first_count_df = process_csv(first_count_path) time_bucket_df = process_csv(time_bucket_path) @@ -579,7 +579,7 @@ def process_1_hop_query(query_id): if query_id == 7: csvWriter_7 = CSVSerializer() - csvWriter_7.setOutputFile(output_path + 'tcr7.txt') + csvWriter_7.setOutputFile(output_path + 'complex_7_param.csv') csvWriter_7.registerHandler(hendleIdParam, final_first_items, "id") csvWriter_7.registerHandler(handleThresholdParam, thresh_list, "threshold") csvWriter_7.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime") @@ -587,7 +587,7 @@ def process_1_hop_query(query_id): csvWriter_7.registerHandler(handleTruncateOrderParam, truncate_order_list, "truncationOrder") csvWriter_9 = CSVSerializer() - csvWriter_9.setOutputFile(output_path + 'tcr9.txt') + csvWriter_9.setOutputFile(output_path + 'complex_9_param.csv') csvWriter_9.registerHandler(hendleIdParam, final_first_items, "id") csvWriter_9.registerHandler(handleThresholdParam, thresh_list, "threshold") csvWriter_9.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime") @@ -627,7 +627,7 @@ def process_withdraw_query(): else: withdraw_bucket_path = os.path.join(table_dir, 'withdraw_in_bucket') transfer_bucket_path = os.path.join(table_dir, 'transfer_in_bucket') - output_path = os.path.join(out_dir, 'tcr6.txt') + output_path = os.path.join(out_dir, 'complex_6_param.csv') first_account_df = process_csv(first_account_path) time_bucket_df = process_csv(time_bucket_path) diff --git a/tools/paramgen/search_params.py b/paramgen/search_params.py similarity index 100% rename from tools/paramgen/search_params.py rename to paramgen/search_params.py diff --git a/tools/paramgen/time_select.py b/paramgen/time_select.py similarity index 100% rename from tools/paramgen/time_select.py rename to paramgen/time_select.py diff --git a/scripts/run_paramgen.sh b/scripts/run_paramgen.sh new file mode 100644 index 00000000..67138c61 --- /dev/null +++ b/scripts/run_paramgen.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar +OUTPUT_DIR=out/sf3/ + +# Note: generate factor tables with --generate-factors + +echo "start factor table generation" + +time spark-submit --master local[*] \ + --class ldbc.finbench.datagen.LdbcDatagen \ + --driver-memory 480g \ + ${LDBC_FINBENCH_DATAGEN_JAR} \ + --output-dir ${OUTPUT_DIR} \ + --factor-format csv \ + --generate-factors + +echo "start parameter curation" \ No newline at end of file diff --git a/src/main/resources/scale_factors.xml b/src/main/resources/scale_factors.xml index 18d70cdb..192a223e 100644 --- a/src/main/resources/scale_factors.xml +++ 
diff --git a/src/main/resources/scale_factors.xml b/src/main/resources/scale_factors.xml
index 18d70cdb..192a223e 100644
--- a/src/main/resources/scale_factors.xml
+++ b/src/main/resources/scale_factors.xml
@@ -1,40 +1,17 @@
-
-        generator.numPersons
-        800
-
-
-        generator.numCompanies
-        400
-
-
-        generator.numMediums
-        1000
-
-
-        transfer.minNumDegree
-        1
-
-
-        transfer.maxNumDegree
-        1000
-
-
-
         generator.numPersons
-        8000
+        1000
         generator.numCompanies
-        4000
+        1000
         generator.numMediums
-        10000
+        2000
         transfer.minNumDegree
@@ -49,15 +26,15 @@
         generator.numPersons
-        24000
+        3000
         generator.numCompanies
-        12000
+        3000
         generator.numMediums
-        30000
+        6000
         transfer.minNumDegree
@@ -92,18 +69,18 @@
-
+
diff --git a/tools/statistic.py b/tools/statistic.py
index 39623a03..45e5a2c9 100644
--- a/tools/statistic.py
+++ b/tools/statistic.py
@@ -4,10 +4,18 @@
 import collections
 
-def print_counts(counts):
+labels = ["person","personOwnAccount","personApplyLoan","personGuarantee","personInvest","blank","company","companyOwnAccount","companyApplyLoan","companyGuarantee","companyInvest","blank","account","transfer","withdraw","blank","loan","loantransfer","deposit","repay","blank","medium","signIn"]
+
+def print_original_counts(counts):
     for key, value in collections.OrderedDict(sorted(counts.items())).items():
         print("{}:{}".format(key, value))
 
+def print_formatted_counts(counts):
+    for label in labels:
+        if label == "blank":
+            print("================================")
+        else:
+            print("{}:{}".format(label, counts[label]))
 
 def count_entites(path):
     counts = {}
@@ -18,7 +26,9 @@ def count_entites(path):
         for file in glob.glob(os.path.join(subdir_path, "*.csv")):
             num_entites += sum(1 for _ in open(file)) - 1
         counts[subdir] = num_entites
-    print_counts(counts)
+    print_original_counts(counts)
+    print("\n========== Formatted Output ============\n")
+    print_formatted_counts(counts)
 
 
 if __name__ == "__main__":
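One behavioral difference worth noting: print_original_counts iterates over whatever directories were actually found, while print_formatted_counts indexes counts by the fixed labels list and will raise a KeyError if a labeled subdirectory is missing from the generated output. A self-contained toy run of the two styles, using a shortened labels list and made-up counts (purely illustrative, not real generator output):

import collections

labels = ["person", "blank", "account", "transfer"]

def print_original_counts(counts):
    # Sorted key:value dump over whatever was counted, as before the change.
    for key, value in collections.OrderedDict(sorted(counts.items())).items():
        print("{}:{}".format(key, value))

def print_formatted_counts(counts):
    # Fixed label order with separator rules, as added by this diff;
    # assumes every non-"blank" label is present in counts.
    for label in labels:
        if label == "blank":
            print("================================")
        else:
            print("{}:{}".format(label, counts[label]))

counts = {"person": 1000, "account": 1887, "transfer": 9500}
print_original_counts(counts)
print("\n========== Formatted Output ============\n")
print_formatted_counts(counts)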