From 946fc48d753d878ead9183888550c7f0a5e604f8 Mon Sep 17 00:00:00 2001 From: Shipeng Qi Date: Sun, 29 Sep 2024 16:38:44 +0800 Subject: [PATCH] sort files and separate factor with datagen (#111) * sort files * update tools markdown doc * seperate factor and generate * add note --- scripts/run_local.sh | 24 ++ .../ldbc/finbench/datagen/LdbcDatagen.scala | 34 ++- .../factors/FactorGenerationStage.scala | 11 +- .../datagen/generation/GenerationStage.scala | 2 + tools/DataProfiler/result/db139/profile.log | 4 - tools/DataProfiler/result/db177/edges.log | 2 - tools/DataProfiler/result/db177/profile.log | 4 - tools/DataProfiler/result/db184/profile.log | 4 - tools/DataProfiler/result/transfer/edges.log | 2 - .../DataProfiler/result/transfer/profile.log | 4 - tools/README.md | 64 +++++- .../dataprofiler}/.gitignore | 0 .../dataprofiler}/CMakeLists.txt | 0 .../dataprofiler}/README.md | 0 .../dataprofiler}/algo.h | 0 .../dataprofiler}/compile.sh | 0 .../dataprofiler}/de_core.cpp | 0 .../dataprofiler}/plot.py | 0 .../dataprofiler}/profiler.cpp | 0 .../dataprofiler}/result/db139/edges.txt | 0 .../dataprofiler}/result/db139/in-out.txt | 0 .../result/db139/in_degree_dist.png | Bin .../result/db139/in_degree_dist.txt | 0 .../db139/in_degree_dist_regression.png | Bin .../db139/in_degree_dist_regression.txt | 0 .../dataprofiler}/result/db139/out-in.txt | 0 .../result/db139/out_degree_dist.png | Bin .../result/db139/out_degree_dist.txt | 0 .../db139/out_degree_dist_regression.png | Bin .../db139/out_degree_dist_regression.txt | 0 .../dataprofiler}/result/db177/in-out.txt | 0 .../result/db177/in_degree_dist.png | Bin .../result/db177/in_degree_dist.txt | 0 .../db177/in_degree_dist_regression.png | Bin .../db177/in_degree_dist_regression.txt | 0 .../dataprofiler}/result/db177/out-in.txt | 0 .../result/db177/out_degree_dist.png | Bin .../result/db177/out_degree_dist.txt | 0 .../db177/out_degree_dist_regression.png | Bin .../db177/out_degree_dist_regression.txt | 0 .../dataprofiler}/result/db184/edges.txt | 0 .../dataprofiler}/result/db184/in-out.txt | 0 .../result/db184/in_degree_dist.png | Bin .../result/db184/in_degree_dist.txt | 0 .../db184/in_degree_dist_regression.png | Bin .../db184/in_degree_dist_regression.txt | 0 .../dataprofiler}/result/db184/out-in.txt | 0 .../result/db184/out_degree_dist.png | Bin .../result/db184/out_degree_dist.txt | 0 .../db184/out_degree_dist_regression.png | Bin .../db184/out_degree_dist_regression.txt | 0 .../result/hubvertex_indeg/hub_indeg_1.png | Bin .../result/hubvertex_indeg/hub_indeg_1.txt | 0 .../hub_indeg_1_regression.png | Bin .../hub_indeg_1_regression.txt | 0 .../result/hubvertex_indeg/hub_indeg_2.png | Bin .../result/hubvertex_indeg/hub_indeg_2.txt | 0 .../hub_indeg_2_regression.png | Bin .../hub_indeg_2_regression.txt | 0 .../result/hubvertex_indeg/hub_indeg_3.png | Bin .../result/hubvertex_indeg/hub_indeg_3.txt | 0 .../hub_indeg_3_regression.png | Bin .../hub_indeg_3_regression.txt | 0 .../result/hubvertex_indeg/hub_indeg_4.png | Bin .../result/hubvertex_indeg/hub_indeg_4.txt | 0 .../hub_indeg_4_regression.png | Bin .../hub_indeg_4_regression.txt | 0 .../result/hubvertex_indeg/hub_indeg_5.png | Bin .../result/hubvertex_indeg/hub_indeg_5.txt | 0 .../hub_indeg_5_regression.png | Bin .../hub_indeg_5_regression.txt | 0 .../result/hubvertex_outdeg/hub_outdeg_1.png | Bin .../result/hubvertex_outdeg/hub_outdeg_1.txt | 0 .../hub_outdeg_1_regression.png | Bin .../hub_outdeg_1_regression.txt | 0 .../result/hubvertex_outdeg/hub_outdeg_2.png | Bin .../result/hubvertex_outdeg/hub_outdeg_2.txt | 0 .../hub_outdeg_2_regression.png | Bin .../hub_outdeg_2_regression.txt | 0 .../result/hubvertex_outdeg/hub_outdeg_3.png | Bin .../result/hubvertex_outdeg/hub_outdeg_3.txt | 0 .../hub_outdeg_3_regression.png | Bin .../hub_outdeg_3_regression.txt | 0 .../result/hubvertex_outdeg/hub_outdeg_4.png | Bin .../result/hubvertex_outdeg/hub_outdeg_4.txt | 0 .../hub_outdeg_4_regression.png | Bin .../hub_outdeg_4_regression.txt | 0 .../result/hubvertex_outdeg/hub_outdeg_5.png | Bin .../result/hubvertex_outdeg/hub_outdeg_5.txt | 0 .../hub_outdeg_5_regression.png | Bin .../hub_outdeg_5_regression.txt | 0 .../dataprofiler}/result/transfer/in-out.txt | 0 .../result/transfer/in_degree_dist.png | Bin .../result/transfer/in_degree_dist.txt | 0 .../transfer/in_degree_dist_regression.png | Bin .../transfer/in_degree_dist_regression.txt | 0 .../dataprofiler}/result/transfer/out-in.txt | 0 .../result/transfer/out_degree_dist.png | Bin .../result/transfer/out_degree_dist.txt | 0 .../transfer/out_degree_dist_regression.png | Bin .../transfer/out_degree_dist_regression.txt | 0 .../dataprofiler}/wcc_core.cpp | 0 .../factorgen}/factor_table.sh | 0 .../factorgen}/generate_account.py | 0 .../legacy => legacy/factorgen}/loan.py | 0 .../factorgen}/params_gen.properties | 0 .../legacy => legacy/factorgen}/params_gen.py | 0 .../factorgen}/split_amount.py | 0 .../legacy => legacy/factorgen}/time_split.py | 0 tools/{GraphGen => legacy/graphgen}/Makefile | 0 tools/{GraphGen => legacy/graphgen}/README.md | 0 .../{GraphGen => legacy/graphgen}/graph_gen.c | 0 {scripts => tools}/merge_cluster_output.py | 0 tools/paramgen/README.md | 49 ---- tools/paramgen/parameter_curation.py | 73 +++--- tools/paramgen/search_params.py | 214 +++++++++--------- tools/paramgen/time_select.py | 56 +++-- 117 files changed, 281 insertions(+), 266 deletions(-) delete mode 100644 tools/DataProfiler/result/db139/profile.log delete mode 100644 tools/DataProfiler/result/db177/edges.log delete mode 100644 tools/DataProfiler/result/db177/profile.log delete mode 100644 tools/DataProfiler/result/db184/profile.log delete mode 100644 tools/DataProfiler/result/transfer/edges.log delete mode 100644 tools/DataProfiler/result/transfer/profile.log rename tools/{DataProfiler => legacy/dataprofiler}/.gitignore (100%) rename tools/{DataProfiler => legacy/dataprofiler}/CMakeLists.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/README.md (100%) rename tools/{DataProfiler => legacy/dataprofiler}/algo.h (100%) rename tools/{DataProfiler => legacy/dataprofiler}/compile.sh (100%) rename tools/{DataProfiler => legacy/dataprofiler}/de_core.cpp (100%) rename tools/{DataProfiler => legacy/dataprofiler}/plot.py (100%) rename tools/{DataProfiler => legacy/dataprofiler}/profiler.cpp (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/edges.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in-out.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in_degree_dist.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in_degree_dist.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in_degree_dist_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in_degree_dist_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out-in.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out_degree_dist.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out_degree_dist.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out_degree_dist_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out_degree_dist_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in-out.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in_degree_dist.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in_degree_dist.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in_degree_dist_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in_degree_dist_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out-in.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out_degree_dist.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out_degree_dist.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out_degree_dist_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out_degree_dist_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/edges.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in-out.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in_degree_dist.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in_degree_dist.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in_degree_dist_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in_degree_dist_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out-in.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out_degree_dist.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out_degree_dist.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out_degree_dist_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out_degree_dist_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_1.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_1.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_1_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_1_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_2.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_2.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_2_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_2_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_3.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_3.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_3_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_3_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_4.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_4.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_4_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_4_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_5.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_5.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_5_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_5_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_1.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_1.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_1_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_1_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_2.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_2.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_2_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_2_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_3.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_3.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_3_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_3_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_4.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_4.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_4_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_4_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_5.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_5.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_5_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_5_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in-out.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in_degree_dist.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in_degree_dist.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in_degree_dist_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in_degree_dist_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out-in.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out_degree_dist.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out_degree_dist.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out_degree_dist_regression.png (100%) rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out_degree_dist_regression.txt (100%) rename tools/{DataProfiler => legacy/dataprofiler}/wcc_core.cpp (100%) rename tools/{paramgen/legacy => legacy/factorgen}/factor_table.sh (100%) rename tools/{paramgen/legacy => legacy/factorgen}/generate_account.py (100%) rename tools/{paramgen/legacy => legacy/factorgen}/loan.py (100%) rename tools/{paramgen/legacy => legacy/factorgen}/params_gen.properties (100%) rename tools/{paramgen/legacy => legacy/factorgen}/params_gen.py (100%) rename tools/{paramgen/legacy => legacy/factorgen}/split_amount.py (100%) rename tools/{paramgen/legacy => legacy/factorgen}/time_split.py (100%) rename tools/{GraphGen => legacy/graphgen}/Makefile (100%) rename tools/{GraphGen => legacy/graphgen}/README.md (100%) rename tools/{GraphGen => legacy/graphgen}/graph_gen.c (100%) rename {scripts => tools}/merge_cluster_output.py (100%) delete mode 100644 tools/paramgen/README.md diff --git a/scripts/run_local.sh b/scripts/run_local.sh index 3185dbe5..bc01c09f 100644 --- a/scripts/run_local.sh +++ b/scripts/run_local.sh @@ -3,12 +3,19 @@ LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar OUTPUT_DIR=out +# Note: generate factor tables with --generate-factors + # run locally with the python script # time python3 scripts/run.py --jar $LDBC_FINBENCH_DATAGEN_JAR --main-class ldbc.finbench.datagen.LdbcDatagen --memory 500g -- --scale-factor 30 --output-dir ${OUTPUT_DIR} # run locally with spark-submit command # **({'spark.driver.extraJavaOptions': '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005'}), # Debug # **({'spark.executor.extraJavaOptions': '-verbose:gc -XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'}), +# --conf "spark.memory.offHeap.enabled=true" \ +# --conf "spark.memory.offHeap.size=100g" \ +# --conf "spark.storage.memoryFraction=0" \ +# --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ + time spark-submit --master local[*] \ --class ldbc.finbench.datagen.LdbcDatagen \ --driver-memory 480g \ @@ -24,3 +31,20 @@ time spark-submit --master local[*] \ ${LDBC_FINBENCH_DATAGEN_JAR} \ --scale-factor 10 \ --output-dir ${OUTPUT_DIR} + +# currently works on SF100 +#time spark-submit --master local[*] \ +# --class ldbc.finbench.datagen.LdbcDatagen \ +# --driver-memory 400g \ +# --conf "spark.default.parallelism=800" \ +# --conf "spark.shuffle.compress=true" \ +# --conf "spark.shuffle.spill.compress=true" \ +# --conf "spark.kryoserializer.buffer.max=512m" \ +# --conf "spark.driver.maxResultSize=0" \ +# --conf "spark.driver.extraJavaOptions=-Xss512m" \ +# --conf "spark.executor.extraJavaOptions=-Xss512m -XX:+UseG1GC" \ +# --conf "spark.kryo.referenceTracking=false" \ +# ${LDBC_FINBENCH_DATAGEN_JAR} \ +# --scale-factor 100 \ +# --output-dir ${OUTPUT_DIR} + diff --git a/src/main/scala/ldbc/finbench/datagen/LdbcDatagen.scala b/src/main/scala/ldbc/finbench/datagen/LdbcDatagen.scala index 18d74096..16f17d7e 100644 --- a/src/main/scala/ldbc/finbench/datagen/LdbcDatagen.scala +++ b/src/main/scala/ldbc/finbench/datagen/LdbcDatagen.scala @@ -1,9 +1,7 @@ package ldbc.finbench.datagen import ldbc.finbench.datagen.factors.FactorGenerationStage -import ldbc.finbench.datagen.generation.dictionary.Dictionaries import ldbc.finbench.datagen.generation.GenerationStage -import ldbc.finbench.datagen.transformation.TransformationStage import ldbc.finbench.datagen.util.{Logging, SparkApp} import shapeless.lens @@ -24,7 +22,7 @@ object LdbcDatagen extends SparkApp with Logging { format: String = "csv", formatOptions: Map[String, String] = Map.empty, epochMillis: Boolean = false, - generateFactors: Boolean = true, + generateFactors: Boolean = false, factorFormat: String = "parquet" ) @@ -121,22 +119,22 @@ object LdbcDatagen extends SparkApp with Logging { } override def run(args: ArgsType): Unit = { - val generationArgs = GenerationStage.Args( - scaleFactor = args.scaleFactor, - outputDir = args.outputDir, - format = args.format, - partitionsOpt = args.numPartitions - ) - log.info("[Main] Starting generation stage") - GenerationStage.run(generationArgs) - - if (args.generateFactors) { - val factorArgs = FactorGenerationStage.Args( - outputDir = args.outputDir, - format = args.factorFormat + if (!args.generateFactors) { + GenerationStage.run( + GenerationStage.Args( + scaleFactor = args.scaleFactor, + outputDir = args.outputDir, + format = args.format, + partitionsOpt = args.numPartitions + ) + ) + } else { + FactorGenerationStage.run( + FactorGenerationStage.Args( + outputDir = args.outputDir, + format = args.factorFormat + ) ) - log.info("[Main] Starting factoring stage") -// FactorGenerationStage.run(factorArgs) } } } diff --git a/src/main/scala/ldbc/finbench/datagen/factors/FactorGenerationStage.scala b/src/main/scala/ldbc/finbench/datagen/factors/FactorGenerationStage.scala index e0bf0973..993348d0 100644 --- a/src/main/scala/ldbc/finbench/datagen/factors/FactorGenerationStage.scala +++ b/src/main/scala/ldbc/finbench/datagen/factors/FactorGenerationStage.scala @@ -1,5 +1,6 @@ package ldbc.finbench.datagen.factors +import ldbc.finbench.datagen.LdbcDatagen.log import ldbc.finbench.datagen.util.DatagenStage import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, SparkSession, functions => F} @@ -10,7 +11,6 @@ import shapeless.lens import scala.util.matching.Regex object FactorGenerationStage extends DatagenStage { - @transient lazy val log: Logger = LoggerFactory.getLogger(this.getClass) case class Args( @@ -64,14 +64,14 @@ object FactorGenerationStage extends DatagenStage { run(parsedArgs) } - // execute factorization process - // TODO: finish all + override def run(args: Args) = { - parameterCuration(args) + factortables(args) } - def parameterCuration(args: Args)(implicit spark: SparkSession) = { + def factortables(args: Args)(implicit spark: SparkSession) = { import spark.implicits._ + log.info("[Main] Starting factoring stage") val transferRDD = spark.read .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") @@ -533,6 +533,5 @@ object FactorGenerationStage extends DatagenStage { .option("delimiter", "|") .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") .save(s"${args.outputDir}/factor_table/upstream_amount") - } } diff --git a/src/main/scala/ldbc/finbench/datagen/generation/GenerationStage.scala b/src/main/scala/ldbc/finbench/datagen/generation/GenerationStage.scala index 3283a7ac..72d4c6b2 100644 --- a/src/main/scala/ldbc/finbench/datagen/generation/GenerationStage.scala +++ b/src/main/scala/ldbc/finbench/datagen/generation/GenerationStage.scala @@ -1,5 +1,6 @@ package ldbc.finbench.datagen.generation +import ldbc.finbench.datagen.LdbcDatagen.log import ldbc.finbench.datagen.config.{ConfigParser, DatagenConfiguration} import ldbc.finbench.datagen.io.raw.{Csv, Parquet, RawSink} import ldbc.finbench.datagen.util._ @@ -25,6 +26,7 @@ object GenerationStage extends DatagenStage with Logging { override type ArgsType = Args override def run(args: Args): Unit = { + log.info("[Main] Starting generation stage") // build and initialize the configs val config = buildConfig(args) // OPT: It is called in each SparkGenerator in Spark to initialize the context on the executors. diff --git a/tools/DataProfiler/result/db139/profile.log b/tools/DataProfiler/result/db139/profile.log deleted file mode 100644 index 1ce23716..00000000 --- a/tools/DataProfiler/result/db139/profile.log +++ /dev/null @@ -1,4 +0,0 @@ -Run degree profiling. -load_cost = 547.45(s) -exec_cost = 897.82(s) -total_cost = 1445.27(s) diff --git a/tools/DataProfiler/result/db177/edges.log b/tools/DataProfiler/result/db177/edges.log deleted file mode 100644 index f189d9ae..00000000 --- a/tools/DataProfiler/result/db177/edges.log +++ /dev/null @@ -1,2 +0,0 @@ -V 242919058, E 508196712, E/V 2.09204 -Unique edges: 2.58058e+08 / 508196712, Multiplicity: 1.96931 diff --git a/tools/DataProfiler/result/db177/profile.log b/tools/DataProfiler/result/db177/profile.log deleted file mode 100644 index d702927a..00000000 --- a/tools/DataProfiler/result/db177/profile.log +++ /dev/null @@ -1,4 +0,0 @@ -Run degree profiling. -load_cost = 418.57(s) -exec_cost = 1180.35(s) -total_cost = 1598.92(s) diff --git a/tools/DataProfiler/result/db184/profile.log b/tools/DataProfiler/result/db184/profile.log deleted file mode 100644 index 21a09254..00000000 --- a/tools/DataProfiler/result/db184/profile.log +++ /dev/null @@ -1,4 +0,0 @@ -Run degree profiling. -load_cost = 377.06(s) -exec_cost = 713.83(s) -total_cost = 1090.90(s) diff --git a/tools/DataProfiler/result/transfer/edges.log b/tools/DataProfiler/result/transfer/edges.log deleted file mode 100644 index 71307300..00000000 --- a/tools/DataProfiler/result/transfer/edges.log +++ /dev/null @@ -1,2 +0,0 @@ -V 181123257, E 527220984, E/V 2.91084 -Unique edges: 2.52555e+08 / 527220984, Multiplicity: 2.08755 diff --git a/tools/DataProfiler/result/transfer/profile.log b/tools/DataProfiler/result/transfer/profile.log deleted file mode 100644 index 0b7c6899..00000000 --- a/tools/DataProfiler/result/transfer/profile.log +++ /dev/null @@ -1,4 +0,0 @@ -Run degree profiling. -load_cost = 399.40(s) -exec_cost = 893.24(s) -total_cost = 1292.64(s) diff --git a/tools/README.md b/tools/README.md index 352d8d7c..73177dda 100644 --- a/tools/README.md +++ b/tools/README.md @@ -1,6 +1,62 @@ # Tools -Here lists some tools for graph data processing. -- dataprofiler: a tool for profiling graph data, including degree distribution, etc. -- graphgen: a simple tool/example code to generate power-law distributed graph data. -- paramgen: a Parameter Search tool to generate parameters for queries using TuGraph. \ No newline at end of file +- paramgen: + - parameter_curation: a tool for generating parameters for finbench queries +- check_*.py: python scripts used for check the data features like consistency, distribution +- merge_cluster_output.py: a python script to merge the output in cluster mode +- statistic.py: a python script to calculate the statistics of the data +- legacy: some legacy tools + - dataprofiler: a tool for profiling graph data, including degree distribution, etc. + - graphgen: a simple tool/example code to generate power-law distributed graph data. + - factorgen: factor table generators in python version + + +## ParamsGen + +`params_gen.py` uses the CREATE_VALIDATION feature to generate parameters. + +The specific steps are as follows: + +1. Select vertices of type Account, Person, and Loan from the dataset, and generate a parameter file that meets the input specifications for ldbc_finbench_driver. +2. Execute CREATE_VALIDATION to generate validation_params.csv. +3. Select non-empty results from validation_params.csv. + +Example: + +```bash +python3 params_gen.py 1 # gen tcr1 params +``` + +Other notes: + +1. The generated start_timestamp and end_timestamp in the current version are fixed values. +2. For tcr4 and tcr10, this method is not efficient enough. Use the following Cypher query to search for parameters: + +```Cypher +// tcr4 +MATCH + (n1:Account)-[:transfer]-> + (n2:Account)-[:transfer]-> + (n3:Account)-[:transfer]->(n4:Account) +WHERE + n1.id = n4.id AND n1.id > n2.id AND n2.id > n3.id +WITH + n1.id as n1id, + n2.id as n2id, + n3.id as n3id, + n4.id as n4id +LIMIT 1000 +RETURN DISTINCT toString(n1id)+"|"+toString(n2id) + +// tcr10 +MATCH + (c:Company)<-[:invest]-(p:Person) +WITH + c.id as cid, + count(p.id) as num, + collect(p.id) as person +WHERE num >= 2 +RETURN + tostring(person[0])+"|"+tostring(person[1]) +LIMIT 1000 +``` diff --git a/tools/DataProfiler/.gitignore b/tools/legacy/dataprofiler/.gitignore similarity index 100% rename from tools/DataProfiler/.gitignore rename to tools/legacy/dataprofiler/.gitignore diff --git a/tools/DataProfiler/CMakeLists.txt b/tools/legacy/dataprofiler/CMakeLists.txt similarity index 100% rename from tools/DataProfiler/CMakeLists.txt rename to tools/legacy/dataprofiler/CMakeLists.txt diff --git a/tools/DataProfiler/README.md b/tools/legacy/dataprofiler/README.md similarity index 100% rename from tools/DataProfiler/README.md rename to tools/legacy/dataprofiler/README.md diff --git a/tools/DataProfiler/algo.h b/tools/legacy/dataprofiler/algo.h similarity index 100% rename from tools/DataProfiler/algo.h rename to tools/legacy/dataprofiler/algo.h diff --git a/tools/DataProfiler/compile.sh b/tools/legacy/dataprofiler/compile.sh similarity index 100% rename from tools/DataProfiler/compile.sh rename to tools/legacy/dataprofiler/compile.sh diff --git a/tools/DataProfiler/de_core.cpp b/tools/legacy/dataprofiler/de_core.cpp similarity index 100% rename from tools/DataProfiler/de_core.cpp rename to tools/legacy/dataprofiler/de_core.cpp diff --git a/tools/DataProfiler/plot.py b/tools/legacy/dataprofiler/plot.py similarity index 100% rename from tools/DataProfiler/plot.py rename to tools/legacy/dataprofiler/plot.py diff --git a/tools/DataProfiler/profiler.cpp b/tools/legacy/dataprofiler/profiler.cpp similarity index 100% rename from tools/DataProfiler/profiler.cpp rename to tools/legacy/dataprofiler/profiler.cpp diff --git a/tools/DataProfiler/result/db139/edges.txt b/tools/legacy/dataprofiler/result/db139/edges.txt similarity index 100% rename from tools/DataProfiler/result/db139/edges.txt rename to tools/legacy/dataprofiler/result/db139/edges.txt diff --git a/tools/DataProfiler/result/db139/in-out.txt b/tools/legacy/dataprofiler/result/db139/in-out.txt similarity index 100% rename from tools/DataProfiler/result/db139/in-out.txt rename to tools/legacy/dataprofiler/result/db139/in-out.txt diff --git a/tools/DataProfiler/result/db139/in_degree_dist.png b/tools/legacy/dataprofiler/result/db139/in_degree_dist.png similarity index 100% rename from tools/DataProfiler/result/db139/in_degree_dist.png rename to tools/legacy/dataprofiler/result/db139/in_degree_dist.png diff --git a/tools/DataProfiler/result/db139/in_degree_dist.txt b/tools/legacy/dataprofiler/result/db139/in_degree_dist.txt similarity index 100% rename from tools/DataProfiler/result/db139/in_degree_dist.txt rename to tools/legacy/dataprofiler/result/db139/in_degree_dist.txt diff --git a/tools/DataProfiler/result/db139/in_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.png similarity index 100% rename from tools/DataProfiler/result/db139/in_degree_dist_regression.png rename to tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.png diff --git a/tools/DataProfiler/result/db139/in_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.txt similarity index 100% rename from tools/DataProfiler/result/db139/in_degree_dist_regression.txt rename to tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.txt diff --git a/tools/DataProfiler/result/db139/out-in.txt b/tools/legacy/dataprofiler/result/db139/out-in.txt similarity index 100% rename from tools/DataProfiler/result/db139/out-in.txt rename to tools/legacy/dataprofiler/result/db139/out-in.txt diff --git a/tools/DataProfiler/result/db139/out_degree_dist.png b/tools/legacy/dataprofiler/result/db139/out_degree_dist.png similarity index 100% rename from tools/DataProfiler/result/db139/out_degree_dist.png rename to tools/legacy/dataprofiler/result/db139/out_degree_dist.png diff --git a/tools/DataProfiler/result/db139/out_degree_dist.txt b/tools/legacy/dataprofiler/result/db139/out_degree_dist.txt similarity index 100% rename from tools/DataProfiler/result/db139/out_degree_dist.txt rename to tools/legacy/dataprofiler/result/db139/out_degree_dist.txt diff --git a/tools/DataProfiler/result/db139/out_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.png similarity index 100% rename from tools/DataProfiler/result/db139/out_degree_dist_regression.png rename to tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.png diff --git a/tools/DataProfiler/result/db139/out_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.txt similarity index 100% rename from tools/DataProfiler/result/db139/out_degree_dist_regression.txt rename to tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.txt diff --git a/tools/DataProfiler/result/db177/in-out.txt b/tools/legacy/dataprofiler/result/db177/in-out.txt similarity index 100% rename from tools/DataProfiler/result/db177/in-out.txt rename to tools/legacy/dataprofiler/result/db177/in-out.txt diff --git a/tools/DataProfiler/result/db177/in_degree_dist.png b/tools/legacy/dataprofiler/result/db177/in_degree_dist.png similarity index 100% rename from tools/DataProfiler/result/db177/in_degree_dist.png rename to tools/legacy/dataprofiler/result/db177/in_degree_dist.png diff --git a/tools/DataProfiler/result/db177/in_degree_dist.txt b/tools/legacy/dataprofiler/result/db177/in_degree_dist.txt similarity index 100% rename from tools/DataProfiler/result/db177/in_degree_dist.txt rename to tools/legacy/dataprofiler/result/db177/in_degree_dist.txt diff --git a/tools/DataProfiler/result/db177/in_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.png similarity index 100% rename from tools/DataProfiler/result/db177/in_degree_dist_regression.png rename to tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.png diff --git a/tools/DataProfiler/result/db177/in_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.txt similarity index 100% rename from tools/DataProfiler/result/db177/in_degree_dist_regression.txt rename to tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.txt diff --git a/tools/DataProfiler/result/db177/out-in.txt b/tools/legacy/dataprofiler/result/db177/out-in.txt similarity index 100% rename from tools/DataProfiler/result/db177/out-in.txt rename to tools/legacy/dataprofiler/result/db177/out-in.txt diff --git a/tools/DataProfiler/result/db177/out_degree_dist.png b/tools/legacy/dataprofiler/result/db177/out_degree_dist.png similarity index 100% rename from tools/DataProfiler/result/db177/out_degree_dist.png rename to tools/legacy/dataprofiler/result/db177/out_degree_dist.png diff --git a/tools/DataProfiler/result/db177/out_degree_dist.txt b/tools/legacy/dataprofiler/result/db177/out_degree_dist.txt similarity index 100% rename from tools/DataProfiler/result/db177/out_degree_dist.txt rename to tools/legacy/dataprofiler/result/db177/out_degree_dist.txt diff --git a/tools/DataProfiler/result/db177/out_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.png similarity index 100% rename from tools/DataProfiler/result/db177/out_degree_dist_regression.png rename to tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.png diff --git a/tools/DataProfiler/result/db177/out_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.txt similarity index 100% rename from tools/DataProfiler/result/db177/out_degree_dist_regression.txt rename to tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.txt diff --git a/tools/DataProfiler/result/db184/edges.txt b/tools/legacy/dataprofiler/result/db184/edges.txt similarity index 100% rename from tools/DataProfiler/result/db184/edges.txt rename to tools/legacy/dataprofiler/result/db184/edges.txt diff --git a/tools/DataProfiler/result/db184/in-out.txt b/tools/legacy/dataprofiler/result/db184/in-out.txt similarity index 100% rename from tools/DataProfiler/result/db184/in-out.txt rename to tools/legacy/dataprofiler/result/db184/in-out.txt diff --git a/tools/DataProfiler/result/db184/in_degree_dist.png b/tools/legacy/dataprofiler/result/db184/in_degree_dist.png similarity index 100% rename from tools/DataProfiler/result/db184/in_degree_dist.png rename to tools/legacy/dataprofiler/result/db184/in_degree_dist.png diff --git a/tools/DataProfiler/result/db184/in_degree_dist.txt b/tools/legacy/dataprofiler/result/db184/in_degree_dist.txt similarity index 100% rename from tools/DataProfiler/result/db184/in_degree_dist.txt rename to tools/legacy/dataprofiler/result/db184/in_degree_dist.txt diff --git a/tools/DataProfiler/result/db184/in_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.png similarity index 100% rename from tools/DataProfiler/result/db184/in_degree_dist_regression.png rename to tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.png diff --git a/tools/DataProfiler/result/db184/in_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.txt similarity index 100% rename from tools/DataProfiler/result/db184/in_degree_dist_regression.txt rename to tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.txt diff --git a/tools/DataProfiler/result/db184/out-in.txt b/tools/legacy/dataprofiler/result/db184/out-in.txt similarity index 100% rename from tools/DataProfiler/result/db184/out-in.txt rename to tools/legacy/dataprofiler/result/db184/out-in.txt diff --git a/tools/DataProfiler/result/db184/out_degree_dist.png b/tools/legacy/dataprofiler/result/db184/out_degree_dist.png similarity index 100% rename from tools/DataProfiler/result/db184/out_degree_dist.png rename to tools/legacy/dataprofiler/result/db184/out_degree_dist.png diff --git a/tools/DataProfiler/result/db184/out_degree_dist.txt b/tools/legacy/dataprofiler/result/db184/out_degree_dist.txt similarity index 100% rename from tools/DataProfiler/result/db184/out_degree_dist.txt rename to tools/legacy/dataprofiler/result/db184/out_degree_dist.txt diff --git a/tools/DataProfiler/result/db184/out_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.png similarity index 100% rename from tools/DataProfiler/result/db184/out_degree_dist_regression.png rename to tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.png diff --git a/tools/DataProfiler/result/db184/out_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.txt similarity index 100% rename from tools/DataProfiler/result/db184/out_degree_dist_regression.txt rename to tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.txt diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.png diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.txt diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt similarity index 100% rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt diff --git a/tools/DataProfiler/result/transfer/in-out.txt b/tools/legacy/dataprofiler/result/transfer/in-out.txt similarity index 100% rename from tools/DataProfiler/result/transfer/in-out.txt rename to tools/legacy/dataprofiler/result/transfer/in-out.txt diff --git a/tools/DataProfiler/result/transfer/in_degree_dist.png b/tools/legacy/dataprofiler/result/transfer/in_degree_dist.png similarity index 100% rename from tools/DataProfiler/result/transfer/in_degree_dist.png rename to tools/legacy/dataprofiler/result/transfer/in_degree_dist.png diff --git a/tools/DataProfiler/result/transfer/in_degree_dist.txt b/tools/legacy/dataprofiler/result/transfer/in_degree_dist.txt similarity index 100% rename from tools/DataProfiler/result/transfer/in_degree_dist.txt rename to tools/legacy/dataprofiler/result/transfer/in_degree_dist.txt diff --git a/tools/DataProfiler/result/transfer/in_degree_dist_regression.png b/tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.png similarity index 100% rename from tools/DataProfiler/result/transfer/in_degree_dist_regression.png rename to tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.png diff --git a/tools/DataProfiler/result/transfer/in_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.txt similarity index 100% rename from tools/DataProfiler/result/transfer/in_degree_dist_regression.txt rename to tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.txt diff --git a/tools/DataProfiler/result/transfer/out-in.txt b/tools/legacy/dataprofiler/result/transfer/out-in.txt similarity index 100% rename from tools/DataProfiler/result/transfer/out-in.txt rename to tools/legacy/dataprofiler/result/transfer/out-in.txt diff --git a/tools/DataProfiler/result/transfer/out_degree_dist.png b/tools/legacy/dataprofiler/result/transfer/out_degree_dist.png similarity index 100% rename from tools/DataProfiler/result/transfer/out_degree_dist.png rename to tools/legacy/dataprofiler/result/transfer/out_degree_dist.png diff --git a/tools/DataProfiler/result/transfer/out_degree_dist.txt b/tools/legacy/dataprofiler/result/transfer/out_degree_dist.txt similarity index 100% rename from tools/DataProfiler/result/transfer/out_degree_dist.txt rename to tools/legacy/dataprofiler/result/transfer/out_degree_dist.txt diff --git a/tools/DataProfiler/result/transfer/out_degree_dist_regression.png b/tools/legacy/dataprofiler/result/transfer/out_degree_dist_regression.png similarity index 100% rename from tools/DataProfiler/result/transfer/out_degree_dist_regression.png rename to tools/legacy/dataprofiler/result/transfer/out_degree_dist_regression.png diff --git a/tools/DataProfiler/result/transfer/out_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/transfer/out_degree_dist_regression.txt similarity index 100% rename from tools/DataProfiler/result/transfer/out_degree_dist_regression.txt rename to tools/legacy/dataprofiler/result/transfer/out_degree_dist_regression.txt diff --git a/tools/DataProfiler/wcc_core.cpp b/tools/legacy/dataprofiler/wcc_core.cpp similarity index 100% rename from tools/DataProfiler/wcc_core.cpp rename to tools/legacy/dataprofiler/wcc_core.cpp diff --git a/tools/paramgen/legacy/factor_table.sh b/tools/legacy/factorgen/factor_table.sh similarity index 100% rename from tools/paramgen/legacy/factor_table.sh rename to tools/legacy/factorgen/factor_table.sh diff --git a/tools/paramgen/legacy/generate_account.py b/tools/legacy/factorgen/generate_account.py similarity index 100% rename from tools/paramgen/legacy/generate_account.py rename to tools/legacy/factorgen/generate_account.py diff --git a/tools/paramgen/legacy/loan.py b/tools/legacy/factorgen/loan.py similarity index 100% rename from tools/paramgen/legacy/loan.py rename to tools/legacy/factorgen/loan.py diff --git a/tools/paramgen/legacy/params_gen.properties b/tools/legacy/factorgen/params_gen.properties similarity index 100% rename from tools/paramgen/legacy/params_gen.properties rename to tools/legacy/factorgen/params_gen.properties diff --git a/tools/paramgen/legacy/params_gen.py b/tools/legacy/factorgen/params_gen.py similarity index 100% rename from tools/paramgen/legacy/params_gen.py rename to tools/legacy/factorgen/params_gen.py diff --git a/tools/paramgen/legacy/split_amount.py b/tools/legacy/factorgen/split_amount.py similarity index 100% rename from tools/paramgen/legacy/split_amount.py rename to tools/legacy/factorgen/split_amount.py diff --git a/tools/paramgen/legacy/time_split.py b/tools/legacy/factorgen/time_split.py similarity index 100% rename from tools/paramgen/legacy/time_split.py rename to tools/legacy/factorgen/time_split.py diff --git a/tools/GraphGen/Makefile b/tools/legacy/graphgen/Makefile similarity index 100% rename from tools/GraphGen/Makefile rename to tools/legacy/graphgen/Makefile diff --git a/tools/GraphGen/README.md b/tools/legacy/graphgen/README.md similarity index 100% rename from tools/GraphGen/README.md rename to tools/legacy/graphgen/README.md diff --git a/tools/GraphGen/graph_gen.c b/tools/legacy/graphgen/graph_gen.c similarity index 100% rename from tools/GraphGen/graph_gen.c rename to tools/legacy/graphgen/graph_gen.c diff --git a/scripts/merge_cluster_output.py b/tools/merge_cluster_output.py similarity index 100% rename from scripts/merge_cluster_output.py rename to tools/merge_cluster_output.py diff --git a/tools/paramgen/README.md b/tools/paramgen/README.md deleted file mode 100644 index 41d0e732..00000000 --- a/tools/paramgen/README.md +++ /dev/null @@ -1,49 +0,0 @@ -## ParamsGen - -`params_gen.py` uses the CREATE_VALIDATION feature to generate parameters. - -The specific steps are as follows: - -1. Select vertices of type Account, Person, and Loan from the dataset, and generate a parameter file that meets the input specifications for ldbc_finbench_driver. -2. Execute CREATE_VALIDATION to generate validation_params.csv. -3. Select non-empty results from validation_params.csv. - -Example: - -```bash -python3 params_gen.py 1 # gen tcr1 params -``` - -Other notes: - -1. The generated start_timestamp and end_timestamp in the current version are fixed values. -2. For tcr4 and tcr10, this method is not efficient enough. Use the following Cypher query to search for parameters: - -```Cypher -// tcr4 -MATCH - (n1:Account)-[:transfer]-> - (n2:Account)-[:transfer]-> - (n3:Account)-[:transfer]->(n4:Account) -WHERE - n1.id = n4.id AND n1.id > n2.id AND n2.id > n3.id -WITH - n1.id as n1id, - n2.id as n2id, - n3.id as n3id, - n4.id as n4id -LIMIT 1000 -RETURN DISTINCT toString(n1id)+"|"+toString(n2id) - -// tcr10 -MATCH - (c:Company)<-[:invest]-(p:Person) -WITH - c.id as cid, - count(p.id) as num, - collect(p.id) as person -WHERE num >= 2 -RETURN - tostring(person[0])+"|"+tostring(person[1]) -LIMIT 1000 -``` diff --git a/tools/paramgen/parameter_curation.py b/tools/paramgen/parameter_curation.py index 7c92e6f2..6a4de363 100644 --- a/tools/paramgen/parameter_curation.py +++ b/tools/paramgen/parameter_curation.py @@ -17,6 +17,10 @@ from glob import glob import concurrent.futures from functools import partial +import sys + +table_dir = sys.argv[1] +out_dir = sys.argv[2] THRESH_HOLD = 0 THRESH_HOLD_6 = 0 @@ -29,20 +33,16 @@ else: TRUNCATION_ORDER = "AMOUNT_DESCENDING" -table_dir = '../../out/factor_table' -out_dir = '../../out/substitute_parameters/' def process_csv(file_path): all_files = glob(file_path + '/*.csv') - df_list = [] - + for filename in all_files: df = pd.read_csv(filename, delimiter='|') df_list.append(df) - + combined_df = pd.concat(df_list, ignore_index=True) - return combined_df @@ -52,7 +52,7 @@ def __init__(self): self.inputs = [] def setOutputFile(self, outputFile): - self.outputFile=outputFile + self.outputFile = outputFile def registerHandler(self, handler, inputParams, header): handler.header = header @@ -63,7 +63,7 @@ def writeCSV(self): dir_path = os.path.dirname(self.outputFile) if not os.path.exists(dir_path): os.makedirs(dir_path) - output = codecs.open( self.outputFile, "w",encoding="utf-8") + output = codecs.open(self.outputFile, "w", encoding="utf-8") if len(self.inputs) == 0: return @@ -105,7 +105,8 @@ def find_neighbors(account_list, account_account_df, account_amount_df, amount_b transfer_in_amount = account_amount_df.loc[item]['amount'] except KeyError: transfer_in_amount = 0 - result.update(neighbors_with_truncate_threshold(transfer_in_amount, rows_account_list, rows_amount_bucket, num_list)) + result.update( + neighbors_with_truncate_threshold(transfer_in_amount, rows_account_list, rows_amount_bucket, num_list)) elif query_id in [1, 2, 5]: for item in account_list: @@ -131,7 +132,6 @@ def find_neighbors(account_list, account_account_df, account_amount_df, amount_b def filter_neighbors(account_list, amount_bucket_df, num_list, account_id): - # print(f'account_list: {account_list}') rows_amount_bucket = amount_bucket_df.loc[account_id] @@ -144,20 +144,20 @@ def filter_neighbors(account_list, amount_bucket_df, num_list, account_id): header_at_limit = int(col) break - partial_apply = partial(filter_neighbors_with_truncate_threshold, amount_bucket_df=amount_bucket_df, num_list=num_list, header_at_limit=header_at_limit) + partial_apply = partial(filter_neighbors_with_truncate_threshold, amount_bucket_df=amount_bucket_df, + num_list=num_list, header_at_limit=header_at_limit) account_mapped = map(partial_apply, account_list) result = [x for x in account_mapped if x is not None] return result def filter_neighbors_with_truncate_threshold(item, amount_bucket_df, num_list, header_at_limit): - if item[1] > THRESH_HOLD_6: if header_at_limit == -1: return item[0] if (TIME_TRUNCATE and item[2] >= header_at_limit) or (not TIME_TRUNCATE and item[1] >= header_at_limit): return item[0] - + return None @@ -183,7 +183,7 @@ def neighbors_with_truncate_threshold(transfer_in_amount, rows_account_list, row return [t[0] for t in temp if t[1] >= header_at_limit] else: return [t[0] for t in temp] - + def neighbors_with_trancate(rows_account_list, rows_amount_bucket, num_list): if rows_amount_bucket is None: @@ -215,7 +215,6 @@ def process_get_neighbors(chunk, account_account_df, account_amount_df, amount_b def process_filter_neighbors(chunk, amount_bucket_df, num_list): - first_column_name = chunk.columns[0] second_column_name = chunk.columns[1] @@ -227,7 +226,6 @@ def process_filter_neighbors(chunk, amount_bucket_df, num_list): def get_next_neighbor_list(neighbors_df, account_account_df, account_amount_df, amount_bucket_df, query_id): - num_list = [] if query_id != 3 and query_id != 11: num_list = [x for x in amount_bucket_df.columns.tolist()] @@ -237,10 +235,12 @@ def get_next_neighbor_list(neighbors_df, account_account_df, account_amount_df, chunks = np.array_split(neighbors_df, query_parallelism) with concurrent.futures.ProcessPoolExecutor(max_workers=query_parallelism) as executor: - futures = [executor.submit(process_get_neighbors, chunk, account_account_df, account_amount_df, amount_bucket_df, num_list, query_id) for chunk in chunks] + futures = [ + executor.submit(process_get_neighbors, chunk, account_account_df, account_amount_df, amount_bucket_df, + num_list, query_id) for chunk in chunks] results = [future.result() for future in concurrent.futures.as_completed(futures)] executor.shutdown(wait=True) - + next_neighbors_df = pd.concat(results) next_neighbors_df = next_neighbors_df.sort_index() return next_neighbors_df @@ -271,16 +271,18 @@ def process_batch(batch, basic_sum_df, first_column_name, second_column_name): ).drop(columns=[second_column_name]) return merged_df.groupby(first_column_name).sum() + def get_next_sum_table(neighbors_df, basic_sum_df, batch_size=BATCH_SIZE): first_column_name = neighbors_df.columns[0] second_column_name = neighbors_df.columns[1] batches = [neighbors_df.iloc[start:start + batch_size] for start in range(0, len(neighbors_df), batch_size)] - + result_list = [] query_parallelism = max(1, multiprocessing.cpu_count() // 4) with concurrent.futures.ProcessPoolExecutor(max_workers=query_parallelism) as executor: - futures = [executor.submit(process_batch, batch, basic_sum_df, first_column_name, second_column_name) for batch in batches] + futures = [executor.submit(process_batch, batch, basic_sum_df, first_column_name, second_column_name) for batch + in batches] for future in futures: result_list.append(future.result()) executor.shutdown(wait=True) @@ -293,28 +295,32 @@ def get_next_sum_table(neighbors_df, basic_sum_df, batch_size=BATCH_SIZE): def handleThresholdParam(threshold): return str(threshold) + def handleThreshold2Param(threshold2): return str(threshold2) + def hendleIdParam(id): return str(id) + def handleTruncateLimitParam(truncateLimit): return str(truncateLimit) + def handleTruncateOrderParam(truncateOrder): return truncateOrder def handleTimeDurationParam(timeParam): - start = timegm(date(year=int(timeParam.year), month=int(timeParam.month), day=int(timeParam.day)).timetuple())*1000 + start = timegm( + date(year=int(timeParam.year), month=int(timeParam.month), day=int(timeParam.day)).timetuple()) * 1000 end = start + timeParam.duration * 3600 * 24 * 1000 res = str(start) + "|" + str(end) return res def process_query(query_id): - if query_id in [7, 10]: process_1_hop_query(query_id) elif query_id in [1, 2, 3, 5, 8, 11]: @@ -324,7 +330,6 @@ def process_query(query_id): def process_iter_queries(query_id): - upstream_amount_df = None upstream_amount_path = None amount_bucket_path = None @@ -392,7 +397,6 @@ def process_iter_queries(query_id): output_path = os.path.join(out_dir, 'tcr11.txt') steps = 3 - first_account_df = process_csv(first_account_path) account_account_df = process_csv(account_account_path) if query_id == 8: @@ -407,7 +411,7 @@ def process_iter_queries(query_id): first_account_name = account_account_df.columns[0] second_account_name = account_account_df.columns[1] first_amount_name = amount_bucket_df.columns[0] - first_time_name = time_bucket_df.columns[0] + first_time_name = time_bucket_df.columns[0] account_account_df[second_account_name] = account_account_df[second_account_name].apply(literal_eval) first_account_df[second_column_name] = first_account_df[second_column_name].apply(literal_eval) @@ -434,7 +438,8 @@ def process_iter_queries(query_id): if current_step == steps - 1: next_time_bucket = get_next_sum_table(first_neighbors_df, time_bucket_df) else: - first_neighbors_df = get_next_neighbor_list(first_neighbors_df, account_account_df, upstream_amount_df, amount_bucket_df, query_id) + first_neighbors_df = get_next_neighbor_list(first_neighbors_df, account_account_df, upstream_amount_df, + amount_bucket_df, query_id) current_step += 1 @@ -506,13 +511,13 @@ def process_iter_queries(query_id): j = 0 k = 0 while True: - j = random.randint(0, len(final_first_items)-1) - k = random.randint(0, len(final_first_items)-1) + j = random.randint(0, len(final_first_items) - 1) + k = random.randint(0, len(final_first_items) - 1) if final_first_items[j] != account_id and final_first_items[k] != account_id: break final_second_items_3.append(final_first_items[j]) final_second_items_4.append(final_first_items[k]) - + csvWriter_3 = CSVSerializer() csvWriter_3.setOutputFile(output_path + 'tcr3.txt') csvWriter_3.registerHandler(hendleIdParam, final_first_items, "id1") @@ -520,7 +525,7 @@ def process_iter_queries(query_id): csvWriter_3.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime") csvWriter_3.registerHandler(handleTruncateLimitParam, truncate_limit_list, "truncationLimit") csvWriter_3.registerHandler(handleTruncateOrderParam, truncate_order_list, "truncationOrder") - + csvWriter_4 = CSVSerializer() csvWriter_4.setOutputFile(output_path + 'tcr4.txt') csvWriter_4.registerHandler(hendleIdParam, final_first_items, "id1") @@ -547,7 +552,6 @@ def process_iter_queries(query_id): def process_1_hop_query(query_id): - if query_id == 7: first_count_path = os.path.join(table_dir, 'account_in_out_count') time_bucket_path = os.path.join(table_dir, 'account_in_out_month') @@ -594,13 +598,13 @@ def process_1_hop_query(query_id): csvWriter_9.writeCSV() print(f'query_id 7 and 9 finished') - + elif query_id == 10: final_second_items = [] for person in final_first_items: j = 0 while True: - j = random.randint(0, len(final_first_items)-1) + j = random.randint(0, len(final_first_items) - 1) if final_first_items[j] != person: break final_second_items.append(final_first_items[j]) @@ -616,7 +620,6 @@ def process_1_hop_query(query_id): def process_withdraw_query(): - first_account_path = os.path.join(table_dir, 'account_withdraw_in_items') time_bucket_path = os.path.join(table_dir, 'transfer_in_month') if TIME_TRUNCATE: @@ -640,7 +643,7 @@ def process_withdraw_query(): withdraw_bucket_df.set_index(withdraw_first_name, inplace=True) transfer_bucket_df.set_index(transfer_first_name, inplace=True) time_bucket_df.set_index(time_first_name, inplace=True) - + first_account_df[second_column_name] = first_account_df[second_column_name].apply(literal_eval) first_neighbors_df = first_account_df.sort_values(by=first_column_name) diff --git a/tools/paramgen/search_params.py b/tools/paramgen/search_params.py index 722cbaf2..3d104270 100644 --- a/tools/paramgen/search_params.py +++ b/tools/paramgen/search_params.py @@ -3,145 +3,149 @@ def allclose(a, b, rtol=1e-05, atol=1e-08): - return abs(a - b) <= (atol + rtol * abs(b)) + return abs(a - b) <= (atol + rtol * abs(b)) + def readFactors(f): - res = [] - for line in f.readlines(): - values = [item if index == 0 else int(item) for (index, item) in enumerate(line.split(","))] - res.append(values) + res = [] + for line in f.readlines(): + values = [item if index == 0 else int(item) for (index, item) in enumerate(line.split(","))] + res.append(values) + + return res - return res class Window: - def __init__(self, paramId, start, end): - self.param = paramId - self.start = start - self.end = end - self.avg = 0.0 - self.stddev = 0.0 - self.size = end-start+1 - - def __str__(self): - res = "[%d, %d] "%(self.start, self.end) - res += "size: %d, avg: %0.2f, stddev: %0.2f" % (self.size, self.avg, self.stddev) - return res + def __init__(self, paramId, start, end): + self.param = paramId + self.start = start + self.end = end + self.avg = 0.0 + self.stddev = 0.0 + self.size = end - start + 1 + + def __str__(self): + res = "[%d, %d] " % (self.start, self.end) + res += "size: %d, avg: %0.2f, stddev: %0.2f" % (self.size, self.avg, self.stddev) + return res + def getAverageCost(rows, key): - return float(sum([key(r) for r in rows])) / len(rows) + return float(sum([key(r) for r in rows])) / len(rows) + def getCostStdDev(rows, avg, key): - return math.sqrt(sum([math.pow(key(r)-avg,2) for r in rows]) / len(rows)) + return math.sqrt(sum([math.pow(key(r) - avg, 2) for r in rows]) / len(rows)) + def updateAverageCost(avg, oldelem, newelem, samplesize): - return avg + (newelem - oldelem) / samplesize + return avg + (newelem - oldelem) / samplesize def findWindows(factors, param, amount, bounds): + data = factors[bounds[0]: bounds[1]] + allWindows = [] + start = 0 - data = factors[bounds[0]: bounds[1]] - allWindows = [] - start = 0 + initWindow = Window(param, start, amount - 1) + initWindow.avg = getAverageCost(data[start:amount], itemgetter(param)) + initWindow.stddev = getCostStdDev(data[start:amount], initWindow.avg, itemgetter(param)) - initWindow = Window(param, start, amount-1) - initWindow.avg = getAverageCost(data[start:amount], itemgetter(param)) - initWindow.stddev = getCostStdDev(data[start:amount], initWindow.avg, itemgetter(param)) + s1 = sum([x[param] for x in data[start:start + amount]]) + s2 = sum([x[param] * x[param] for x in data[start:start + amount]]) + start += 1 + allWindows.append(initWindow) - s1 = sum([x[param] for x in data[start:start+amount]]) - s2 = sum([x[param]*x[param] for x in data[start:start+amount]]) - start += 1 - allWindows.append(initWindow) + while start + amount < len(data): + end = start + amount + if data[end - 1][param] < 10: + break - while start + amount < len(data): - end = start + amount - if data[end-1][param]<10: - break + window = Window(param, bounds[0] + start, bounds[0] + end - 1) - window = Window(param, bounds[0]+start, bounds[0]+end-1) + # update the streaming stats about avg and stddev + s1 -= data[start - 1][param] + s1 += data[end - 1][param] + s2 -= data[start - 1][param] * data[start - 1][param] + s2 += data[end - 1][param] * data[end - 1][param] - # update the streaming stats about avg and stddev - s1 -= data[start-1][param] - s1 += data[end-1][param] - s2 -= data[start-1][param]*data[start-1][param] - s2 += data[end-1][param]*data[end-1][param] + window.avg = float(s1) / amount + window.stddev = math.sqrt(float(amount * s2 - s1 * s1)) / amount - window.avg = float(s1) / amount - window.stddev = math.sqrt(float(amount*s2 - s1*s1))/amount + allWindows.append(window) + start += 1 - allWindows.append(window) - start+=1 + allWindows.sort(key=lambda windows: windows.stddev) - allWindows.sort(key=lambda windows: windows.stddev) + res = [] + first = allWindows[0] + iter = 0 - res = [] - first = allWindows[0] - iter = 0 + while iter < len(allWindows) and allWindows[iter].stddev == first.stddev: + res.append(allWindows[iter]) + iter += 1 - while iter < len(allWindows) and allWindows[iter].stddev == first.stddev: - res.append(allWindows[iter]) - iter+=1 - - return res + return res def mergeWindows(windows): - res = [] + res = [] - cur = windows[0] + cur = windows[0] - iter = 1 - constucted = cur + iter = 1 + constucted = cur - while iter < len(windows): - while iter < len(windows) and windows[iter].start == cur.start+1 and allclose(windows[iter].avg, cur.avg): - cur = windows[iter] - constucted.end=cur.end - constucted.size+=1 - iter+=1 + while iter < len(windows): + while iter < len(windows) and windows[iter].start == cur.start + 1 and allclose(windows[iter].avg, cur.avg): + cur = windows[iter] + constucted.end = cur.end + constucted.size += 1 + iter += 1 - res.append(constucted) - if iter >= len(windows): - break + res.append(constucted) + if iter >= len(windows): + break - constucted = windows[iter] - cur = windows[iter] - iter += 1 + constucted = windows[iter] + cur = windows[iter] + iter += 1 - return res + return res def generate(factors, portion): - amount = int(len(factors)*portion) - params = len(factors[0]) -1 - - keys = [i for i in range(1,params+1)] - - factors = sorted(factors, key=itemgetter(*keys), reverse=True) - result = [] - paramId = 1 - - current_windows = findWindows(factors, paramId, amount, (0,len(factors))) - - while len(current_windows) > 1 and paramId < params: - paramId += 1 - current_windows = mergeWindows(current_windows) - - new_windows = [] - for w in current_windows: - w2 = findWindows(factors, paramId, amount, (w.start, w.end+1)) - new_windows.extend(w2) - - new_windows.sort(key=lambda w: w.stddev) - - current_windows = [] - first = new_windows[0] - iter = 0 - - while iter < len(new_windows) and new_windows[iter].stddev == first.stddev : - current_windows.append(new_windows[iter]) - iter+=1 - - w = current_windows[0] - - result.extend([factors[w.start+i][0] for i in range(amount)]) - return result + amount = int(len(factors) * portion) + params = len(factors[0]) - 1 + + keys = [i for i in range(1, params + 1)] + + factors = sorted(factors, key=itemgetter(*keys), reverse=True) + result = [] + paramId = 1 + + current_windows = findWindows(factors, paramId, amount, (0, len(factors))) + + while len(current_windows) > 1 and paramId < params: + paramId += 1 + current_windows = mergeWindows(current_windows) + + new_windows = [] + for w in current_windows: + w2 = findWindows(factors, paramId, amount, (w.start, w.end + 1)) + new_windows.extend(w2) + + new_windows.sort(key=lambda w: w.stddev) + + current_windows = [] + first = new_windows[0] + iter = 0 + + while iter < len(new_windows) and new_windows[iter].stddev == first.stddev: + current_windows.append(new_windows[iter]) + iter += 1 + + w = current_windows[0] + + result.extend([factors[w.start + i][0] for i in range(amount)]) + return result diff --git a/tools/paramgen/time_select.py b/tools/paramgen/time_select.py index 0d06b7c5..78450e19 100644 --- a/tools/paramgen/time_select.py +++ b/tools/paramgen/time_select.py @@ -1,25 +1,26 @@ LAST_MONTHS = 3 START_YEAR = 2020 + class MonthYearCount: def __init__(self, month, year, count): - self.month=month - self.year=year - self.count=count + self.month = month + self.year = year + self.count = count class TimeParameter: def __init__(self, year, month, day, duration): - self.month=month - self.year=year - self.day=day - self.duration=duration - + self.month = month + self.year = year + self.day = day + self.duration = duration + -def getMedian(data, sort_key, getEntireTuple = False): +def getMedian(data, sort_key, getEntireTuple=False): if len(data) == 0: if getEntireTuple: - return MonthYearCount(0,0,0) + return MonthYearCount(0, 0, 0) return 0 if len(data) == 1: @@ -27,20 +28,20 @@ def getMedian(data, sort_key, getEntireTuple = False): return data[0] return data[0].count - srtd = sorted(data,key=sort_key) - mid = int(len(data)/2) + srtd = sorted(data, key=sort_key) + mid = int(len(data) / 2) if len(data) % 2 == 0: if getEntireTuple: return srtd[mid] - return (sort_key(srtd[mid-1]) + sort_key(srtd[mid])) / 2.0 + return (sort_key(srtd[mid - 1]) + sort_key(srtd[mid])) / 2.0 if getEntireTuple: return srtd[mid] return sort_key(srtd[mid]) -def computeTimeMedians(factors, lastmonthcount = LAST_MONTHS): +def computeTimeMedians(factors, lastmonthcount=LAST_MONTHS): mediantimes = [] lastmonths = [] firstmonths = [] @@ -48,48 +49,47 @@ def computeTimeMedians(factors, lastmonthcount = LAST_MONTHS): values.sort(key=lambda myc: (myc.year, myc.month)) l = len(values) - lastmonthsum = sum(myc.count for myc in values[max(l-lastmonthcount,0):l]) + lastmonthsum = sum(myc.count for myc in values[max(l - lastmonthcount, 0):l]) lastmonths.append(lastmonthsum) - cutoff_max = l-lastmonthcount + cutoff_max = l - lastmonthcount if cutoff_max < 0: cutoff_max = l firstmonthsum = sum(myc.count for myc in values[0:cutoff_max]) firstmonths.append(firstmonthsum) - mediantimes.append(getMedian(values,lambda myc: myc.count)) + mediantimes.append(getMedian(values, lambda myc: myc.count)) median = getMedian(mediantimes, lambda x: x) medianLastMonth = getMedian(lastmonths, lambda x: x) medianFirstMonth = getMedian(firstmonths, lambda x: x) return medianFirstMonth, medianLastMonth, median - + def getTimeParamsWithMedian(factors, medianFirstMonth, medianLastMonth, median): # strategy: find the median of the given distribution, then increase the time interval until it matches the given parameter res = [] for values in factors: - input = sorted(values,key=lambda myc: (myc.year, myc.month)) - currentMedian = getMedian(values,lambda myc: myc.count, True) + input = sorted(values, key=lambda myc: (myc.year, myc.month)) + currentMedian = getMedian(values, lambda myc: myc.count, True) if int(median) == 0 or int(currentMedian.count) == 0 or int(currentMedian.year) == 0: - res.append(TimeParameter(START_YEAR,1,1,0)) + res.append(TimeParameter(START_YEAR, 1, 1, 0)) continue if currentMedian.count > median: - duration = int(28*currentMedian.count/median) + duration = int(28 * currentMedian.count / median) res.append(TimeParameter(currentMedian.year, currentMedian.month, 1, duration)) else: - duration = int(28*median/currentMedian.count) + duration = int(28 * median / currentMedian.count) res.append(TimeParameter(currentMedian.year, currentMedian.month, 1, duration)) return res - + def findTimeParameters(factors): - medianFirstMonth, medianLastMonth, median = computeTimeMedians(factors) timeParams = getTimeParamsWithMedian(factors, medianFirstMonth, medianLastMonth, median) return timeParams - + def findTimeParams(input_loan_list, time_bucket_df): time_list = [x for x in time_bucket_df.iloc[0].index.tolist()[1:]] factors = [] @@ -103,7 +103,5 @@ def findTimeParams(input_loan_list, time_bucket_df): year = START_YEAR + month / 12 temp_factors.append(MonthYearCount(month % 12 + 1, int(year), count)) factors.append(temp_factors) - - return findTimeParameters(factors) - + return findTimeParameters(factors)