From 946fc48d753d878ead9183888550c7f0a5e604f8 Mon Sep 17 00:00:00 2001
From: Shipeng Qi <shepherdkingqsp@gmail.com>
Date: Sun, 29 Sep 2024 16:38:44 +0800
Subject: [PATCH] sort files and separate factor with datagen (#111)

* sort files

* update tools markdown doc

* seperate factor and generate

* add note
---
 scripts/run_local.sh                          |  24 ++
 .../ldbc/finbench/datagen/LdbcDatagen.scala   |  34 ++-
 .../factors/FactorGenerationStage.scala       |  11 +-
 .../datagen/generation/GenerationStage.scala  |   2 +
 tools/DataProfiler/result/db139/profile.log   |   4 -
 tools/DataProfiler/result/db177/edges.log     |   2 -
 tools/DataProfiler/result/db177/profile.log   |   4 -
 tools/DataProfiler/result/db184/profile.log   |   4 -
 tools/DataProfiler/result/transfer/edges.log  |   2 -
 .../DataProfiler/result/transfer/profile.log  |   4 -
 tools/README.md                               |  64 +++++-
 .../dataprofiler}/.gitignore                  |   0
 .../dataprofiler}/CMakeLists.txt              |   0
 .../dataprofiler}/README.md                   |   0
 .../dataprofiler}/algo.h                      |   0
 .../dataprofiler}/compile.sh                  |   0
 .../dataprofiler}/de_core.cpp                 |   0
 .../dataprofiler}/plot.py                     |   0
 .../dataprofiler}/profiler.cpp                |   0
 .../dataprofiler}/result/db139/edges.txt      |   0
 .../dataprofiler}/result/db139/in-out.txt     |   0
 .../result/db139/in_degree_dist.png           | Bin
 .../result/db139/in_degree_dist.txt           |   0
 .../db139/in_degree_dist_regression.png       | Bin
 .../db139/in_degree_dist_regression.txt       |   0
 .../dataprofiler}/result/db139/out-in.txt     |   0
 .../result/db139/out_degree_dist.png          | Bin
 .../result/db139/out_degree_dist.txt          |   0
 .../db139/out_degree_dist_regression.png      | Bin
 .../db139/out_degree_dist_regression.txt      |   0
 .../dataprofiler}/result/db177/in-out.txt     |   0
 .../result/db177/in_degree_dist.png           | Bin
 .../result/db177/in_degree_dist.txt           |   0
 .../db177/in_degree_dist_regression.png       | Bin
 .../db177/in_degree_dist_regression.txt       |   0
 .../dataprofiler}/result/db177/out-in.txt     |   0
 .../result/db177/out_degree_dist.png          | Bin
 .../result/db177/out_degree_dist.txt          |   0
 .../db177/out_degree_dist_regression.png      | Bin
 .../db177/out_degree_dist_regression.txt      |   0
 .../dataprofiler}/result/db184/edges.txt      |   0
 .../dataprofiler}/result/db184/in-out.txt     |   0
 .../result/db184/in_degree_dist.png           | Bin
 .../result/db184/in_degree_dist.txt           |   0
 .../db184/in_degree_dist_regression.png       | Bin
 .../db184/in_degree_dist_regression.txt       |   0
 .../dataprofiler}/result/db184/out-in.txt     |   0
 .../result/db184/out_degree_dist.png          | Bin
 .../result/db184/out_degree_dist.txt          |   0
 .../db184/out_degree_dist_regression.png      | Bin
 .../db184/out_degree_dist_regression.txt      |   0
 .../result/hubvertex_indeg/hub_indeg_1.png    | Bin
 .../result/hubvertex_indeg/hub_indeg_1.txt    |   0
 .../hub_indeg_1_regression.png                | Bin
 .../hub_indeg_1_regression.txt                |   0
 .../result/hubvertex_indeg/hub_indeg_2.png    | Bin
 .../result/hubvertex_indeg/hub_indeg_2.txt    |   0
 .../hub_indeg_2_regression.png                | Bin
 .../hub_indeg_2_regression.txt                |   0
 .../result/hubvertex_indeg/hub_indeg_3.png    | Bin
 .../result/hubvertex_indeg/hub_indeg_3.txt    |   0
 .../hub_indeg_3_regression.png                | Bin
 .../hub_indeg_3_regression.txt                |   0
 .../result/hubvertex_indeg/hub_indeg_4.png    | Bin
 .../result/hubvertex_indeg/hub_indeg_4.txt    |   0
 .../hub_indeg_4_regression.png                | Bin
 .../hub_indeg_4_regression.txt                |   0
 .../result/hubvertex_indeg/hub_indeg_5.png    | Bin
 .../result/hubvertex_indeg/hub_indeg_5.txt    |   0
 .../hub_indeg_5_regression.png                | Bin
 .../hub_indeg_5_regression.txt                |   0
 .../result/hubvertex_outdeg/hub_outdeg_1.png  | Bin
 .../result/hubvertex_outdeg/hub_outdeg_1.txt  |   0
 .../hub_outdeg_1_regression.png               | Bin
 .../hub_outdeg_1_regression.txt               |   0
 .../result/hubvertex_outdeg/hub_outdeg_2.png  | Bin
 .../result/hubvertex_outdeg/hub_outdeg_2.txt  |   0
 .../hub_outdeg_2_regression.png               | Bin
 .../hub_outdeg_2_regression.txt               |   0
 .../result/hubvertex_outdeg/hub_outdeg_3.png  | Bin
 .../result/hubvertex_outdeg/hub_outdeg_3.txt  |   0
 .../hub_outdeg_3_regression.png               | Bin
 .../hub_outdeg_3_regression.txt               |   0
 .../result/hubvertex_outdeg/hub_outdeg_4.png  | Bin
 .../result/hubvertex_outdeg/hub_outdeg_4.txt  |   0
 .../hub_outdeg_4_regression.png               | Bin
 .../hub_outdeg_4_regression.txt               |   0
 .../result/hubvertex_outdeg/hub_outdeg_5.png  | Bin
 .../result/hubvertex_outdeg/hub_outdeg_5.txt  |   0
 .../hub_outdeg_5_regression.png               | Bin
 .../hub_outdeg_5_regression.txt               |   0
 .../dataprofiler}/result/transfer/in-out.txt  |   0
 .../result/transfer/in_degree_dist.png        | Bin
 .../result/transfer/in_degree_dist.txt        |   0
 .../transfer/in_degree_dist_regression.png    | Bin
 .../transfer/in_degree_dist_regression.txt    |   0
 .../dataprofiler}/result/transfer/out-in.txt  |   0
 .../result/transfer/out_degree_dist.png       | Bin
 .../result/transfer/out_degree_dist.txt       |   0
 .../transfer/out_degree_dist_regression.png   | Bin
 .../transfer/out_degree_dist_regression.txt   |   0
 .../dataprofiler}/wcc_core.cpp                |   0
 .../factorgen}/factor_table.sh                |   0
 .../factorgen}/generate_account.py            |   0
 .../legacy => legacy/factorgen}/loan.py       |   0
 .../factorgen}/params_gen.properties          |   0
 .../legacy => legacy/factorgen}/params_gen.py |   0
 .../factorgen}/split_amount.py                |   0
 .../legacy => legacy/factorgen}/time_split.py |   0
 tools/{GraphGen => legacy/graphgen}/Makefile  |   0
 tools/{GraphGen => legacy/graphgen}/README.md |   0
 .../{GraphGen => legacy/graphgen}/graph_gen.c |   0
 {scripts => tools}/merge_cluster_output.py    |   0
 tools/paramgen/README.md                      |  49 ----
 tools/paramgen/parameter_curation.py          |  73 +++---
 tools/paramgen/search_params.py               | 214 +++++++++---------
 tools/paramgen/time_select.py                 |  56 +++--
 117 files changed, 281 insertions(+), 266 deletions(-)
 delete mode 100644 tools/DataProfiler/result/db139/profile.log
 delete mode 100644 tools/DataProfiler/result/db177/edges.log
 delete mode 100644 tools/DataProfiler/result/db177/profile.log
 delete mode 100644 tools/DataProfiler/result/db184/profile.log
 delete mode 100644 tools/DataProfiler/result/transfer/edges.log
 delete mode 100644 tools/DataProfiler/result/transfer/profile.log
 rename tools/{DataProfiler => legacy/dataprofiler}/.gitignore (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/CMakeLists.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/README.md (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/algo.h (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/compile.sh (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/de_core.cpp (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/plot.py (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/profiler.cpp (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/edges.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in-out.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in_degree_dist.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in_degree_dist.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in_degree_dist_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/in_degree_dist_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out-in.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out_degree_dist.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out_degree_dist.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out_degree_dist_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db139/out_degree_dist_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in-out.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in_degree_dist.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in_degree_dist.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in_degree_dist_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/in_degree_dist_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out-in.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out_degree_dist.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out_degree_dist.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out_degree_dist_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db177/out_degree_dist_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/edges.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in-out.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in_degree_dist.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in_degree_dist.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in_degree_dist_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/in_degree_dist_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out-in.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out_degree_dist.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out_degree_dist.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out_degree_dist_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/db184/out_degree_dist_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_1.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_1.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_1_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_1_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_2.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_2.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_2_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_2_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_3.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_3.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_3_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_3_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_4.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_4.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_4_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_4_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_5.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_5.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_5_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_indeg/hub_indeg_5_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_1.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_1.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_1_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_1_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_2.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_2.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_2_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_2_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_3.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_3.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_3_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_3_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_4.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_4.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_4_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_4_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_5.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_5.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_5_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/hubvertex_outdeg/hub_outdeg_5_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in-out.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in_degree_dist.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in_degree_dist.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in_degree_dist_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/in_degree_dist_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out-in.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out_degree_dist.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out_degree_dist.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out_degree_dist_regression.png (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/result/transfer/out_degree_dist_regression.txt (100%)
 rename tools/{DataProfiler => legacy/dataprofiler}/wcc_core.cpp (100%)
 rename tools/{paramgen/legacy => legacy/factorgen}/factor_table.sh (100%)
 rename tools/{paramgen/legacy => legacy/factorgen}/generate_account.py (100%)
 rename tools/{paramgen/legacy => legacy/factorgen}/loan.py (100%)
 rename tools/{paramgen/legacy => legacy/factorgen}/params_gen.properties (100%)
 rename tools/{paramgen/legacy => legacy/factorgen}/params_gen.py (100%)
 rename tools/{paramgen/legacy => legacy/factorgen}/split_amount.py (100%)
 rename tools/{paramgen/legacy => legacy/factorgen}/time_split.py (100%)
 rename tools/{GraphGen => legacy/graphgen}/Makefile (100%)
 rename tools/{GraphGen => legacy/graphgen}/README.md (100%)
 rename tools/{GraphGen => legacy/graphgen}/graph_gen.c (100%)
 rename {scripts => tools}/merge_cluster_output.py (100%)
 delete mode 100644 tools/paramgen/README.md

diff --git a/scripts/run_local.sh b/scripts/run_local.sh
index 3185dbe5..bc01c09f 100644
--- a/scripts/run_local.sh
+++ b/scripts/run_local.sh
@@ -3,12 +3,19 @@
 LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar
 OUTPUT_DIR=out
 
+# Note: generate factor tables with --generate-factors
+
 # run locally with the python script
 # time python3 scripts/run.py --jar $LDBC_FINBENCH_DATAGEN_JAR --main-class ldbc.finbench.datagen.LdbcDatagen --memory 500g -- --scale-factor 30 --output-dir ${OUTPUT_DIR}
 
 # run locally with spark-submit command
 # **({'spark.driver.extraJavaOptions': '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005'}), # Debug
 # **({'spark.executor.extraJavaOptions': '-verbose:gc -XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'}),
+# --conf "spark.memory.offHeap.enabled=true" \
+# --conf "spark.memory.offHeap.size=100g" \
+# --conf "spark.storage.memoryFraction=0" \
+# --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+
 time spark-submit --master local[*] \
     --class ldbc.finbench.datagen.LdbcDatagen \
     --driver-memory 480g \
@@ -24,3 +31,20 @@ time spark-submit --master local[*] \
     ${LDBC_FINBENCH_DATAGEN_JAR} \
     --scale-factor 10 \
     --output-dir ${OUTPUT_DIR}
+
+# currently works on SF100
+#time spark-submit --master local[*] \
+#    --class ldbc.finbench.datagen.LdbcDatagen \
+#    --driver-memory 400g \
+#    --conf "spark.default.parallelism=800" \
+#    --conf "spark.shuffle.compress=true" \
+#    --conf "spark.shuffle.spill.compress=true" \
+#    --conf "spark.kryoserializer.buffer.max=512m" \
+#    --conf "spark.driver.maxResultSize=0" \
+#    --conf "spark.driver.extraJavaOptions=-Xss512m" \
+#    --conf "spark.executor.extraJavaOptions=-Xss512m -XX:+UseG1GC" \
+#    --conf "spark.kryo.referenceTracking=false" \
+#    ${LDBC_FINBENCH_DATAGEN_JAR} \
+#    --scale-factor 100 \
+#    --output-dir ${OUTPUT_DIR}
+
diff --git a/src/main/scala/ldbc/finbench/datagen/LdbcDatagen.scala b/src/main/scala/ldbc/finbench/datagen/LdbcDatagen.scala
index 18d74096..16f17d7e 100644
--- a/src/main/scala/ldbc/finbench/datagen/LdbcDatagen.scala
+++ b/src/main/scala/ldbc/finbench/datagen/LdbcDatagen.scala
@@ -1,9 +1,7 @@
 package ldbc.finbench.datagen
 
 import ldbc.finbench.datagen.factors.FactorGenerationStage
-import ldbc.finbench.datagen.generation.dictionary.Dictionaries
 import ldbc.finbench.datagen.generation.GenerationStage
-import ldbc.finbench.datagen.transformation.TransformationStage
 import ldbc.finbench.datagen.util.{Logging, SparkApp}
 import shapeless.lens
 
@@ -24,7 +22,7 @@ object LdbcDatagen extends SparkApp with Logging {
       format: String = "csv",
       formatOptions: Map[String, String] = Map.empty,
       epochMillis: Boolean = false,
-      generateFactors: Boolean = true,
+      generateFactors: Boolean = false,
       factorFormat: String = "parquet"
   )
 
@@ -121,22 +119,22 @@ object LdbcDatagen extends SparkApp with Logging {
   }
 
   override def run(args: ArgsType): Unit = {
-    val generationArgs = GenerationStage.Args(
-      scaleFactor = args.scaleFactor,
-      outputDir = args.outputDir,
-      format = args.format,
-      partitionsOpt = args.numPartitions
-    )
-    log.info("[Main] Starting generation stage")
-    GenerationStage.run(generationArgs)
-
-    if (args.generateFactors) {
-      val factorArgs = FactorGenerationStage.Args(
-        outputDir = args.outputDir,
-        format = args.factorFormat
+    if (!args.generateFactors) {
+      GenerationStage.run(
+        GenerationStage.Args(
+          scaleFactor = args.scaleFactor,
+          outputDir = args.outputDir,
+          format = args.format,
+          partitionsOpt = args.numPartitions
+        )
+      )
+    } else {
+      FactorGenerationStage.run(
+        FactorGenerationStage.Args(
+          outputDir = args.outputDir,
+          format = args.factorFormat
+        )
       )
-      log.info("[Main] Starting factoring stage")
-//       FactorGenerationStage.run(factorArgs)
     }
   }
 }
diff --git a/src/main/scala/ldbc/finbench/datagen/factors/FactorGenerationStage.scala b/src/main/scala/ldbc/finbench/datagen/factors/FactorGenerationStage.scala
index e0bf0973..993348d0 100644
--- a/src/main/scala/ldbc/finbench/datagen/factors/FactorGenerationStage.scala
+++ b/src/main/scala/ldbc/finbench/datagen/factors/FactorGenerationStage.scala
@@ -1,5 +1,6 @@
 package ldbc.finbench.datagen.factors
 
+import ldbc.finbench.datagen.LdbcDatagen.log
 import ldbc.finbench.datagen.util.DatagenStage
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, SparkSession, functions => F}
@@ -10,7 +11,6 @@ import shapeless.lens
 import scala.util.matching.Regex
 
 object FactorGenerationStage extends DatagenStage {
-
   @transient lazy val log: Logger = LoggerFactory.getLogger(this.getClass)
 
   case class Args(
@@ -64,14 +64,14 @@ object FactorGenerationStage extends DatagenStage {
     run(parsedArgs)
   }
 
-  // execute factorization process
-  // TODO: finish all
+
   override def run(args: Args) = {
-    parameterCuration(args)
+    factortables(args)
   }
 
-  def parameterCuration(args: Args)(implicit spark: SparkSession) = {
+  def factortables(args: Args)(implicit spark: SparkSession) = {
     import spark.implicits._
+    log.info("[Main] Starting factoring stage")
 
     val transferRDD = spark.read
       .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
@@ -533,6 +533,5 @@ object FactorGenerationStage extends DatagenStage {
       .option("delimiter", "|")
       .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
       .save(s"${args.outputDir}/factor_table/upstream_amount")
-
   }
 }
diff --git a/src/main/scala/ldbc/finbench/datagen/generation/GenerationStage.scala b/src/main/scala/ldbc/finbench/datagen/generation/GenerationStage.scala
index 3283a7ac..72d4c6b2 100644
--- a/src/main/scala/ldbc/finbench/datagen/generation/GenerationStage.scala
+++ b/src/main/scala/ldbc/finbench/datagen/generation/GenerationStage.scala
@@ -1,5 +1,6 @@
 package ldbc.finbench.datagen.generation
 
+import ldbc.finbench.datagen.LdbcDatagen.log
 import ldbc.finbench.datagen.config.{ConfigParser, DatagenConfiguration}
 import ldbc.finbench.datagen.io.raw.{Csv, Parquet, RawSink}
 import ldbc.finbench.datagen.util._
@@ -25,6 +26,7 @@ object GenerationStage extends DatagenStage with Logging {
   override type ArgsType = Args
 
   override def run(args: Args): Unit = {
+    log.info("[Main] Starting generation stage")
     // build and initialize the configs
     val config = buildConfig(args)
     // OPT: It is called in each SparkGenerator in Spark to initialize the context on the executors.
diff --git a/tools/DataProfiler/result/db139/profile.log b/tools/DataProfiler/result/db139/profile.log
deleted file mode 100644
index 1ce23716..00000000
--- a/tools/DataProfiler/result/db139/profile.log
+++ /dev/null
@@ -1,4 +0,0 @@
-Run degree profiling.
-load_cost = 547.45(s)
-exec_cost = 897.82(s)
-total_cost = 1445.27(s)
diff --git a/tools/DataProfiler/result/db177/edges.log b/tools/DataProfiler/result/db177/edges.log
deleted file mode 100644
index f189d9ae..00000000
--- a/tools/DataProfiler/result/db177/edges.log
+++ /dev/null
@@ -1,2 +0,0 @@
-V 242919058, E 508196712, E/V 2.09204
-Unique edges: 2.58058e+08 / 508196712, Multiplicity: 1.96931
diff --git a/tools/DataProfiler/result/db177/profile.log b/tools/DataProfiler/result/db177/profile.log
deleted file mode 100644
index d702927a..00000000
--- a/tools/DataProfiler/result/db177/profile.log
+++ /dev/null
@@ -1,4 +0,0 @@
-Run degree profiling.
-load_cost = 418.57(s)
-exec_cost = 1180.35(s)
-total_cost = 1598.92(s)
diff --git a/tools/DataProfiler/result/db184/profile.log b/tools/DataProfiler/result/db184/profile.log
deleted file mode 100644
index 21a09254..00000000
--- a/tools/DataProfiler/result/db184/profile.log
+++ /dev/null
@@ -1,4 +0,0 @@
-Run degree profiling.
-load_cost = 377.06(s)
-exec_cost = 713.83(s)
-total_cost = 1090.90(s)
diff --git a/tools/DataProfiler/result/transfer/edges.log b/tools/DataProfiler/result/transfer/edges.log
deleted file mode 100644
index 71307300..00000000
--- a/tools/DataProfiler/result/transfer/edges.log
+++ /dev/null
@@ -1,2 +0,0 @@
-V 181123257, E 527220984, E/V 2.91084
-Unique edges: 2.52555e+08 / 527220984, Multiplicity: 2.08755
diff --git a/tools/DataProfiler/result/transfer/profile.log b/tools/DataProfiler/result/transfer/profile.log
deleted file mode 100644
index 0b7c6899..00000000
--- a/tools/DataProfiler/result/transfer/profile.log
+++ /dev/null
@@ -1,4 +0,0 @@
-Run degree profiling.
-load_cost = 399.40(s)
-exec_cost = 893.24(s)
-total_cost = 1292.64(s)
diff --git a/tools/README.md b/tools/README.md
index 352d8d7c..73177dda 100644
--- a/tools/README.md
+++ b/tools/README.md
@@ -1,6 +1,62 @@
 # Tools
 
-Here lists some tools for graph data processing.
-- dataprofiler: a tool for profiling graph data, including degree distribution, etc.
-- graphgen: a simple tool/example code to generate power-law distributed graph data.
-- paramgen: a Parameter Search tool to generate parameters for queries using TuGraph.
\ No newline at end of file
+- paramgen:
+  - parameter_curation: a tool for generating parameters for finbench queries
+- check_*.py: python scripts used for check the data features like consistency, distribution
+- merge_cluster_output.py: a python script to merge the output in cluster mode
+- statistic.py: a python script to calculate the statistics of the data
+- legacy: some legacy tools
+  - dataprofiler: a tool for profiling graph data, including degree distribution, etc.
+  - graphgen: a simple tool/example code to generate power-law distributed graph data.
+  - factorgen: factor table generators in python version
+
+
+## ParamsGen
+
+`params_gen.py` uses the CREATE_VALIDATION feature to generate parameters.
+
+The specific steps are as follows:
+
+1. Select vertices of type Account, Person, and Loan from the dataset, and generate a parameter file that meets the input specifications for ldbc_finbench_driver.
+2. Execute CREATE_VALIDATION to generate validation_params.csv.
+3. Select non-empty results from validation_params.csv.
+
+Example:
+
+```bash
+python3 params_gen.py 1 # gen tcr1 params
+```
+
+Other notes:
+
+1. The generated start_timestamp and end_timestamp in the current version are fixed values.
+2. For tcr4 and tcr10, this method is not efficient enough. Use the following Cypher query to search for parameters:
+
+```Cypher
+// tcr4
+MATCH
+    (n1:Account)-[:transfer]->
+    (n2:Account)-[:transfer]->
+    (n3:Account)-[:transfer]->(n4:Account)
+WHERE
+    n1.id = n4.id AND n1.id > n2.id AND n2.id > n3.id
+WITH
+	  n1.id as n1id,
+    n2.id as n2id,
+    n3.id as n3id,
+    n4.id as n4id
+LIMIT 1000
+RETURN DISTINCT toString(n1id)+"|"+toString(n2id)
+
+// tcr10
+MATCH
+    (c:Company)<-[:invest]-(p:Person)
+WITH
+	  c.id as cid,
+    count(p.id) as num,
+		collect(p.id) as person
+WHERE num >= 2
+RETURN
+    tostring(person[0])+"|"+tostring(person[1])
+LIMIT 1000
+```
diff --git a/tools/DataProfiler/.gitignore b/tools/legacy/dataprofiler/.gitignore
similarity index 100%
rename from tools/DataProfiler/.gitignore
rename to tools/legacy/dataprofiler/.gitignore
diff --git a/tools/DataProfiler/CMakeLists.txt b/tools/legacy/dataprofiler/CMakeLists.txt
similarity index 100%
rename from tools/DataProfiler/CMakeLists.txt
rename to tools/legacy/dataprofiler/CMakeLists.txt
diff --git a/tools/DataProfiler/README.md b/tools/legacy/dataprofiler/README.md
similarity index 100%
rename from tools/DataProfiler/README.md
rename to tools/legacy/dataprofiler/README.md
diff --git a/tools/DataProfiler/algo.h b/tools/legacy/dataprofiler/algo.h
similarity index 100%
rename from tools/DataProfiler/algo.h
rename to tools/legacy/dataprofiler/algo.h
diff --git a/tools/DataProfiler/compile.sh b/tools/legacy/dataprofiler/compile.sh
similarity index 100%
rename from tools/DataProfiler/compile.sh
rename to tools/legacy/dataprofiler/compile.sh
diff --git a/tools/DataProfiler/de_core.cpp b/tools/legacy/dataprofiler/de_core.cpp
similarity index 100%
rename from tools/DataProfiler/de_core.cpp
rename to tools/legacy/dataprofiler/de_core.cpp
diff --git a/tools/DataProfiler/plot.py b/tools/legacy/dataprofiler/plot.py
similarity index 100%
rename from tools/DataProfiler/plot.py
rename to tools/legacy/dataprofiler/plot.py
diff --git a/tools/DataProfiler/profiler.cpp b/tools/legacy/dataprofiler/profiler.cpp
similarity index 100%
rename from tools/DataProfiler/profiler.cpp
rename to tools/legacy/dataprofiler/profiler.cpp
diff --git a/tools/DataProfiler/result/db139/edges.txt b/tools/legacy/dataprofiler/result/db139/edges.txt
similarity index 100%
rename from tools/DataProfiler/result/db139/edges.txt
rename to tools/legacy/dataprofiler/result/db139/edges.txt
diff --git a/tools/DataProfiler/result/db139/in-out.txt b/tools/legacy/dataprofiler/result/db139/in-out.txt
similarity index 100%
rename from tools/DataProfiler/result/db139/in-out.txt
rename to tools/legacy/dataprofiler/result/db139/in-out.txt
diff --git a/tools/DataProfiler/result/db139/in_degree_dist.png b/tools/legacy/dataprofiler/result/db139/in_degree_dist.png
similarity index 100%
rename from tools/DataProfiler/result/db139/in_degree_dist.png
rename to tools/legacy/dataprofiler/result/db139/in_degree_dist.png
diff --git a/tools/DataProfiler/result/db139/in_degree_dist.txt b/tools/legacy/dataprofiler/result/db139/in_degree_dist.txt
similarity index 100%
rename from tools/DataProfiler/result/db139/in_degree_dist.txt
rename to tools/legacy/dataprofiler/result/db139/in_degree_dist.txt
diff --git a/tools/DataProfiler/result/db139/in_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.png
similarity index 100%
rename from tools/DataProfiler/result/db139/in_degree_dist_regression.png
rename to tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.png
diff --git a/tools/DataProfiler/result/db139/in_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/db139/in_degree_dist_regression.txt
rename to tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.txt
diff --git a/tools/DataProfiler/result/db139/out-in.txt b/tools/legacy/dataprofiler/result/db139/out-in.txt
similarity index 100%
rename from tools/DataProfiler/result/db139/out-in.txt
rename to tools/legacy/dataprofiler/result/db139/out-in.txt
diff --git a/tools/DataProfiler/result/db139/out_degree_dist.png b/tools/legacy/dataprofiler/result/db139/out_degree_dist.png
similarity index 100%
rename from tools/DataProfiler/result/db139/out_degree_dist.png
rename to tools/legacy/dataprofiler/result/db139/out_degree_dist.png
diff --git a/tools/DataProfiler/result/db139/out_degree_dist.txt b/tools/legacy/dataprofiler/result/db139/out_degree_dist.txt
similarity index 100%
rename from tools/DataProfiler/result/db139/out_degree_dist.txt
rename to tools/legacy/dataprofiler/result/db139/out_degree_dist.txt
diff --git a/tools/DataProfiler/result/db139/out_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.png
similarity index 100%
rename from tools/DataProfiler/result/db139/out_degree_dist_regression.png
rename to tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.png
diff --git a/tools/DataProfiler/result/db139/out_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/db139/out_degree_dist_regression.txt
rename to tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.txt
diff --git a/tools/DataProfiler/result/db177/in-out.txt b/tools/legacy/dataprofiler/result/db177/in-out.txt
similarity index 100%
rename from tools/DataProfiler/result/db177/in-out.txt
rename to tools/legacy/dataprofiler/result/db177/in-out.txt
diff --git a/tools/DataProfiler/result/db177/in_degree_dist.png b/tools/legacy/dataprofiler/result/db177/in_degree_dist.png
similarity index 100%
rename from tools/DataProfiler/result/db177/in_degree_dist.png
rename to tools/legacy/dataprofiler/result/db177/in_degree_dist.png
diff --git a/tools/DataProfiler/result/db177/in_degree_dist.txt b/tools/legacy/dataprofiler/result/db177/in_degree_dist.txt
similarity index 100%
rename from tools/DataProfiler/result/db177/in_degree_dist.txt
rename to tools/legacy/dataprofiler/result/db177/in_degree_dist.txt
diff --git a/tools/DataProfiler/result/db177/in_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.png
similarity index 100%
rename from tools/DataProfiler/result/db177/in_degree_dist_regression.png
rename to tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.png
diff --git a/tools/DataProfiler/result/db177/in_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/db177/in_degree_dist_regression.txt
rename to tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.txt
diff --git a/tools/DataProfiler/result/db177/out-in.txt b/tools/legacy/dataprofiler/result/db177/out-in.txt
similarity index 100%
rename from tools/DataProfiler/result/db177/out-in.txt
rename to tools/legacy/dataprofiler/result/db177/out-in.txt
diff --git a/tools/DataProfiler/result/db177/out_degree_dist.png b/tools/legacy/dataprofiler/result/db177/out_degree_dist.png
similarity index 100%
rename from tools/DataProfiler/result/db177/out_degree_dist.png
rename to tools/legacy/dataprofiler/result/db177/out_degree_dist.png
diff --git a/tools/DataProfiler/result/db177/out_degree_dist.txt b/tools/legacy/dataprofiler/result/db177/out_degree_dist.txt
similarity index 100%
rename from tools/DataProfiler/result/db177/out_degree_dist.txt
rename to tools/legacy/dataprofiler/result/db177/out_degree_dist.txt
diff --git a/tools/DataProfiler/result/db177/out_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.png
similarity index 100%
rename from tools/DataProfiler/result/db177/out_degree_dist_regression.png
rename to tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.png
diff --git a/tools/DataProfiler/result/db177/out_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/db177/out_degree_dist_regression.txt
rename to tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.txt
diff --git a/tools/DataProfiler/result/db184/edges.txt b/tools/legacy/dataprofiler/result/db184/edges.txt
similarity index 100%
rename from tools/DataProfiler/result/db184/edges.txt
rename to tools/legacy/dataprofiler/result/db184/edges.txt
diff --git a/tools/DataProfiler/result/db184/in-out.txt b/tools/legacy/dataprofiler/result/db184/in-out.txt
similarity index 100%
rename from tools/DataProfiler/result/db184/in-out.txt
rename to tools/legacy/dataprofiler/result/db184/in-out.txt
diff --git a/tools/DataProfiler/result/db184/in_degree_dist.png b/tools/legacy/dataprofiler/result/db184/in_degree_dist.png
similarity index 100%
rename from tools/DataProfiler/result/db184/in_degree_dist.png
rename to tools/legacy/dataprofiler/result/db184/in_degree_dist.png
diff --git a/tools/DataProfiler/result/db184/in_degree_dist.txt b/tools/legacy/dataprofiler/result/db184/in_degree_dist.txt
similarity index 100%
rename from tools/DataProfiler/result/db184/in_degree_dist.txt
rename to tools/legacy/dataprofiler/result/db184/in_degree_dist.txt
diff --git a/tools/DataProfiler/result/db184/in_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.png
similarity index 100%
rename from tools/DataProfiler/result/db184/in_degree_dist_regression.png
rename to tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.png
diff --git a/tools/DataProfiler/result/db184/in_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/db184/in_degree_dist_regression.txt
rename to tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.txt
diff --git a/tools/DataProfiler/result/db184/out-in.txt b/tools/legacy/dataprofiler/result/db184/out-in.txt
similarity index 100%
rename from tools/DataProfiler/result/db184/out-in.txt
rename to tools/legacy/dataprofiler/result/db184/out-in.txt
diff --git a/tools/DataProfiler/result/db184/out_degree_dist.png b/tools/legacy/dataprofiler/result/db184/out_degree_dist.png
similarity index 100%
rename from tools/DataProfiler/result/db184/out_degree_dist.png
rename to tools/legacy/dataprofiler/result/db184/out_degree_dist.png
diff --git a/tools/DataProfiler/result/db184/out_degree_dist.txt b/tools/legacy/dataprofiler/result/db184/out_degree_dist.txt
similarity index 100%
rename from tools/DataProfiler/result/db184/out_degree_dist.txt
rename to tools/legacy/dataprofiler/result/db184/out_degree_dist.txt
diff --git a/tools/DataProfiler/result/db184/out_degree_dist_regression.png b/tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.png
similarity index 100%
rename from tools/DataProfiler/result/db184/out_degree_dist_regression.png
rename to tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.png
diff --git a/tools/DataProfiler/result/db184/out_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/db184/out_degree_dist_regression.txt
rename to tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_1_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_2_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_3_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_4_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.txt
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5_regression.png b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_indeg/hub_indeg_5_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.txt
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png
diff --git a/tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt b/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt
rename to tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt
diff --git a/tools/DataProfiler/result/transfer/in-out.txt b/tools/legacy/dataprofiler/result/transfer/in-out.txt
similarity index 100%
rename from tools/DataProfiler/result/transfer/in-out.txt
rename to tools/legacy/dataprofiler/result/transfer/in-out.txt
diff --git a/tools/DataProfiler/result/transfer/in_degree_dist.png b/tools/legacy/dataprofiler/result/transfer/in_degree_dist.png
similarity index 100%
rename from tools/DataProfiler/result/transfer/in_degree_dist.png
rename to tools/legacy/dataprofiler/result/transfer/in_degree_dist.png
diff --git a/tools/DataProfiler/result/transfer/in_degree_dist.txt b/tools/legacy/dataprofiler/result/transfer/in_degree_dist.txt
similarity index 100%
rename from tools/DataProfiler/result/transfer/in_degree_dist.txt
rename to tools/legacy/dataprofiler/result/transfer/in_degree_dist.txt
diff --git a/tools/DataProfiler/result/transfer/in_degree_dist_regression.png b/tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.png
similarity index 100%
rename from tools/DataProfiler/result/transfer/in_degree_dist_regression.png
rename to tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.png
diff --git a/tools/DataProfiler/result/transfer/in_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/transfer/in_degree_dist_regression.txt
rename to tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.txt
diff --git a/tools/DataProfiler/result/transfer/out-in.txt b/tools/legacy/dataprofiler/result/transfer/out-in.txt
similarity index 100%
rename from tools/DataProfiler/result/transfer/out-in.txt
rename to tools/legacy/dataprofiler/result/transfer/out-in.txt
diff --git a/tools/DataProfiler/result/transfer/out_degree_dist.png b/tools/legacy/dataprofiler/result/transfer/out_degree_dist.png
similarity index 100%
rename from tools/DataProfiler/result/transfer/out_degree_dist.png
rename to tools/legacy/dataprofiler/result/transfer/out_degree_dist.png
diff --git a/tools/DataProfiler/result/transfer/out_degree_dist.txt b/tools/legacy/dataprofiler/result/transfer/out_degree_dist.txt
similarity index 100%
rename from tools/DataProfiler/result/transfer/out_degree_dist.txt
rename to tools/legacy/dataprofiler/result/transfer/out_degree_dist.txt
diff --git a/tools/DataProfiler/result/transfer/out_degree_dist_regression.png b/tools/legacy/dataprofiler/result/transfer/out_degree_dist_regression.png
similarity index 100%
rename from tools/DataProfiler/result/transfer/out_degree_dist_regression.png
rename to tools/legacy/dataprofiler/result/transfer/out_degree_dist_regression.png
diff --git a/tools/DataProfiler/result/transfer/out_degree_dist_regression.txt b/tools/legacy/dataprofiler/result/transfer/out_degree_dist_regression.txt
similarity index 100%
rename from tools/DataProfiler/result/transfer/out_degree_dist_regression.txt
rename to tools/legacy/dataprofiler/result/transfer/out_degree_dist_regression.txt
diff --git a/tools/DataProfiler/wcc_core.cpp b/tools/legacy/dataprofiler/wcc_core.cpp
similarity index 100%
rename from tools/DataProfiler/wcc_core.cpp
rename to tools/legacy/dataprofiler/wcc_core.cpp
diff --git a/tools/paramgen/legacy/factor_table.sh b/tools/legacy/factorgen/factor_table.sh
similarity index 100%
rename from tools/paramgen/legacy/factor_table.sh
rename to tools/legacy/factorgen/factor_table.sh
diff --git a/tools/paramgen/legacy/generate_account.py b/tools/legacy/factorgen/generate_account.py
similarity index 100%
rename from tools/paramgen/legacy/generate_account.py
rename to tools/legacy/factorgen/generate_account.py
diff --git a/tools/paramgen/legacy/loan.py b/tools/legacy/factorgen/loan.py
similarity index 100%
rename from tools/paramgen/legacy/loan.py
rename to tools/legacy/factorgen/loan.py
diff --git a/tools/paramgen/legacy/params_gen.properties b/tools/legacy/factorgen/params_gen.properties
similarity index 100%
rename from tools/paramgen/legacy/params_gen.properties
rename to tools/legacy/factorgen/params_gen.properties
diff --git a/tools/paramgen/legacy/params_gen.py b/tools/legacy/factorgen/params_gen.py
similarity index 100%
rename from tools/paramgen/legacy/params_gen.py
rename to tools/legacy/factorgen/params_gen.py
diff --git a/tools/paramgen/legacy/split_amount.py b/tools/legacy/factorgen/split_amount.py
similarity index 100%
rename from tools/paramgen/legacy/split_amount.py
rename to tools/legacy/factorgen/split_amount.py
diff --git a/tools/paramgen/legacy/time_split.py b/tools/legacy/factorgen/time_split.py
similarity index 100%
rename from tools/paramgen/legacy/time_split.py
rename to tools/legacy/factorgen/time_split.py
diff --git a/tools/GraphGen/Makefile b/tools/legacy/graphgen/Makefile
similarity index 100%
rename from tools/GraphGen/Makefile
rename to tools/legacy/graphgen/Makefile
diff --git a/tools/GraphGen/README.md b/tools/legacy/graphgen/README.md
similarity index 100%
rename from tools/GraphGen/README.md
rename to tools/legacy/graphgen/README.md
diff --git a/tools/GraphGen/graph_gen.c b/tools/legacy/graphgen/graph_gen.c
similarity index 100%
rename from tools/GraphGen/graph_gen.c
rename to tools/legacy/graphgen/graph_gen.c
diff --git a/scripts/merge_cluster_output.py b/tools/merge_cluster_output.py
similarity index 100%
rename from scripts/merge_cluster_output.py
rename to tools/merge_cluster_output.py
diff --git a/tools/paramgen/README.md b/tools/paramgen/README.md
deleted file mode 100644
index 41d0e732..00000000
--- a/tools/paramgen/README.md
+++ /dev/null
@@ -1,49 +0,0 @@
-## ParamsGen
-
-`params_gen.py` uses the CREATE_VALIDATION feature to generate parameters. 
-
-The specific steps are as follows:
-
-1. Select vertices of type Account, Person, and Loan from the dataset, and generate a parameter file that meets the input specifications for ldbc_finbench_driver.
-2. Execute CREATE_VALIDATION to generate validation_params.csv.
-3. Select non-empty results from validation_params.csv.
-
-Example:
-
-```bash
-python3 params_gen.py 1 # gen tcr1 params
-```
-
-Other notes:
-
-1. The generated start_timestamp and end_timestamp in the current version are fixed values.
-2. For tcr4 and tcr10, this method is not efficient enough. Use the following Cypher query to search for parameters:
-
-```Cypher
-// tcr4
-MATCH
-    (n1:Account)-[:transfer]->
-    (n2:Account)-[:transfer]->
-    (n3:Account)-[:transfer]->(n4:Account)
-WHERE
-    n1.id = n4.id AND n1.id > n2.id AND n2.id > n3.id
-WITH
-	  n1.id as n1id,
-    n2.id as n2id,
-    n3.id as n3id,
-    n4.id as n4id
-LIMIT 1000
-RETURN DISTINCT toString(n1id)+"|"+toString(n2id)
-
-// tcr10
-MATCH
-    (c:Company)<-[:invest]-(p:Person)
-WITH
-	  c.id as cid,
-    count(p.id) as num,
-		collect(p.id) as person
-WHERE num >= 2
-RETURN
-    tostring(person[0])+"|"+tostring(person[1])
-LIMIT 1000
-```
diff --git a/tools/paramgen/parameter_curation.py b/tools/paramgen/parameter_curation.py
index 7c92e6f2..6a4de363 100644
--- a/tools/paramgen/parameter_curation.py
+++ b/tools/paramgen/parameter_curation.py
@@ -17,6 +17,10 @@
 from glob import glob
 import concurrent.futures
 from functools import partial
+import sys
+
+table_dir = sys.argv[1]
+out_dir = sys.argv[2]
 
 THRESH_HOLD = 0
 THRESH_HOLD_6 = 0
@@ -29,20 +33,16 @@
 else:
     TRUNCATION_ORDER = "AMOUNT_DESCENDING"
 
-table_dir = '../../out/factor_table'
-out_dir = '../../out/substitute_parameters/'
 
 def process_csv(file_path):
     all_files = glob(file_path + '/*.csv')
-    
     df_list = []
-    
+
     for filename in all_files:
         df = pd.read_csv(filename, delimiter='|')
         df_list.append(df)
-    
+
     combined_df = pd.concat(df_list, ignore_index=True)
-    
     return combined_df
 
 
@@ -52,7 +52,7 @@ def __init__(self):
         self.inputs = []
 
     def setOutputFile(self, outputFile):
-        self.outputFile=outputFile
+        self.outputFile = outputFile
 
     def registerHandler(self, handler, inputParams, header):
         handler.header = header
@@ -63,7 +63,7 @@ def writeCSV(self):
         dir_path = os.path.dirname(self.outputFile)
         if not os.path.exists(dir_path):
             os.makedirs(dir_path)
-        output = codecs.open( self.outputFile, "w",encoding="utf-8")
+        output = codecs.open(self.outputFile, "w", encoding="utf-8")
 
         if len(self.inputs) == 0:
             return
@@ -105,7 +105,8 @@ def find_neighbors(account_list, account_account_df, account_amount_df, amount_b
                 transfer_in_amount = account_amount_df.loc[item]['amount']
             except KeyError:
                 transfer_in_amount = 0
-            result.update(neighbors_with_truncate_threshold(transfer_in_amount, rows_account_list, rows_amount_bucket, num_list))
+            result.update(
+                neighbors_with_truncate_threshold(transfer_in_amount, rows_account_list, rows_amount_bucket, num_list))
 
     elif query_id in [1, 2, 5]:
         for item in account_list:
@@ -131,7 +132,6 @@ def find_neighbors(account_list, account_account_df, account_amount_df, amount_b
 
 
 def filter_neighbors(account_list, amount_bucket_df, num_list, account_id):
-
     # print(f'account_list: {account_list}')
 
     rows_amount_bucket = amount_bucket_df.loc[account_id]
@@ -144,20 +144,20 @@ def filter_neighbors(account_list, amount_bucket_df, num_list, account_id):
             header_at_limit = int(col)
             break
 
-    partial_apply = partial(filter_neighbors_with_truncate_threshold, amount_bucket_df=amount_bucket_df, num_list=num_list, header_at_limit=header_at_limit)
+    partial_apply = partial(filter_neighbors_with_truncate_threshold, amount_bucket_df=amount_bucket_df,
+                            num_list=num_list, header_at_limit=header_at_limit)
     account_mapped = map(partial_apply, account_list)
     result = [x for x in account_mapped if x is not None]
     return result
 
 
 def filter_neighbors_with_truncate_threshold(item, amount_bucket_df, num_list, header_at_limit):
-    
     if item[1] > THRESH_HOLD_6:
         if header_at_limit == -1:
             return item[0]
         if (TIME_TRUNCATE and item[2] >= header_at_limit) or (not TIME_TRUNCATE and item[1] >= header_at_limit):
             return item[0]
-    
+
     return None
 
 
@@ -183,7 +183,7 @@ def neighbors_with_truncate_threshold(transfer_in_amount, rows_account_list, row
             return [t[0] for t in temp if t[1] >= header_at_limit]
     else:
         return [t[0] for t in temp]
-    
+
 
 def neighbors_with_trancate(rows_account_list, rows_amount_bucket, num_list):
     if rows_amount_bucket is None:
@@ -215,7 +215,6 @@ def process_get_neighbors(chunk, account_account_df, account_amount_df, amount_b
 
 
 def process_filter_neighbors(chunk, amount_bucket_df, num_list):
-
     first_column_name = chunk.columns[0]
     second_column_name = chunk.columns[1]
 
@@ -227,7 +226,6 @@ def process_filter_neighbors(chunk, amount_bucket_df, num_list):
 
 
 def get_next_neighbor_list(neighbors_df, account_account_df, account_amount_df, amount_bucket_df, query_id):
-
     num_list = []
     if query_id != 3 and query_id != 11:
         num_list = [x for x in amount_bucket_df.columns.tolist()]
@@ -237,10 +235,12 @@ def get_next_neighbor_list(neighbors_df, account_account_df, account_amount_df,
     chunks = np.array_split(neighbors_df, query_parallelism)
 
     with concurrent.futures.ProcessPoolExecutor(max_workers=query_parallelism) as executor:
-        futures = [executor.submit(process_get_neighbors, chunk, account_account_df, account_amount_df, amount_bucket_df, num_list, query_id) for chunk in chunks]
+        futures = [
+            executor.submit(process_get_neighbors, chunk, account_account_df, account_amount_df, amount_bucket_df,
+                            num_list, query_id) for chunk in chunks]
         results = [future.result() for future in concurrent.futures.as_completed(futures)]
         executor.shutdown(wait=True)
-    
+
     next_neighbors_df = pd.concat(results)
     next_neighbors_df = next_neighbors_df.sort_index()
     return next_neighbors_df
@@ -271,16 +271,18 @@ def process_batch(batch, basic_sum_df, first_column_name, second_column_name):
     ).drop(columns=[second_column_name])
     return merged_df.groupby(first_column_name).sum()
 
+
 def get_next_sum_table(neighbors_df, basic_sum_df, batch_size=BATCH_SIZE):
     first_column_name = neighbors_df.columns[0]
     second_column_name = neighbors_df.columns[1]
 
     batches = [neighbors_df.iloc[start:start + batch_size] for start in range(0, len(neighbors_df), batch_size)]
-    
+
     result_list = []
     query_parallelism = max(1, multiprocessing.cpu_count() // 4)
     with concurrent.futures.ProcessPoolExecutor(max_workers=query_parallelism) as executor:
-        futures = [executor.submit(process_batch, batch, basic_sum_df, first_column_name, second_column_name) for batch in batches]
+        futures = [executor.submit(process_batch, batch, basic_sum_df, first_column_name, second_column_name) for batch
+                   in batches]
         for future in futures:
             result_list.append(future.result())
         executor.shutdown(wait=True)
@@ -293,28 +295,32 @@ def get_next_sum_table(neighbors_df, basic_sum_df, batch_size=BATCH_SIZE):
 def handleThresholdParam(threshold):
     return str(threshold)
 
+
 def handleThreshold2Param(threshold2):
     return str(threshold2)
 
+
 def hendleIdParam(id):
     return str(id)
 
+
 def handleTruncateLimitParam(truncateLimit):
     return str(truncateLimit)
 
+
 def handleTruncateOrderParam(truncateOrder):
     return truncateOrder
 
 
 def handleTimeDurationParam(timeParam):
-    start = timegm(date(year=int(timeParam.year), month=int(timeParam.month), day=int(timeParam.day)).timetuple())*1000
+    start = timegm(
+        date(year=int(timeParam.year), month=int(timeParam.month), day=int(timeParam.day)).timetuple()) * 1000
     end = start + timeParam.duration * 3600 * 24 * 1000
     res = str(start) + "|" + str(end)
     return res
 
 
 def process_query(query_id):
-
     if query_id in [7, 10]:
         process_1_hop_query(query_id)
     elif query_id in [1, 2, 3, 5, 8, 11]:
@@ -324,7 +330,6 @@ def process_query(query_id):
 
 
 def process_iter_queries(query_id):
-
     upstream_amount_df = None
     upstream_amount_path = None
     amount_bucket_path = None
@@ -392,7 +397,6 @@ def process_iter_queries(query_id):
         output_path = os.path.join(out_dir, 'tcr11.txt')
         steps = 3
 
-
     first_account_df = process_csv(first_account_path)
     account_account_df = process_csv(account_account_path)
     if query_id == 8:
@@ -407,7 +411,7 @@ def process_iter_queries(query_id):
     first_account_name = account_account_df.columns[0]
     second_account_name = account_account_df.columns[1]
     first_amount_name = amount_bucket_df.columns[0]
-    first_time_name = time_bucket_df.columns[0]  
+    first_time_name = time_bucket_df.columns[0]
 
     account_account_df[second_account_name] = account_account_df[second_account_name].apply(literal_eval)
     first_account_df[second_column_name] = first_account_df[second_column_name].apply(literal_eval)
@@ -434,7 +438,8 @@ def process_iter_queries(query_id):
         if current_step == steps - 1:
             next_time_bucket = get_next_sum_table(first_neighbors_df, time_bucket_df)
         else:
-            first_neighbors_df = get_next_neighbor_list(first_neighbors_df, account_account_df, upstream_amount_df, amount_bucket_df, query_id)
+            first_neighbors_df = get_next_neighbor_list(first_neighbors_df, account_account_df, upstream_amount_df,
+                                                        amount_bucket_df, query_id)
 
         current_step += 1
 
@@ -506,13 +511,13 @@ def process_iter_queries(query_id):
             j = 0
             k = 0
             while True:
-                j = random.randint(0, len(final_first_items)-1)
-                k = random.randint(0, len(final_first_items)-1)
+                j = random.randint(0, len(final_first_items) - 1)
+                k = random.randint(0, len(final_first_items) - 1)
                 if final_first_items[j] != account_id and final_first_items[k] != account_id:
                     break
             final_second_items_3.append(final_first_items[j])
             final_second_items_4.append(final_first_items[k])
-        
+
         csvWriter_3 = CSVSerializer()
         csvWriter_3.setOutputFile(output_path + 'tcr3.txt')
         csvWriter_3.registerHandler(hendleIdParam, final_first_items, "id1")
@@ -520,7 +525,7 @@ def process_iter_queries(query_id):
         csvWriter_3.registerHandler(handleTimeDurationParam, time_list, "startTime|endTime")
         csvWriter_3.registerHandler(handleTruncateLimitParam, truncate_limit_list, "truncationLimit")
         csvWriter_3.registerHandler(handleTruncateOrderParam, truncate_order_list, "truncationOrder")
-        
+
         csvWriter_4 = CSVSerializer()
         csvWriter_4.setOutputFile(output_path + 'tcr4.txt')
         csvWriter_4.registerHandler(hendleIdParam, final_first_items, "id1")
@@ -547,7 +552,6 @@ def process_iter_queries(query_id):
 
 
 def process_1_hop_query(query_id):
-
     if query_id == 7:
         first_count_path = os.path.join(table_dir, 'account_in_out_count')
         time_bucket_path = os.path.join(table_dir, 'account_in_out_month')
@@ -594,13 +598,13 @@ def process_1_hop_query(query_id):
         csvWriter_9.writeCSV()
 
         print(f'query_id 7 and 9 finished')
-    
+
     elif query_id == 10:
         final_second_items = []
         for person in final_first_items:
             j = 0
             while True:
-                j = random.randint(0, len(final_first_items)-1)
+                j = random.randint(0, len(final_first_items) - 1)
                 if final_first_items[j] != person:
                     break
             final_second_items.append(final_first_items[j])
@@ -616,7 +620,6 @@ def process_1_hop_query(query_id):
 
 
 def process_withdraw_query():
-
     first_account_path = os.path.join(table_dir, 'account_withdraw_in_items')
     time_bucket_path = os.path.join(table_dir, 'transfer_in_month')
     if TIME_TRUNCATE:
@@ -640,7 +643,7 @@ def process_withdraw_query():
     withdraw_bucket_df.set_index(withdraw_first_name, inplace=True)
     transfer_bucket_df.set_index(transfer_first_name, inplace=True)
     time_bucket_df.set_index(time_first_name, inplace=True)
-    
+
     first_account_df[second_column_name] = first_account_df[second_column_name].apply(literal_eval)
     first_neighbors_df = first_account_df.sort_values(by=first_column_name)
 
diff --git a/tools/paramgen/search_params.py b/tools/paramgen/search_params.py
index 722cbaf2..3d104270 100644
--- a/tools/paramgen/search_params.py
+++ b/tools/paramgen/search_params.py
@@ -3,145 +3,149 @@
 
 
 def allclose(a, b, rtol=1e-05, atol=1e-08):
-	return abs(a - b) <= (atol + rtol * abs(b))
+    return abs(a - b) <= (atol + rtol * abs(b))
+
 
 def readFactors(f):
-	res = []
-	for line in f.readlines():
-		values = [item if index == 0  else int(item) for (index, item)  in enumerate(line.split(","))]
-		res.append(values)
+    res = []
+    for line in f.readlines():
+        values = [item if index == 0 else int(item) for (index, item) in enumerate(line.split(","))]
+        res.append(values)
+
+    return res
 
-	return res
 
 class Window:
-	def __init__(self, paramId, start, end):
-		self.param = paramId
-		self.start = start
-		self.end = end
-		self.avg = 0.0
-		self.stddev = 0.0
-		self.size = end-start+1
-
-	def __str__(self):
-		res = "[%d, %d] "%(self.start, self.end)
-		res += "size: %d, avg: %0.2f, stddev: %0.2f" % (self.size, self.avg, self.stddev) 
-		return res
+    def __init__(self, paramId, start, end):
+        self.param = paramId
+        self.start = start
+        self.end = end
+        self.avg = 0.0
+        self.stddev = 0.0
+        self.size = end - start + 1
+
+    def __str__(self):
+        res = "[%d, %d] " % (self.start, self.end)
+        res += "size: %d, avg: %0.2f, stddev: %0.2f" % (self.size, self.avg, self.stddev)
+        return res
+
 
 def getAverageCost(rows, key):
-	return float(sum([key(r) for r in rows])) / len(rows)
+    return float(sum([key(r) for r in rows])) / len(rows)
+
 
 def getCostStdDev(rows, avg, key):
-	return math.sqrt(sum([math.pow(key(r)-avg,2) for r in rows]) / len(rows))
+    return math.sqrt(sum([math.pow(key(r) - avg, 2) for r in rows]) / len(rows))
+
 
 def updateAverageCost(avg, oldelem, newelem, samplesize):
-	return avg + (newelem - oldelem) / samplesize
+    return avg + (newelem - oldelem) / samplesize
 
 
 def findWindows(factors, param, amount, bounds):
+    data = factors[bounds[0]: bounds[1]]
+    allWindows = []
+    start = 0
 
-	data = factors[bounds[0]: bounds[1]]
-	allWindows = []
-	start = 0
+    initWindow = Window(param, start, amount - 1)
+    initWindow.avg = getAverageCost(data[start:amount], itemgetter(param))
+    initWindow.stddev = getCostStdDev(data[start:amount], initWindow.avg, itemgetter(param))
 
-	initWindow = Window(param, start, amount-1)
-	initWindow.avg = getAverageCost(data[start:amount], itemgetter(param))
-	initWindow.stddev = getCostStdDev(data[start:amount], initWindow.avg, itemgetter(param))
+    s1 = sum([x[param] for x in data[start:start + amount]])
+    s2 = sum([x[param] * x[param] for x in data[start:start + amount]])
+    start += 1
+    allWindows.append(initWindow)
 
-	s1 = sum([x[param] for x in data[start:start+amount]])
-	s2 = sum([x[param]*x[param] for x in data[start:start+amount]])	
-	start += 1
-	allWindows.append(initWindow)
+    while start + amount < len(data):
+        end = start + amount
+        if data[end - 1][param] < 10:
+            break
 
-	while start + amount  < len(data):
-		end = start + amount 
-		if data[end-1][param]<10:
-			break
+        window = Window(param, bounds[0] + start, bounds[0] + end - 1)
 
-		window = Window(param, bounds[0]+start, bounds[0]+end-1)
+        # update the streaming stats about avg and stddev
+        s1 -= data[start - 1][param]
+        s1 += data[end - 1][param]
+        s2 -= data[start - 1][param] * data[start - 1][param]
+        s2 += data[end - 1][param] * data[end - 1][param]
 
-		# update the streaming stats about avg and stddev
-		s1 -= data[start-1][param]
-		s1 += data[end-1][param]
-		s2 -= data[start-1][param]*data[start-1][param]
-		s2 += data[end-1][param]*data[end-1][param]
+        window.avg = float(s1) / amount
+        window.stddev = math.sqrt(float(amount * s2 - s1 * s1)) / amount
 
-		window.avg = float(s1) / amount
-		window.stddev = math.sqrt(float(amount*s2 - s1*s1))/amount
+        allWindows.append(window)
+        start += 1
 
-		allWindows.append(window)
-		start+=1
+    allWindows.sort(key=lambda windows: windows.stddev)
 
-	allWindows.sort(key=lambda windows: windows.stddev)
+    res = []
+    first = allWindows[0]
+    iter = 0
 
-	res = []
-	first = allWindows[0]
-	iter = 0
+    while iter < len(allWindows) and allWindows[iter].stddev == first.stddev:
+        res.append(allWindows[iter])
+        iter += 1
 
-	while iter < len(allWindows) and allWindows[iter].stddev == first.stddev:
-		res.append(allWindows[iter])
-		iter+=1
-		
-	return res
+    return res
 
 
 def mergeWindows(windows):
-	res = []
+    res = []
 
-	cur = windows[0]
+    cur = windows[0]
 
-	iter = 1
-	constucted = cur
+    iter = 1
+    constucted = cur
 
-	while iter < len(windows):
-		while iter < len(windows) and windows[iter].start == cur.start+1 and allclose(windows[iter].avg, cur.avg):
-			cur = windows[iter]
-			constucted.end=cur.end
-			constucted.size+=1
-			iter+=1
+    while iter < len(windows):
+        while iter < len(windows) and windows[iter].start == cur.start + 1 and allclose(windows[iter].avg, cur.avg):
+            cur = windows[iter]
+            constucted.end = cur.end
+            constucted.size += 1
+            iter += 1
 
-		res.append(constucted)
-		if iter >= len(windows):
-			break
+        res.append(constucted)
+        if iter >= len(windows):
+            break
 
-		constucted = windows[iter]
-		cur = windows[iter]
-		iter += 1
+        constucted = windows[iter]
+        cur = windows[iter]
+        iter += 1
 
-	return res
+    return res
 
 
 def generate(factors, portion):
-	amount = int(len(factors)*portion)
-	params = len(factors[0]) -1
-
-	keys = [i for i in range(1,params+1)]
-	
-	factors = sorted(factors, key=itemgetter(*keys), reverse=True)
-	result = []
-	paramId = 1
-
-	current_windows = findWindows(factors, paramId, amount, (0,len(factors)))
-
-	while len(current_windows) > 1 and paramId < params:
-		paramId += 1
-		current_windows = mergeWindows(current_windows)
-		
-		new_windows = []
-		for w in current_windows:
-			w2 = findWindows(factors, paramId, amount, (w.start, w.end+1))
-			new_windows.extend(w2)
-
-		new_windows.sort(key=lambda w: w.stddev)
-		
-		current_windows = []
-		first = new_windows[0]
-		iter = 0
-
-		while iter < len(new_windows) and new_windows[iter].stddev == first.stddev :
-			current_windows.append(new_windows[iter])
-			iter+=1
-	
-	w = current_windows[0]
-
-	result.extend([factors[w.start+i][0] for i in range(amount)])
-	return result
+    amount = int(len(factors) * portion)
+    params = len(factors[0]) - 1
+
+    keys = [i for i in range(1, params + 1)]
+
+    factors = sorted(factors, key=itemgetter(*keys), reverse=True)
+    result = []
+    paramId = 1
+
+    current_windows = findWindows(factors, paramId, amount, (0, len(factors)))
+
+    while len(current_windows) > 1 and paramId < params:
+        paramId += 1
+        current_windows = mergeWindows(current_windows)
+
+        new_windows = []
+        for w in current_windows:
+            w2 = findWindows(factors, paramId, amount, (w.start, w.end + 1))
+            new_windows.extend(w2)
+
+        new_windows.sort(key=lambda w: w.stddev)
+
+        current_windows = []
+        first = new_windows[0]
+        iter = 0
+
+        while iter < len(new_windows) and new_windows[iter].stddev == first.stddev:
+            current_windows.append(new_windows[iter])
+            iter += 1
+
+    w = current_windows[0]
+
+    result.extend([factors[w.start + i][0] for i in range(amount)])
+    return result
diff --git a/tools/paramgen/time_select.py b/tools/paramgen/time_select.py
index 0d06b7c5..78450e19 100644
--- a/tools/paramgen/time_select.py
+++ b/tools/paramgen/time_select.py
@@ -1,25 +1,26 @@
 LAST_MONTHS = 3
 START_YEAR = 2020
 
+
 class MonthYearCount:
     def __init__(self, month, year, count):
-        self.month=month
-        self.year=year
-        self.count=count
+        self.month = month
+        self.year = year
+        self.count = count
 
 
 class TimeParameter:
     def __init__(self, year, month, day, duration):
-        self.month=month
-        self.year=year
-        self.day=day
-        self.duration=duration
-        
+        self.month = month
+        self.year = year
+        self.day = day
+        self.duration = duration
+
 
-def getMedian(data, sort_key, getEntireTuple = False):
+def getMedian(data, sort_key, getEntireTuple=False):
     if len(data) == 0:
         if getEntireTuple:
-            return MonthYearCount(0,0,0)
+            return MonthYearCount(0, 0, 0)
         return 0
 
     if len(data) == 1:
@@ -27,20 +28,20 @@ def getMedian(data, sort_key, getEntireTuple = False):
             return data[0]
         return data[0].count
 
-    srtd = sorted(data,key=sort_key)
-    mid = int(len(data)/2)
+    srtd = sorted(data, key=sort_key)
+    mid = int(len(data) / 2)
 
     if len(data) % 2 == 0:
         if getEntireTuple:
             return srtd[mid]
-        return (sort_key(srtd[mid-1]) + sort_key(srtd[mid])) / 2.0
+        return (sort_key(srtd[mid - 1]) + sort_key(srtd[mid])) / 2.0
 
     if getEntireTuple:
         return srtd[mid]
     return sort_key(srtd[mid])
 
 
-def computeTimeMedians(factors, lastmonthcount = LAST_MONTHS):
+def computeTimeMedians(factors, lastmonthcount=LAST_MONTHS):
     mediantimes = []
     lastmonths = []
     firstmonths = []
@@ -48,48 +49,47 @@ def computeTimeMedians(factors, lastmonthcount = LAST_MONTHS):
         values.sort(key=lambda myc: (myc.year, myc.month))
 
         l = len(values)
-        lastmonthsum = sum(myc.count for myc in values[max(l-lastmonthcount,0):l])
+        lastmonthsum = sum(myc.count for myc in values[max(l - lastmonthcount, 0):l])
         lastmonths.append(lastmonthsum)
-        cutoff_max = l-lastmonthcount
+        cutoff_max = l - lastmonthcount
         if cutoff_max < 0:
             cutoff_max = l
         firstmonthsum = sum(myc.count for myc in values[0:cutoff_max])
         firstmonths.append(firstmonthsum)
-        mediantimes.append(getMedian(values,lambda myc: myc.count))
+        mediantimes.append(getMedian(values, lambda myc: myc.count))
 
     median = getMedian(mediantimes, lambda x: x)
     medianLastMonth = getMedian(lastmonths, lambda x: x)
     medianFirstMonth = getMedian(firstmonths, lambda x: x)
 
     return medianFirstMonth, medianLastMonth, median
-        
+
 
 def getTimeParamsWithMedian(factors, medianFirstMonth, medianLastMonth, median):
     # strategy: find the median of the given distribution, then increase the time interval until it matches the given parameter
     res = []
     for values in factors:
-        input = sorted(values,key=lambda myc: (myc.year, myc.month))
-        currentMedian = getMedian(values,lambda myc: myc.count, True)
+        input = sorted(values, key=lambda myc: (myc.year, myc.month))
+        currentMedian = getMedian(values, lambda myc: myc.count, True)
         if int(median) == 0 or int(currentMedian.count) == 0 or int(currentMedian.year) == 0:
-            res.append(TimeParameter(START_YEAR,1,1,0))
+            res.append(TimeParameter(START_YEAR, 1, 1, 0))
             continue
         if currentMedian.count > median:
-            duration = int(28*currentMedian.count/median)
+            duration = int(28 * currentMedian.count / median)
             res.append(TimeParameter(currentMedian.year, currentMedian.month, 1, duration))
         else:
-            duration = int(28*median/currentMedian.count)
+            duration = int(28 * median / currentMedian.count)
             res.append(TimeParameter(currentMedian.year, currentMedian.month, 1, duration))
     return res
-        
+
 
 def findTimeParameters(factors):
-    
     medianFirstMonth, medianLastMonth, median = computeTimeMedians(factors)
     timeParams = getTimeParamsWithMedian(factors, medianFirstMonth, medianLastMonth, median)
 
     return timeParams
 
-    
+
 def findTimeParams(input_loan_list, time_bucket_df):
     time_list = [x for x in time_bucket_df.iloc[0].index.tolist()[1:]]
     factors = []
@@ -103,7 +103,5 @@ def findTimeParams(input_loan_list, time_bucket_df):
             year = START_YEAR + month / 12
             temp_factors.append(MonthYearCount(month % 12 + 1, int(year), count))
         factors.append(temp_factors)
-    
-    return findTimeParameters(factors)
-
 
+    return findTimeParameters(factors)