Skip to content

Commit

Permalink
Cleanup run scripts (#10)
Browse files Browse the repository at this point in the history
* Add run scripts

* Remove wandb flag in run_scripts

* Add missing python command

* Remove SLURM

* Fix array name

* Fix wrong embeddings in clustering
  • Loading branch information
dobraczka authored Jul 18, 2024
1 parent ea3c0bf commit 5e35773
Show file tree
Hide file tree
Showing 16 changed files with 858 additions and 1 deletion.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,5 +86,7 @@ For example if you used micromamba for installation:
```bash
micromamba run -n klinker-conda python experiment.py movie-graph-benchmark-dataset --graph-pair "tmdb-tvdb" relational-token-blocker
```

This would be similar to the steps described in the above usage section.

In order to precisely reproduce the results from the paper we provide (adapted) run scripts from our SLURM batch scripts in the `run_scripts` folder.
We recommend running `git checkout paper` to check out the tagged commit on which the experiments were run, since future development does not aim to be backwards compatible with this state.
61 changes: 61 additions & 0 deletions run_scripts/non_relational/run_deepblocker_autoencoder.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/bash
# Launches one DeepBlocker experiment with the autoencoder encoder on an
# OpenEA dataset.
#
# Usage: run_deepblocker_autoencoder.sh [INDEX]
#   INDEX  zero-based position in the generated configuration list
#          (defaults to 0).
#          0-15  -> sentence-transformer inner encoder
#          16-31 -> SIF inner encoder with fasttext embeddings
#
# The selected argument string is printed and then handed to experiment.py
# inside the klinker-conda micromamba environment.

# Fail on unset variables so a misspelled name cannot silently expand to an
# empty string and drop flags from the command line.
set -u

small_nneighbors=500
large_nneighbors=1000
iebsize="512"
hidden_dim="196"
learning_rate="0.004542"

myargs=(
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 15K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 15K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 15K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 15K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 15K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 15K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 15K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 15K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 100K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 100K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 100K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 100K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 100K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 100K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 100K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 100K --version V2 deepblocker"
)

# Attach the neighbour count: 15K datasets use the small value, everything
# else (the 100K datasets) uses the large value plus the faisshnsw block
# builder.
nnargs=()
for base in "${myargs[@]}"; do
  if [[ $base =~ .*15K.* ]]; then
    nnargs+=("$base --n-neighbors $small_nneighbors")
  else
    nnargs+=("$base --n-neighbors $large_nneighbors --block-builder-kwargs faisshnsw")
  fi
done

sifembeddings="fasttext"
embeddings="gtr-t5-base"
multi_embeddings="LaBSE"
st_other_args="--encoder autoencoder --inner-encoder sentencetransformertokenized --inner-encoder-batch-size $iebsize --batch-size 512 --hidden-dimension $hidden_dim --learning-rate $learning_rate --force True"
sif_other_args="--encoder autoencoder --inner-encoder sifembeddingtokenized --hidden-dimension $hidden_dim --learning-rate $learning_rate --force True"

# Sentence-transformer configurations: D_W / D_Y pairs get gtr-t5-base, the
# remaining pairs (EN_DE, EN_FR) get LaBSE.
args=()
for base in "${nnargs[@]}"; do
  if [[ $base =~ .*D_Y.* ]] || [[ $base =~ .*D_W.* ]]; then
    args+=("$base $st_other_args --embeddings $embeddings")
  else
    args+=("$base $st_other_args --embeddings $multi_embeddings")
  fi
done

# SIF configurations. These must use sif_other_args; the previous revision
# expanded an undefined variable here, which dropped the encoder flags.
for base in "${nnargs[@]}"; do
  args+=("$base $sif_other_args --embeddings $sifembeddings")
done

# Validate the requested index instead of silently running with no arguments.
idx=${1:-0}
if [[ ! $idx =~ ^[0-9]+$ ]] || (( 10#$idx >= ${#args[@]} )); then
  printf 'Usage: %s [INDEX]  (0 <= INDEX < %d)\n' "${0##*/}" "${#args[@]}" >&2
  exit 2
fi
idx=$((10#$idx))

curr_param=${args[$idx]}
echo "$curr_param"

# Split the configuration string into individual CLI words without exposing
# it to pathname expansion.
read -r -a cli_words <<< "$curr_param"
micromamba run -n klinker-conda -r y python experiment.py "${cli_words[@]}"
65 changes: 65 additions & 0 deletions run_scripts/non_relational/run_deepblocker_ctt.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash
# Launches one DeepBlocker experiment with the crosstupletraining encoder on
# an OpenEA dataset.
#
# Usage: run_deepblocker_ctt.sh [INDEX]
#   INDEX  zero-based position in the generated configuration list
#          (defaults to 0).
#          0-15  -> sentence-transformer inner encoder
#          16-31 -> SIF inner encoder with fasttext embeddings
#
# The selected argument string is printed and then handed to experiment.py
# inside the klinker-conda micromamba environment.

# Fail on unset variables so a misspelled name cannot silently expand to an
# empty string and drop flags from the command line.
set -u

small_nneighbors=500
large_nneighbors=1000
iebsize="512"
hidden_dim="384"
learning_rate="0.0030405"
max_perturbation="0.408395"
pos_to_neg_ratio="1.55515"
synth_tuples_per_tuple="5"

myargs=(
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 15K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 15K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 15K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 15K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 15K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 15K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 15K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 15K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 100K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 100K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 100K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 100K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 100K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 100K --version V2 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 100K --version V1 deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 100K --version V2 deepblocker"
)

# Attach the neighbour count: 15K datasets use the small value, everything
# else (the 100K datasets) uses the large value plus the faisshnsw block
# builder.
nnargs=()
for base in "${myargs[@]}"; do
  if [[ $base =~ .*15K.* ]]; then
    nnargs+=("$base --n-neighbors $small_nneighbors")
  else
    nnargs+=("$base --n-neighbors $large_nneighbors --block-builder-kwargs faisshnsw")
  fi
done

embeddings="gtr-t5-base"
multi_embeddings="LaBSE"
# NOTE(review): this string always ends with --block-builder-kwargs faisshnsw,
# so the 100K sentence-transformer entries carry that flag twice. Kept as is;
# confirm that the duplicate is harmless for the CLI.
st_other_args="--encoder crosstupletraining --inner-encoder sentencetransformertokenized --inner-encoder-batch-size $iebsize --batch-size 512 --hidden-dimension $hidden_dim --learning-rate $learning_rate --force True --max-perturbation=$max_perturbation --pos-to-neg-ratio=$pos_to_neg_ratio --synth-tuples-per-tuple=$synth_tuples_per_tuple --block-builder-kwargs faisshnsw"

# Sentence-transformer configurations: D_W / D_Y pairs get gtr-t5-base, the
# remaining pairs (EN_DE, EN_FR) get LaBSE.
args=()
for base in "${nnargs[@]}"; do
  if [[ $base =~ .*D_Y.* ]] || [[ $base =~ .*D_W.* ]]; then
    args+=("$base $st_other_args --embeddings $embeddings")
  else
    args+=("$base $st_other_args --embeddings $multi_embeddings")
  fi
done

sifembeddings="fasttext"
sif_other_args="--encoder crosstupletraining --inner-encoder sifembeddingtokenized --batch-size 512 --hidden-dimension $hidden_dim --learning-rate $learning_rate --force True --max-perturbation=$max_perturbation --pos-to-neg-ratio=$pos_to_neg_ratio --synth-tuples-per-tuple=$synth_tuples_per_tuple"

# SIF configurations. These must use sif_other_args; the previous revision
# expanded an undefined variable here, which dropped the encoder flags.
for base in "${nnargs[@]}"; do
  args+=("$base $sif_other_args --embeddings $sifembeddings")
done

# Validate the requested index instead of silently running with no arguments.
idx=${1:-0}
if [[ ! $idx =~ ^[0-9]+$ ]] || (( 10#$idx >= ${#args[@]} )); then
  printf 'Usage: %s [INDEX]  (0 <= INDEX < %d)\n' "${0##*/}" "${#args[@]}" >&2
  exit 2
fi
idx=$((10#$idx))

curr_param=${args[$idx]}
echo "$curr_param"

# Split the configuration string into individual CLI words without exposing
# it to pathname expansion.
read -r -a cli_words <<< "$curr_param"
micromamba run -n klinker-conda -r y python experiment.py "${cli_words[@]}"
67 changes: 67 additions & 0 deletions run_scripts/non_relational/run_deepblocker_hyb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/bin/bash
# DeepBlocker runs with the hybrid encoder on OpenEA datasets.
# Builds 32 argument strings (16 with the sentence-transformer inner encoder
# followed by 16 with the SIF inner encoder), picks the one at index $1,
# prints it and passes it to experiment.py via micromamba.

# Neighbour counts per dataset size.
small_nneighbors=500
large_nneighbors=1000

# Encoder / training settings.
iebsize="512"
# hidden_dim="384"
hidden_dim="96"
learning_rate="0.0030405"
max_perturbation="0.408395"
pos_to_neg_ratio="1.55515"
synth_tuples_per_tuple="5"

# Assigned but not referenced anywhere else in this script.
reduce_dim_to="192"
reduce_sample_perc=0.3

# Dataset grid: sizes outermost, then graph pairs, then versions.
myargs=()
for size in 15K 100K
do
    for pair in D_W D_Y EN_DE EN_FR
    do
        for version in V1 V2
        do
            myargs+=("--random-seed 42 open-ea-dataset --graph-pair $pair --size $size --version $version deepblocker")
        done
    done
done

# Append the neighbour settings; anything that is not 15K also gets the
# faisshnsw block builder.
nnargs=()
for base in "${myargs[@]}"
do
    case $base in
        *15K*)
            nnargs+=("$base --n-neighbors $small_nneighbors")
            ;;
        *)
            nnargs+=("$base --n-neighbors $large_nneighbors --block-builder-kwargs faisshnsw")
            ;;
    esac
done

sifembeddings="fasttext"
embeddings="gtr-t5-base"
multi_embeddings="LaBSE"
other_args="--encoder hybrid --batch-size 512 --hidden-dimension $hidden_dim --learning-rate $learning_rate --force True --max-perturbation=$max_perturbation --pos-to-neg-ratio=$pos_to_neg_ratio --synth-tuples-per-tuple=$synth_tuples_per_tuple"

# Sentence-transformer block: D_Y / D_W pairs use gtr-t5-base, the rest LaBSE.
args=()
for base in "${nnargs[@]}"
do
    case $base in
        *D_Y* | *D_W*)
            chosen_embeddings=$embeddings
            ;;
        *)
            chosen_embeddings=$multi_embeddings
            ;;
    esac
    args+=("$base $other_args --embeddings $chosen_embeddings --inner-encoder sentencetransformertokenized --inner-encoder-batch-size $iebsize")
done

# SIF block, appended after all sentence-transformer entries.
for base in "${nnargs[@]}"
do
    args+=("$base $other_args --inner-encoder sifembeddingtokenized --embeddings $sifembeddings")
done

# Expansions below are left unquoted on purpose: the argument string has to
# be word-split into separate CLI arguments.
selected=${args[$1]}
curr_param=$(echo $selected)
echo $curr_param

micromamba run -n klinker-conda -r y python experiment.py $curr_param
55 changes: 55 additions & 0 deletions run_scripts/non_relational/run_only_embeddings.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash
# Launches one only-embeddings-blocker experiment on an OpenEA dataset.
#
# Usage: run_only_embeddings.sh [INDEX]
#   INDEX  zero-based position in the generated configuration list
#          (defaults to 0).
#          0-15  -> sentencetransformertokenized encoder
#          16-31 -> sifembeddingtokenized encoder with fasttext embeddings
#
# The selected argument string is printed and then handed to experiment.py
# inside the klinker-conda micromamba environment.

# Fail on unset variables so a misspelled or forgotten name cannot silently
# expand to an empty string.
set -u

small_nneighbors=500
large_nneighbors=1000
# Previously this variable was used below but never assigned, which left
# --inner-encoder-batch-size without a value.
# NOTE(review): 512 is the value the DeepBlocker run scripts in this
# directory assign to iebsize - confirm it is also what these runs used.
iebsize="512"

myargs=(
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 15K --version V1 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 15K --version V2 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 15K --version V1 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 15K --version V2 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 15K --version V1 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 15K --version V2 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 15K --version V1 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 15K --version V2 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 100K --version V1 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 100K --version V2 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 100K --version V1 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 100K --version V2 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 100K --version V1 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 100K --version V2 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 100K --version V1 only-embeddings-blocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 100K --version V2 only-embeddings-blocker"
)

# Attach the neighbour count: 15K datasets use the small value, everything
# else (the 100K datasets) uses the large value plus the faisshnsw block
# builder.
nnargs=()
for base in "${myargs[@]}"; do
  if [[ $base =~ .*15K.* ]]; then
    nnargs+=("$base --n-neighbors $small_nneighbors")
  else
    nnargs+=("$base --n-neighbors $large_nneighbors --block-builder-kwargs faisshnsw")
  fi
done

sifembeddings="fasttext"
embeddings="gtr-t5-base"
multi_embeddings="LaBSE"

# Sentence-transformer configurations: D_W / D_Y pairs get gtr-t5-base, the
# remaining pairs (EN_DE, EN_FR) get LaBSE.
args=()
for base in "${nnargs[@]}"; do
  if [[ $base =~ .*D_Y.* ]] || [[ $base =~ .*D_W.* ]]; then
    args+=("$base --embeddings $embeddings --encoder sentencetransformertokenized --inner-encoder-batch-size $iebsize")
  else
    args+=("$base --embeddings $multi_embeddings --encoder sentencetransformertokenized --inner-encoder-batch-size $iebsize")
  fi
done

# SIF configurations, appended after all sentence-transformer entries.
for base in "${nnargs[@]}"; do
  args+=("$base --encoder sifembeddingtokenized --embeddings $sifembeddings")
done

# Validate the requested index instead of silently running with no arguments.
idx=${1:-0}
if [[ ! $idx =~ ^[0-9]+$ ]] || (( 10#$idx >= ${#args[@]} )); then
  printf 'Usage: %s [INDEX]  (0 <= INDEX < %d)\n' "${0##*/}" "${#args[@]}" >&2
  exit 2
fi
idx=$((10#$idx))

curr_param=${args[$idx]}
echo "$curr_param"

# Split the configuration string into individual CLI words without exposing
# it to pathname expansion.
read -r -a cli_words <<< "$curr_param"
micromamba run -n klinker-conda -r y python experiment.py "${cli_words[@]}"
24 changes: 24 additions & 0 deletions run_scripts/non_relational/run_token.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Token-blocker runs on OpenEA datasets.
# Builds one argument string per (size, graph pair, version) combination,
# picks the one at index $1, prints it and passes it to experiment.py via
# micromamba.

# Dataset grid: sizes outermost, then graph pairs, then versions.
args=()
for size in 15K 100K
do
    for pair in D_W D_Y EN_DE EN_FR
    do
        for version in V1 V2
        do
            args+=("--random-seed 42 open-ea-dataset --graph-pair $pair --size $size --version $version token-blocker")
        done
    done
done

# Expansions below are left unquoted on purpose: the argument string has to
# be word-split into separate CLI arguments.
selected=${args[$1]}
curr_param=$(echo $selected)
echo $curr_param

micromamba run -n klinker-conda -r y python experiment.py $curr_param
61 changes: 61 additions & 0 deletions run_scripts/relational/run_deepblocker_autoencoder.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/bash
# Launches one relational-deepblocker experiment with the autoencoder encoder
# on an OpenEA dataset.
#
# Usage: run_deepblocker_autoencoder.sh [INDEX]
#   INDEX  zero-based position in the generated configuration list
#          (defaults to 0).
#          0-15  -> sentence-transformer inner encoder
#          16-31 -> SIF inner encoder
#
# The selected argument string is printed and then handed to experiment.py
# inside the klinker-conda micromamba environment.
#
# Fixes relative to the previous revision, in which the selected
# configuration never contained the inner-encoder or neighbour flags:
#   * st_args is now actually used (an undefined st_other_args was expanded).
#   * The neighbour flags are appended to the entries in args; before they
#     were collected in a separate array that was never read for selection.
#   * The SIF configurations are appended to args and built from the plain
#     dataset list, so they become selectable and carry each flag only once.

# Fail on unset variables so a misspelled name cannot silently expand to an
# empty string and drop flags from the command line.
set -u

small_nneighbors=250
large_nneighbors=500
iebsize="512"
hidden_dim="196"
learning_rate="0.004542"
# NOTE(review): these two were referenced but never assigned in this script.
# The values are copied from run_scripts/non_relational/run_deepblocker_hyb.sh,
# the only script in this commit that assigns them - confirm against the
# original batch scripts.
reduce_dim_to="192"
reduce_sample_perc="0.3"

myargs=(
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 15K --version V1 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 15K --version V2 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 15K --version V1 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 15K --version V2 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 15K --version V1 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 15K --version V2 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 15K --version V1 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 15K --version V2 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 100K --version V1 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_W --size 100K --version V2 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 100K --version V1 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair D_Y --size 100K --version V2 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 100K --version V1 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_DE --size 100K --version V2 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 100K --version V1 relational-deepblocker"
  "--random-seed 42 open-ea-dataset --graph-pair EN_FR --size 100K --version V2 relational-deepblocker"
)

other_args="--encoder autoencoder --batch-size 512 --hidden-dimension $hidden_dim --learning-rate $learning_rate --force True"
st_args="--inner-encoder sentencetransformertokenized --inner-encoder-batch-size $iebsize"
sif_other_args="--inner-encoder sifembeddingtokenized"
embeddings="gtr-t5-base"
multi_embeddings="LaBSE"

# Sentence-transformer configurations.
args=()
for base in "${myargs[@]}"; do
  # D_W / D_Y pairs get gtr-t5-base, the remaining pairs get LaBSE.
  if [[ $base =~ .*D_Y.* ]] || [[ $base =~ .*D_W.* ]]; then
    st_embeddings=$embeddings
  else
    st_embeddings=$multi_embeddings
  fi
  # 15K datasets use the small neighbour counts; the others use the large
  # counts, the faisshnsw block builder and dimensionality reduction.
  if [[ $base =~ .*15K.* ]]; then
    nn_flags="--n-neighbors $small_nneighbors --rel-n-neighbors $small_nneighbors"
  else
    nn_flags="--n-neighbors $large_nneighbors --rel-n-neighbors $large_nneighbors --block-builder-kwargs faisshnsw --reduce-dim-to $reduce_dim_to --reduce-sample-perc $reduce_sample_perc"
  fi
  args+=("$base $other_args $st_args --embeddings $st_embeddings $nn_flags")
done

# SIF configurations, appended after all sentence-transformer entries.
for base in "${myargs[@]}"; do
  if [[ $base =~ .*15K.* ]]; then
    args+=("$base $other_args $sif_other_args --embeddings fasttext --n-neighbors $small_nneighbors --rel-n-neighbors $small_nneighbors")
  else
    # NOTE(review): the previous revision also used the *small* neighbour
    # count for the non-15K SIF entries; kept unchanged - confirm it is
    # intentional.
    args+=("$base $other_args $sif_other_args --embeddings 100wiki.en.bin --n-neighbors $small_nneighbors --rel-n-neighbors $small_nneighbors --block-builder-kwargs faisshnsw")
  fi
done

# Validate the requested index instead of silently running with no arguments.
idx=${1:-0}
if [[ ! $idx =~ ^[0-9]+$ ]] || (( 10#$idx >= ${#args[@]} )); then
  printf 'Usage: %s [INDEX]  (0 <= INDEX < %d)\n' "${0##*/}" "${#args[@]}" >&2
  exit 2
fi
idx=$((10#$idx))

curr_param=${args[$idx]}
echo "$curr_param"

# Split the configuration string into individual CLI words without exposing
# it to pathname expansion.
read -r -a cli_words <<< "$curr_param"
micromamba run -n klinker-conda -r y python experiment.py "${cli_words[@]}"
Loading

0 comments on commit 5e35773

Please sign in to comment.