From 8bf0c7b437139f057775ad33f3a986349b87f8f8 Mon Sep 17 00:00:00 2001 From: HenningRose Date: Mon, 13 Jan 2025 18:44:12 +0100 Subject: [PATCH] Added explanatory Codebook --- .../example_generate_showers.ipynb | 100 +++++++++++- ...ple_tokenize_and_reconstruct_showers.ipynb | 144 +++++++++++++++++- 2 files changed, 234 insertions(+), 10 deletions(-) diff --git a/examples/example_notebooks/example_generate_showers.ipynb b/examples/example_notebooks/example_generate_showers.ipynb index fc490a3..118b34d 100644 --- a/examples/example_notebooks/example_generate_showers.ipynb +++ b/examples/example_notebooks/example_generate_showers.ipynb @@ -8,12 +8,16 @@ "source": [ "%load_ext autoreload\n", "%autoreload 2\n", + "import os\n", "import sys\n", + "from pathlib import Path\n", "\n", "import awkward as ak\n", "import numpy as np\n", + "import vector\n", + "from omegaconf import OmegaConf\n", "\n", - "sys.path.append(\"/beegfs/desy/user/rosehenn/gabbro\")" + "sys.path.append(\"/data/dust/user/rosehenn/gabbro\")" ] }, { @@ -36,8 +40,7 @@ "# this checkpoint is the checkpoint from a backbone training with the nex-token-prediction head\n", "# make sure you have downloaded the checkpoint in advance\n", "# if not, run the script `checkpoints/download_checkpoints.sh`\n", - "ckpt_path = \"/beegfs/desy/user/rosehenn/gabbro_output/full_resolution/runs/2024-11-21_13-49-55_max-wng060_TerminativeCirculation/checkpoints/epoch_032_loss_4.10881.ckpt\"\n", - "\n", + "ckpt_path = \"/data/dust/user/rosehenn/gabbro_output/full_resolution/runs/2024-11-21_13-49-55_max-wng060_TerminativeCirculation/checkpoints/epoch_032_loss_4.10881.ckpt\"\n", "gen_model = BackboneNextTokenPredictionLightning.load_from_checkpoint(ckpt_path)\n", "gen_model.eval()" ] @@ -46,7 +49,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Generating Jets" + "## Generating Showers" ] }, { @@ -55,13 +58,96 @@ "metadata": {}, "outputs": [], "source": [ - "# save_path = \"/beegfs/desy/user/birkjosc/testing/omnijet/generated_jets.parquet\"\n", - "generated_jets = gen_model.generate_n_jets_batched(\n", - " n_jets=2,\n", + "generated_showers = gen_model.generate_n_showers_batched(\n", + " n_showers=2,\n", " batch_size=2,\n", " # saveas=save_path, # use this option if you want to save the awkward array as a parquet file\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_showers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# --- Load the tokenizer model from checkpoint, and also get the feature_dict from the config ---\n", + "from gabbro.models.vqvae import VQVAELightning\n", + "\n", + "ckpt_path = \"/data/dust/user/rosehenn/gabbro_output/TokTrain/runs/2024-09-21_16-54-39_max-wng062_CerousLocknut/checkpoints/epoch_231_loss_0.17179.ckpt\"\n", + "\n", + "vqvae_model = VQVAELightning.load_from_checkpoint(ckpt_path)\n", + "vqvae_model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfg = OmegaConf.load(Path(ckpt_path).parent.parent / \"config.yaml\")\n", + "pp_dict = OmegaConf.to_container(cfg.data.dataset_kwargs_common.feature_dict)\n", + "print(\"\\npp_dict:\")\n", + "for item in pp_dict:\n", + " print(item, pp_dict[item])\n", + "\n", + "# get the cuts from the pp_dict (since this leads to particles being removed during\n", + "# preprocessing/tokenization), thus we also have to remove them from the original jets\n", + "# when we compare the tokenized+reconstructed particles to the original ones)\n", + "pp_dict_cuts = {\n", + " feat_name: {\n", + " criterion: pp_dict[feat_name].get(criterion)\n", + " for criterion in [\"larger_than\", \"smaller_than\"]\n", + " }\n", + " for feat_name in pp_dict\n", + "}\n", + "\n", + "print(\"\\npp_dict_cuts:\")\n", + "for item in pp_dict_cuts:\n", + " print(item, pp_dict_cuts[item])\n", + "\n", + "print(\"\\nModel:\")\n", + "print(vqvae_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reconstruct the generated tokens to physical features\n", + "\n", + "# note that if you want to reconstruct tokens from the generative model, you'll have\n", + "# to remove the start token from the tokenized array, and subtract 1 from the tokens\n", + "# (since we chose the convention to use 0 as the start token, so the tokens from the\n", + "# generative model are shifted by 1 compared to the ones from the VQ-VAE)\n", + "showers_reconstructed = vqvae_model.reconstruct_ak_tokens(\n", + " tokens_ak=generated_showers[:, 1:] - 1,\n", + " pp_dict=pp_dict,\n", + " batch_size=512,\n", + " pad_length=128,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "showers_reconstructed" + ] } ], "metadata": { diff --git a/examples/example_notebooks/example_tokenize_and_reconstruct_showers.ipynb b/examples/example_notebooks/example_tokenize_and_reconstruct_showers.ipynb index 396228f..52482bf 100644 --- a/examples/example_notebooks/example_tokenize_and_reconstruct_showers.ipynb +++ b/examples/example_notebooks/example_tokenize_and_reconstruct_showers.ipynb @@ -8,13 +8,23 @@ "source": [ "%load_ext autoreload\n", "%autoreload 2\n", - "\n", + "import os\n", "import sys\n", + "from pathlib import Path\n", "\n", "import awkward as ak\n", "import numpy as np\n", + "import vector\n", + "from omegaconf import OmegaConf\n", "\n", - "sys.path.append(\"/beegfs/desy/user/rosehenn/gabbro\")" + "sys.path.append(\"/data/dust/user/rosehenn/gabbro\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokenization with the VQ-VAE" ] }, { @@ -24,8 +34,136 @@ "outputs": [], "source": [ "# --- Load the tokenizer model from checkpoint, and also get the feature_dict from the config ---\n", + "from gabbro.models.vqvae import VQVAELightning\n", + "\n", + "ckpt_path = \"/data/dust/user/rosehenn/gabbro_output/TokTrain/runs/2024-09-21_16-54-39_max-wng062_CerousLocknut/checkpoints/epoch_231_loss_0.17179.ckpt\"\n", + "\n", + "vqvae_model = VQVAELightning.load_from_checkpoint(ckpt_path)\n", + "vqvae_model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cfg = OmegaConf.load(Path(ckpt_path).parent.parent / \"config.yaml\")\n", + "pp_dict = OmegaConf.to_container(cfg.data.dataset_kwargs_common.feature_dict)\n", + "print(\"\\npp_dict:\")\n", + "for item in pp_dict:\n", + " print(item, pp_dict[item])\n", + "\n", + "# get the cuts from the pp_dict (since this leads to particles being removed during\n", + "# preprocessing/tokenization), thus we also have to remove them from the original jets\n", + "# when we compare the tokenized+reconstructed particles to the original ones)\n", + "pp_dict_cuts = {\n", + " feat_name: {\n", + " criterion: pp_dict[feat_name].get(criterion)\n", + " for criterion in [\"larger_than\", \"smaller_than\"]\n", + " }\n", + " for feat_name in pp_dict\n", + "}\n", + "\n", + "print(\"\\npp_dict_cuts:\")\n", + "for item in pp_dict_cuts:\n", + " print(item, pp_dict_cuts[item])\n", + "\n", + "print(\"\\nModel:\")\n", + "print(vqvae_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load shower file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gabbro.data.loading import read_shower_file\n", + "\n", + "filename_in = \"/data/dust/user/rosehenn/gabbro/notebooks/array_real.parquet\"\n", + "showers = ak.from_parquet(filename_in)\n", + "showers = showers[:5000]\n", + "# part_features_ak = ak_select_and_preprocess(data_showers, pp_dict_cuts)[:, :128]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokenize and reconstruct showers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# tokenization and reconstruction\n", + "\n", + "part_features_ak_tokenized = vqvae_model.tokenize_ak_array(\n", + " ak_arr=showers,\n", + " pp_dict=pp_dict,\n", + " batch_size=4,\n", + " pad_length=1700,\n", + ")\n", + "# note that if you want to reconstruct tokens from the generative model, you'll have\n", + "# to remove the start token from the tokenized array, and subtract 1 from the tokens\n", + "# (since we chose the convention to use 0 as the start token, so the tokens from the\n", + "# generative model are shifted by 1 compared to the ones from the VQ-VAE)\n", + "part_features_ak_reco = vqvae_model.reconstruct_ak_tokens(\n", + " tokens_ak=part_features_ak_tokenized,\n", + " pp_dict=pp_dict,\n", + " batch_size=4,\n", + " pad_length=1700,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# inspect the tokenized and reconstructed Showers\n", + "print(\"First 5 tokenized Showers:\")\n", + "for i in range(5):\n", + " print(part_features_ak_tokenized[i])\n", + "\n", + "print(\"\\nFirst 5 reconstructed Showers:\")\n", + "for i in range(5):\n", + " print(part_features_ak_reco[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plot the reconstructed showers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gabbro.plotting.feature_plotting import plot_paper_plots\n", "\n", - "from gabbro.models.vqvae import VQVAELightning" + "fig = plot_paper_plots(\n", + " feature_sets=[showers[: len(part_features_ak_reco)], part_features_ak_reco],\n", + " labels=[\"Geant4\", \"Tokenized\"], # \"OmniJet-$\\\\alpha_C$\" \"BIB-AE\", \"L2L Flows\"\n", + " colors=[\"lightgrey\", \"#1a80bb\", \"#ea801c\", \"#4CAF50\", \"#1a80bb\"],\n", + ")\n", + "fig.show()" ] } ],