Analysis of Pythia #34

Open · wants to merge 24 commits into main
5 changes: 5 additions & 0 deletions evaluate.sh
@@ -0,0 +1,5 @@
+export HF_HOME="/model/i-sugiura"
+PATH_TO_ANNOTATE_DIR="/model/kiyomaru/memorization-analysis/pythia/preprocess/annotate"
+PATH_TO_RESULT_DIR="/model/i-sugiura/memorization-analysis/pythia/result6.9B"
+MODEL_NAME_OR_PATH="EleutherAI/pythia-6.9b"
+python3 src/evaluate.py --data_dir $PATH_TO_ANNOTATE_DIR --output_dir $PATH_TO_RESULT_DIR --model_name_or_path $MODEL_NAME_OR_PATH --batch_size 12
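The script pins the Hugging Face cache via HF_HOME so the pythia-6.9b checkpoint lands on the large shared volume. For reference, a hedged equivalent one-off invocation, assuming it is run from the repository root (quoting added for safety):

HF_HOME="/model/i-sugiura" python3 src/evaluate.py \
    --data_dir "/model/kiyomaru/memorization-analysis/pythia/preprocess/annotate" \
    --output_dir "/model/i-sugiura/memorization-analysis/pythia/result6.9B" \
    --model_name_or_path "EleutherAI/pythia-6.9b" \
    --batch_size 12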
3 changes: 3 additions & 0 deletions plot.sh
@@ -0,0 +1,3 @@
+PATH_TO_RESULT_DIR="/model/i-sugiura/memorization-analysis/pythia/result6.9B"
+PATH_TO_PLOT_DIR="pythia/plot6.9B"
+python3 src/plot.py --data_dir $PATH_TO_RESULT_DIR --output_dir $PATH_TO_PLOT_DIR
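This PR also adds --min_frequency, --max_frequency, and --zmax to src/plot.py (see the diff below). A hedged sketch of the script extended with those filters; the filter values are illustrative, not from the PR:

PATH_TO_RESULT_DIR="/model/i-sugiura/memorization-analysis/pythia/result6.9B"
PATH_TO_PLOT_DIR="pythia/plot6.9B"
python3 src/plot.py --data_dir "$PATH_TO_RESULT_DIR" --output_dir "$PATH_TO_PLOT_DIR" \
    --min_frequency 1 --max_frequency 100 --zmax 0.5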
13 changes: 6 additions & 7 deletions src/elastic_search.py
@@ -7,7 +7,7 @@

 from elastic_transport import ConnectionTimeout
 from elasticsearch import Elasticsearch, helpers
-from utils import FOLDS, LOCAL_RANKS, load_examples
+from utils import load_examples

 logger = logging.getLogger(__name__)

@@ -46,8 +46,9 @@ def parse_args() -> argparse.Namespace:
     parser_index.add_argument(
         "--data_dir",
         type=str,
+        nargs="+",
         required=True,
-        help="The directory containing data files.",
+        help="The directories containing data files.",
     )
     parser_index.add_argument(
         "--num_workers",
@@ -202,10 +203,8 @@ def index(args: argparse.Namespace) -> None:
     create_index(args.host, args.index)

     paths = []
-    data_dir = Path(args.data_dir)
-    for fold in FOLDS:
-        for local_rank in LOCAL_RANKS:
-            paths.append(data_dir / f"used_data_{fold}" / f"used_data_{local_rank}.jsonl.gz")
+    for data_dir in args.data_dir:
+        paths.extend(list(Path(data_dir).glob("**/*.jsonl.gz")))

     worker_fn = partial(index_documents, args.host, args.index)
@@ -253,7 +252,7 @@ def main(args: argparse.Namespace) -> None:
     args = parse_args()

     logging.basicConfig(
-        level=logging.DEBUG if args.verbose else logging.INFO,
+        level=logging.INFO if args.verbose else logging.WARNING,
         format="%(asctime)s %(name)s:%(lineno)d: %(levelname)s: %(message)s",
     )
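This quiets the script by one level across the board: --verbose now maps to INFO rather than DEBUG, and the default becomes WARNING, so the frequent logger.info progress messages are hidden unless --verbose is given. A minimal, self-contained illustration (log messages are made up):

import logging

logging.basicConfig(level=logging.WARNING)  # the new default without --verbose
log = logging.getLogger("elastic_search")
log.info("indexing used_data_0.jsonl.gz")        # suppressed at WARNING
log.warning("retrying after ConnectionTimeout")  # still shown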

10 changes: 8 additions & 2 deletions src/evaluate.py
@@ -70,8 +70,14 @@ def main(args: argparse.Namespace) -> None:
     logger.info(f"Create output directory {args.output_dir}")
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
+    path_list = list(data_dir.glob("**/*.jsonl.gz"))
+    path_list = sorted(path_list)

-    for path in data_dir.glob("**/*.jsonl.gz"):
+    for path in path_list:
+        output_file = output_dir / path.relative_to(data_dir)
+        if output_file.exists():
+            logger.info(f"Skip {path} because {output_file} already exists.")
+            continue
         logger.info(f"Load examples from {path}.")
         examples = [example for example in load_examples(path)]

@@ -113,7 +119,7 @@ def main(args: argparse.Namespace) -> None:
                 example.metrics[f"bleu/{prefix_length}"] = bleu_

         logger.info("Save metrics.")
-        output_file = output_dir / path.relative_to(data_dir)
+        # output_file = output_dir / path.relative_to(data_dir)  (now computed at the top of the loop)
         output_file.parent.mkdir(parents=True, exist_ok=True)
         save_examples(examples, output_file)
         logger.info(f"Saved metrics to {output_file}.")
71 changes: 47 additions & 24 deletions src/plot.py
@@ -45,6 +45,25 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="Whether to print debug messages.",
     )
+    parser.add_argument(
+        "--min_frequency",
+        type=int,
+        default=0,
+        help="The minimum frequency of the examples to plot.",
+    )
+    parser.add_argument(
+        "--max_frequency",
+        type=int,
+        default=999_999_999_999,
+        help="The maximum frequency of the examples to plot.",
+    )
+    parser.add_argument(
+        "--zmax",
+        type=float,
+        default=0,
+        help="The maximum value of the heatmap.",
+    )

     return parser.parse_args()


@@ -54,6 +73,7 @@ def plot_verbatim_memorization_ratio(
     min_frequency: int = 0,
     max_frequency: int = 999_999_999_999,
     least_num_examples_per_grid: int = 1,
+    heatmap_zmax: float = 1,
 ) -> go.Figure:
     """Plot the verbatim memorization ratio.

@@ -105,10 +125,11 @@ def plot_verbatim_memorization_ratio(
             x=list(map(str, steps)),
             y=list(map(str, PREFIX_LENGTHS)),
             zmin=0.0,
+            zmax=np.nanmax((z_max, heatmap_zmax)),
         )
     )
     fig.update_layout(
-        title="Verbatim memorization ratio over training steps",
+        title=f"Verbatim memorization ratio over training steps (frequency: {min_frequency} - {max_frequency})",
         xaxis_title="Training steps",
         yaxis_title="Sequence length",
     )
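The `np.nanmax` over the pair keeps the color scale at least as large as the user's --zmax while ignoring a NaN data maximum (this assumes `z_max` is the data-derived maximum computed earlier in the function, which the hunk does not show). For instance:

import numpy as np

heatmap_zmax = 0.5                        # user-supplied --zmax
print(np.nanmax((np.nan, heatmap_zmax)))  # 0.5: a NaN data max is ignored
print(np.nanmax((0.8, heatmap_zmax)))     # 0.8: the data max wins when larger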
@@ -177,7 +198,7 @@ def plot_approximate_memorization_ratio(
         )
     )
     fig.update_layout(
-        title="Approximate memorization ratio over training steps",
+        title=f"Approximate memorization ratio over training steps (frequency: {min_frequency} - {max_frequency})",
         xaxis_title="Training steps",
         yaxis_title="Sequence length",
     )
@@ -202,35 +223,37 @@ def main(args: argparse.Namespace) -> None:
     fig = plot_verbatim_memorization_ratio(examples)
     fig.write_image(path)
     logger.info(f"Saved to {path}.")
-    for min_frequency, max_frequency in zip(FREQUENCY_BINS[:-1], FREQUENCY_BINS[1:]):
-        logger.info(f"Plot extractable with frequency in [{min_frequency}, {max_frequency}].")
-        path = output_dir / f"verbatim_memorization_ratio_{min_frequency}_{max_frequency}.png"
-        fig = plot_verbatim_memorization_ratio(
-            examples,
-            min_frequency=min_frequency,
-            max_frequency=max_frequency,
-            least_num_examples_per_grid=args.least_num_examples_per_grid,
-        )
-        fig.write_image(path)
-        logger.info(f"Saved to {path}.")
+    min_frequency = args.min_frequency
+    max_frequency = args.max_frequency
+
+    logger.info(f"Plot extractable with frequency in [{min_frequency}, {max_frequency}].")
+    path = output_dir / f"verbatim_memorization_ratio_{min_frequency}_{max_frequency}.png"
+    fig = plot_verbatim_memorization_ratio(
+        examples,
+        min_frequency=min_frequency,
+        max_frequency=max_frequency,
+        least_num_examples_per_grid=args.least_num_examples_per_grid,
+        heatmap_zmax=args.zmax,
+    )
+    fig.write_image(path)
+    logger.info(f"Saved to {path}.")

     logger.info("Plot approximate memorization ratio.")
     path = output_dir / "approximate_memorization_ratio.png"
     fig = plot_approximate_memorization_ratio(examples)
     fig.write_image(path)
     logger.info(f"Saved to {path}.")
-    for min_frequency, max_frequency in zip(FREQUENCY_BINS[:-1], FREQUENCY_BINS[1:]):
-        logger.info(f"Plot bleu with frequency in [{min_frequency}, {max_frequency}].")
-        path = output_dir / f"approximate_memorization_ratio_{min_frequency}_{max_frequency}.png"
-        fig = plot_approximate_memorization_ratio(
-            examples,
-            min_frequency=min_frequency,
-            max_frequency=max_frequency,
-            least_num_examples_per_grid=args.least_num_examples_per_grid,
-        )
-        fig.write_image(path)
-        logger.info(f"Saved to {path}.")
+    logger.info(f"Plot bleu with frequency in [{min_frequency}, {max_frequency}].")
+    path = output_dir / f"approximate_memorization_ratio_{min_frequency}_{max_frequency}.png"
+    fig = plot_approximate_memorization_ratio(
+        examples,
+        min_frequency=min_frequency,
+        max_frequency=max_frequency,
+        least_num_examples_per_grid=args.least_num_examples_per_grid,
+    )
+    fig.write_image(path)
+    logger.info(f"Saved to {path}.")


 if __name__ == "__main__":
     args = parse_args()
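Since the built-in sweep over FREQUENCY_BINS is gone, one run now produces a single frequency band. The old per-bin figures can still be reproduced by looping in the shell; the bin edges below are illustrative, not the removed FREQUENCY_BINS values:

# Hypothetical sweep over frequency bands using the new flags:
for bounds in "0 10" "10 100" "100 1000"; do
    set -- $bounds
    python3 src/plot.py --data_dir "$PATH_TO_RESULT_DIR" --output_dir "$PATH_TO_PLOT_DIR" \
        --min_frequency "$1" --max_frequency "$2"
done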
56 changes: 15 additions & 41 deletions src/preprocess.py
@@ -8,8 +8,6 @@
 from utils import (
     COMPLETION_END_INDEX,
     COMPLETION_START_INDEX,
-    FOLDS,
-    LOCAL_RANKS,
     Example,
     load_examples,
     save_examples,
@@ -40,6 +38,7 @@ def parse_args() -> argparse.Namespace:
     parser_extract.add_argument(
         "--data_dir",
         type=str,
+        nargs="+",
         required=True,
         help="The directory containing data files.",
     )
@@ -55,13 +54,6 @@ def parse_args() -> argparse.Namespace:
         default=5_000,
         help="The interval between two steps to sample examples.",
     )
-    parser_extract.add_argument(
-        "--folds",
-        nargs="+",
-        type=str,
-        required=False,
-        help="The folds to evaluate. If not specified, all folds will be evaluated.",
-    )
     parser_extract.add_argument(
         "--num_workers",
         type=int,
@@ -95,12 +87,6 @@ def parse_args() -> argparse.Namespace:
         default="memorization-analysis-dev",
         help="The name of the Elasticsearch index.",
     )
-    parser_annotate.add_argument(
-        "--model_name_or_path",
-        type=str,
-        default="llm-jp/llm-jp-1.3b-v1.0",
-        help="The model name or path for the language model.",
-    )
     parser_annotate.add_argument(
         "--num_workers",
         type=int,
@@ -180,35 +166,23 @@ def get_span_stats(


 def extract(args: argparse.Namespace) -> None:
-    logger.info(f"Load data from {args.data_dir}")
-    data_dir = Path(args.data_dir)
-
     logger.info(f"Create output directory {args.output_dir}")
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)

-    if args.folds is None:
-        folds = FOLDS
-    else:
-        folds = args.folds
-
-    data_files = []
-    for fold in folds:
-        for local_rank in LOCAL_RANKS:
-            data_files.append(data_dir / f"used_data_{fold}" / f"used_data_{local_rank}.jsonl.gz")
-
-    logger.info("Extract examples.")
-    worker_fn = partial(extract_examples, interval=args.interval)
-    with ProcessPoolExecutor(max_workers=args.num_workers) as executor:
-        for data_file, examples in zip(
-            data_files,
-            executor.map(worker_fn, data_files),
-        ):
-            logger.info("Save examples.")
-            output_file = output_dir / data_file.relative_to(data_dir)
-            output_file.parent.mkdir(parents=True, exist_ok=True)
-            save_examples(examples, output_file)
-            logger.info(f"Saved examples to {output_file}.")
+    for data_dir in map(Path, args.data_dir):
+        logger.info(f"Load data from {data_dir}")
+        paths = list(data_dir.glob("**/*.jsonl.gz"))
+
+        logger.info("Extract examples.")
+        worker_fn = partial(extract_examples, interval=args.interval)
+        with ProcessPoolExecutor(max_workers=args.num_workers) as executor:
+            for path, examples in zip(paths, executor.map(worker_fn, paths)):
+                logger.info("Save examples.")
+                output_file = output_dir / path.relative_to(data_dir.parent)
+                output_file.parent.mkdir(parents=True, exist_ok=True)
+                save_examples(examples, output_file)
+                logger.info(f"Saved examples to {output_file}.")


 def annotate(args: argparse.Namespace) -> None:
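Note the output layout: because each file path is made relative to `data_dir.parent`, the final component of every input directory is preserved under `output_dir`, so multiple --data_dir trees cannot collide. A small sketch with hypothetical paths:

from pathlib import Path

output_dir = Path("out")
data_dir = Path("/data/fold_a")                    # hypothetical input dir
path = Path("/data/fold_a/used_data_0.jsonl.gz")   # a file found by the glob
print(output_dir / path.relative_to(data_dir.parent))
# -> out/fold_a/used_data_0.jsonl.gz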
@@ -253,7 +227,7 @@ def main(args: argparse.Namespace) -> None:
     args = parse_args()

     logging.basicConfig(
-        level=logging.DEBUG if args.verbose else logging.INFO,
+        level=logging.INFO if args.verbose else logging.WARNING,
         format="%(asctime)s %(name)s:%(lineno)d: %(levelname)s: %(message)s",
     )
