Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the training scripts to incorporate the latest versions of Transformers and other dependencies. #84

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion manga_ocr_dev/data/process_manga109s.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def export_crops():
data["split"] = "train"
data.loc[data.sample(len(data)).iloc[:n_test].index, "split"] = "test"

data["crop_path"] = str(crops_root) + "\\" + data.id + ".png"
data["crop_path"] = str(crops_root) + "/" + data.id + ".png"
kha-white marked this conversation as resolved.
Show resolved Hide resolved

data.page_path = data.page_path.apply(lambda x: "/".join(Path(x).parts[-4:]))
data.crop_path = data.crop_path.apply(lambda x: "/".join(Path(x).parts[-2:]))
Expand Down
8 changes: 4 additions & 4 deletions manga_ocr_dev/training/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def read_image(processor, path, transform=None):

img = transform(image=img)["image"]

pixel_values = processor(img, return_tensors="pt").pixel_values
pixel_values = processor.feature_extractor(img, return_tensors="pt").pixel_values
kha-white marked this conversation as resolved.
Show resolved Hide resolved
return pixel_values.squeeze()

@staticmethod
Expand All @@ -132,7 +132,7 @@ def get_transforms():
A.Sharpen(p=0.2),
A.RandomBrightnessContrast(p=0.5),
A.GaussNoise((50, 200), p=0.3),
A.ImageCompression(0, 30, p=0.1),
A.ImageCompression(1, 30, p=0.1),
kha-white marked this conversation as resolved.
Show resolved Hide resolved
A.ToGray(always_apply=True),
]
)
Expand All @@ -149,11 +149,11 @@ def get_transforms():
],
p=0.1,
),
A.Blur((4, 9), p=0.5),
A.Blur((3, 9), p=0.5),
kha-white marked this conversation as resolved.
Show resolved Hide resolved
A.Sharpen(p=0.5),
A.RandomBrightnessContrast(0.8, 0.8, p=1),
A.GaussNoise((1000, 10000), p=0.3),
A.ImageCompression(0, 10, p=0.5),
A.ImageCompression(1, 10, p=0.5),
kha-white marked this conversation as resolved.
Show resolved Hide resolved
A.ToGray(always_apply=True),
]
)
Expand Down
9 changes: 4 additions & 5 deletions manga_ocr_dev/training/get_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@
AutoConfig,
AutoModelForCausalLM,
AutoModel,
TrOCRProcessor,
VisionEncoderDecoderModel,
AutoFeatureExtractor,
AutoImageProcessor,
kha-white marked this conversation as resolved.
Show resolved Hide resolved
AutoTokenizer,
VisionEncoderDecoderConfig,
)


class TrOCRProcessorCustom(TrOCRProcessor):
class TrOCRProcessorCustom:
kha-white marked this conversation as resolved.
Show resolved Hide resolved
"""The only point of this class is to bypass type checks of base class."""

def __init__(self, feature_extractor, tokenizer):
Expand All @@ -20,7 +19,7 @@ def __init__(self, feature_extractor, tokenizer):


def get_processor(encoder_name, decoder_name):
feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_name)
feature_extractor = AutoImageProcessor.from_pretrained(encoder_name, use_fast=True)
kha-white marked this conversation as resolved.
Show resolved Hide resolved
tokenizer = AutoTokenizer.from_pretrained(decoder_name)
processor = TrOCRProcessorCustom(feature_extractor, tokenizer)
return processor
Expand Down Expand Up @@ -48,7 +47,7 @@ def get_model(encoder_name, decoder_name, max_length, num_decoder_layers=None):

decoder_config.num_hidden_layers = num_decoder_layers

config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably an existing bug?

Otherwise, the saved config file and the model will not match: the decoder config will list more layers than the model actually has. When the model is loaded for inference, those extra layers will be randomly initialized.

config.tie_word_embeddings = False
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder, config=config)

Expand Down
8 changes: 4 additions & 4 deletions manga_ocr_dev/training/metrics.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import numpy as np
from datasets import load_metric
import evaluate
kha-white marked this conversation as resolved.
Show resolved Hide resolved


class Metrics:
def __init__(self, processor):
self.cer_metric = load_metric("cer")
self.cer_metric = evaluate.load("cer")
kha-white marked this conversation as resolved.
Show resolved Hide resolved
self.processor = processor

def compute_metrics(self, pred):
label_ids = pred.label_ids
pred_ids = pred.predictions
print(label_ids.shape, pred_ids.shape)

pred_str = self.processor.batch_decode(pred_ids, skip_special_tokens=True)
pred_str = self.processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
label_ids[label_ids == -100] = self.processor.tokenizer.pad_token_id
label_str = self.processor.batch_decode(label_ids, skip_special_tokens=True)
label_str = self.processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
kha-white marked this conversation as resolved.
Show resolved Hide resolved

pred_str = np.array(["".join(text.split()) for text in pred_str])
label_str = np.array(["".join(text.split()) for text in label_str])
Expand Down
10 changes: 4 additions & 6 deletions manga_ocr_dev/training/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,13 @@
def run(
run_name="debug",
encoder_name="facebook/deit-tiny-patch16-224",
decoder_name="cl-tohoku/bert-base-japanese-char-v2",
decoder_name="tohoku-nlp/bert-base-japanese-char-v2",
kha-white marked this conversation as resolved.
Show resolved Hide resolved
max_len=300,
num_decoder_layers=2,
batch_size=64,
num_epochs=8,
fp16=True,
):
wandb.login()

model, processor = get_model(encoder_name, decoder_name, max_len, num_decoder_layers)

Expand All @@ -30,7 +29,7 @@ def run(

training_args = Seq2SeqTrainingArguments(
predict_with_generate=True,
evaluation_strategy="steps",
eval_strategy="steps",
kha-white marked this conversation as resolved.
Show resolved Hide resolved
save_strategy="steps",
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
Expand All @@ -39,6 +38,7 @@ def run(
dataloader_num_workers=16,
output_dir=TRAIN_ROOT,
logging_steps=10,
report_to="none",
kha-white marked this conversation as resolved.
Show resolved Hide resolved
save_steps=20000,
eval_steps=20000,
num_train_epochs=num_epochs,
Expand All @@ -48,7 +48,7 @@ def run(
# instantiate trainer
trainer = Seq2SeqTrainer(
model=model,
tokenizer=processor.feature_extractor,
processing_class=processor.feature_extractor,
kha-white marked this conversation as resolved.
Show resolved Hide resolved
args=training_args,
compute_metrics=metrics.compute_metrics,
train_dataset=train_dataset,
Expand All @@ -57,8 +57,6 @@ def run(
)
trainer.train()

wandb.finish()


if __name__ == "__main__":
fire.Fire(run)
Loading