-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
BART text summarization workload (#117)
* feat(txt-summarize): init bart-large-cnn workload to summarize text * Fix(conflict): Makefile * feat(workflow): text summarizer * feat(workflow): add text summarization to config * fix(bacalhau): remove hardcoded config from Bacalhau executor * fix(config): detection-node for step merge-node does not exist * chore(config): re-enable detection workflow * feat(test): summarizer container test for invalid files
- Loading branch information
1 parent
40e9046
commit b6cb278
Showing
10 changed files
with
2,029 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Container image for the BART text summarization workload.
FROM docker.io/huggingface/transformers-cpu:4.18.0

# bake model into container: the summarization script loads it from
# /models/bart-large-cnn instead of referring to it by hub name
RUN mkdir -p /models
RUN python3 -c 'from transformers import pipeline; pipeline("summarization", model="facebook/bart-large-cnn").save_pretrained("/models/bart-large-cnn")'

RUN mkdir -p /usr/local/bin
ENV PATH="/usr/local/bin:${PATH}"
COPY containers/scripts/run_program.sh /usr/local/bin/run_program.sh
# This will get overwritten by Amplify.
# NOTE: the comment must live on its own line. Dockerfile comments are only
# recognised at the start of a line; trailing text after the JSON array makes
# the instruction invalid exec form, so it is parsed as shell form instead.
ENTRYPOINT ["/usr/local/bin/run"]
COPY containers/summarization/run /usr/local/bin/run
COPY containers/summarization/bart-summarize.py /usr/local/bin/bart-summarize.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import os | ||
import sys | ||
import json | ||
import pathlib | ||
|
||
from transformers import pipeline | ||
|
||
|
||
# facebook/bart-large-cnn accepts at most 1024 input *tokens* (not characters).
# Capping the raw text at 1024 characters is only a cheap first guard; the
# tokenizer-level truncation requested in summarize() enforces the real limit.
MAX_INPUT_CHARS = 1024


def read_input_text(input_path):
    """Return the text of *input_path*, capped at MAX_INPUT_CHARS characters.

    The file is decoded as UTF-8 regardless of the container locale, and
    undecodable bytes are replaced instead of raising, so non-text files
    still produce a string.

    Raises:
        Exception: if the file contains no text at all.
    """
    with open(input_path, mode='r', encoding='utf-8', errors='replace') as f:
        text = f.read()
    if not text:
        raise Exception("File {} appears to be empty".format(input_path))
    return text[:MAX_INPUT_CHARS]


def summarize(text):
    """Run the baked-in bart-large-cnn model on *text*.

    Returns the raw pipeline result; the caller validates its shape.
    """
    summarizer = pipeline("summarization", model="/models/bart-large-cnn")
    # truncation=True makes the tokenizer cut the input at the model's maximum
    # length, which the character cap in read_input_text cannot guarantee.
    return summarizer(
        text, max_length=130, min_length=30, do_sample=False, truncation=True
    )


def summary_to_json(summary_list):
    """Serialize the first summary in *summary_list* to a compact JSON string.

    Raises:
        Exception: if the list is empty, or if its first element is not a
            dict carrying a 'summary_text' key. Failing loudly here avoids
            exiting successfully without having written any output.
    """
    if not summary_list:
        raise Exception("Generated summary is empty.")
    first = summary_list[0]
    if not (isinstance(first, dict) and 'summary_text' in first):
        raise Exception(
            "Generated summary has an unexpected format: {!r}".format(first)
        )
    return json.dumps(first, indent=None)


def output_path_for(input_path, output_dir):
    """Return '<output_dir>/<input file name>.json'."""
    return os.path.join(output_dir, pathlib.Path(input_path).name + ".json")


def main(argv):
    """Summarize argv[1] and write the JSON result into directory argv[2].

    The JSON summary is echoed to stdout and the output file path to stderr.
    """
    if len(argv) < 3:
        program = argv[0] if argv else "bart-summarize.py"
        raise SystemExit("usage: {} <input_file> <output_dir>".format(program))
    input_path, output_dir = argv[1], argv[2]

    # read input file
    text = read_input_text(input_path)

    # run inference
    json_object = summary_to_json(summarize(text))

    # save JSON summary
    print(json_object)

    output_file = output_path_for(input_path, output_dir)
    print(output_file, file=sys.stderr)

    with open(output_file, "w") as outfile:
        outfile.write(json_object)


if __name__ == "__main__":
    main(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/bash
# Container entry script: hands the summarization command to run_program.sh
# together with the input and output mount points.

# Single quotes keep ${input_file} and ${output_dir} literal in this script.
# NOTE(review): presumably run_program.sh substitutes them for each input
# file it processes — confirm against containers/scripts/run_program.sh.
readonly SUMMARIZE_CMD='python3 /usr/local/bin/bart-summarize.py ${input_file} ${output_dir}'

run_program.sh "${SUMMARIZE_CMD}" /inputs /outputs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#!/bin/bash
# Test driver for the summarization container image.

# Absolute path of the directory holding this script, independent of the
# caller's working directory.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Stop early if the directory could not be resolved: with an empty SCRIPT_DIR,
# paths built as "$SCRIPT_DIR/outputs" would point at /outputs on the host.
if [ -z "${SCRIPT_DIR}" ]; then
    echo "Failed to determine the script directory" >&2
    exit 1
fi
IMAGE=ghcr.io/bacalhau-project/amplify/summarization:latest

echo "${SCRIPT_DIR}"
|
||
# checkError aborts the test run when the command executed immediately before
# it finished with a non-zero exit status.
checkError() {
    # Capture $? first: any other command would overwrite it.
    local status=$?
    if (( status != 0 )); then
        echo "Failed to run test, there was an error"
        exit 1
    fi
}
|
||
# checkFileExists aborts the test run unless $1 names an existing regular file.
checkFileExists() {
    local path="$1"
    [[ -f "${path}" ]] && return 0
    echo "File ${path} does not exist"
    exit 1
}
|
||
# checkFileDoesNotExists aborts the test run when $1 names an existing
# regular file.
checkFileDoesNotExists() {
    local path="$1"
    [[ -f "${path}" ]] || return 0
    echo "File ${path} does exist"
    exit 1
}
|
||
# runSummarizer clears "$SCRIPT_DIR/outputs", then runs the image's `run`
# script with host directory $1 mounted at /inputs and the outputs directory
# mounted at /outputs. Its exit status is that of `docker run`.
runSummarizer() {
    # ${SCRIPT_DIR:?} aborts instead of expanding to "" and removing /outputs.
    rm -rf "${SCRIPT_DIR:?}/outputs"
    # No -t: allocating a TTY fails when the test runs without a terminal
    # (for example in CI). Paths are quoted so directories with spaces work.
    docker run -i --rm \
        -v "$1:/inputs" \
        -v "${SCRIPT_DIR}/outputs:/outputs" \
        --entrypoint "" "${IMAGE}" run
}

# main runs the summarization container against each test data set and checks
# which output files were produced.
main() {
    # Test blob
    runSummarizer "${SCRIPT_DIR}/../test/testdata/text_blob/somethoughts"
    checkError
    checkFileExists "${SCRIPT_DIR}/outputs/file.plain.json"

    # Test subdir
    runSummarizer "${SCRIPT_DIR}/../test/testdata/text_dir"
    checkError
    checkFileExists "${SCRIPT_DIR}/outputs/subdir/codfish.txt.json"
    checkFileExists "${SCRIPT_DIR}/outputs/looneytunes.plain.json"
    # Test empty file: no summary may be written for it
    checkFileDoesNotExists "${SCRIPT_DIR}/outputs/empty.plain.json"

    # Test non textual files: the run only has to finish successfully
    runSummarizer "${SCRIPT_DIR}/../test/testdata/images"
    checkError
}

main
Oops, something went wrong.