Skip to content

Commit

Permalink
BART text summarization workload (#117)
Browse files Browse the repository at this point in the history
* feat(txt-summarize): init bart-large-cnn workload to summarize text

* Fix(conflict): Makefile

* feat(workflow): text summarizer

* feat(workflow): add text summarization to config

* fix(bacalhau): remove hardcoded config from Bacalhau executor

* fix(config): detection-node for step merge-node does not exist

* chore(config): re-enable detection workflow

* feat(test): summarizer container test for invalid files
  • Loading branch information
enricorotundo authored May 2, 2023
1 parent 40e9046 commit b6cb278
Show file tree
Hide file tree
Showing 10 changed files with 2,029 additions and 3 deletions.
35 changes: 32 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -411,18 +411,47 @@ push-merge-image:
.


################################################################################
# Target: *-summarization-image
################################################################################

# Image repository and version tag for the BART text-summarization workload.
SUMMARIZATION_IMAGE ?= ghcr.io/bacalhau-project/amplify/summarization
SUMMARIZATION_TAG ?= ${TAG}

## build-summarization-image: single-platform local build, tagged :latest only
## (the versioned multi-arch tag is produced by push-summarization-image).
.PHONY: build-summarization-image
build-summarization-image:
	docker build --progress=plain \
		--tag ${SUMMARIZATION_IMAGE}:latest \
		--file containers/summarization/Dockerfile \
		.

## test-summarization-image: rebuild, then run the container test suite
.PHONY: test-summarization-image
test-summarization-image: build-summarization-image
	bash containers/summarization/test.sh

## push-summarization-image: multi-arch build and push of both the versioned
## tag and :latest, using the registry :latest image as build cache.
.PHONY: push-summarization-image
push-summarization-image:
	docker buildx build --push --progress=plain \
		--platform linux/amd64,linux/arm64 \
		--tag ${SUMMARIZATION_IMAGE}:${SUMMARIZATION_TAG} \
		--tag ${SUMMARIZATION_IMAGE}:latest \
		--label org.opencontainers.image.created=$(shell date -u +"%Y-%m-%dT%H:%M:%SZ") \
		--label org.opencontainers.image.version=${SUMMARIZATION_TAG} \
		--cache-from=type=registry,ref=${SUMMARIZATION_IMAGE}:latest \
		--file containers/summarization/Dockerfile \
		.

################################################################################
# Target: *-docker-images
################################################################################

# Aggregate targets: build, test, or push every workload image in one shot.
# Each prerequisite is itself a phony per-image target defined above.

.PHONY: build-docker-images
build-docker-images: build-amplify-image build-tika-image build-ffmpeg-image build-magick-image build-frictionless-image build-detection-image build-frictionless-extract-image build-summarization-image

.PHONY: test-docker-images
test-docker-images: test-tika-image test-ffmpeg-image test-magick-image test-frictionless-image test-detection-image test-frictionless-extract-image test-summarization-image

.PHONY: push-docker-images
push-docker-images: push-amplify-image push-tika-image push-ffmpeg-image push-magick-image push-frictionless-image push-detection-image push-frictionless-extract-image push-summarization-image

# Release tarballs suitable for upload to GitHub release pages
################################################################################
Expand Down
20 changes: 20 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ jobs:
timeout: 10m
cpu: 2
memory: 8Gi
  # Summarization workload: runs the BART text-summarization container
  # through the bacalhau executor.
  - id: summarization-job
    type: bacalhau
    # NOTE(review): tag 0.0.2 is pinned here but images are pushed with
    # SUMMARIZATION_TAG from the Makefile — confirm they stay in sync.
    image: ghcr.io/bacalhau-project/amplify/summarization:0.0.2
    entrypoint:
      - /usr/local/bin/run
    timeout: 10m
    cpu: 4
    memory: 8Gi


# Amplify Work Graph specification
# Each item in the list is a node in the execution graph. A single request
Expand Down Expand Up @@ -118,6 +127,15 @@ graph:
path: /inputs
outputs:
- path: /outputs
- id: text-summarization-node
job_id: summarization-job
inputs:
- node_id: metadata-node
predicate: '.*text\/plain.*'
- node_id: root-node
path: /inputs/
outputs:
- path: /outputs
- id: merge-node
job_id: merge-job
inputs:
Expand All @@ -137,5 +155,7 @@ graph:
path: /inputs/csv-profile
- node_id: convert-table-to-csv-node
path: /inputs/converted-tables
- node_id: text-summarization-node
path: /inputs/summarized-text
outputs:
- path: /outputs
12 changes: 12 additions & 0 deletions containers/summarization/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
FROM docker.io/huggingface/transformers-cpu:4.18.0

# Bake the facebook/bart-large-cnn model into the image at build time so the
# job can run fully offline (no Hub download at container start).
RUN mkdir -p /models && \
    python3 -c 'from transformers import pipeline; pipeline("summarization", model="facebook/bart-large-cnn").save_pretrained("/models/bart-large-cnn")'

RUN mkdir -p /usr/local/bin
ENV PATH="/usr/local/bin:${PATH}"
COPY containers/scripts/run_program.sh /usr/local/bin/run_program.sh
# This entrypoint gets overwritten by Amplify at job-submission time.
# (The comment must live on its own line: a trailing comment on an exec-form
# ENTRYPOINT makes the line invalid JSON, and Docker silently falls back to
# shell form, which mangles the argv.)
ENTRYPOINT ["/usr/local/bin/run"]
COPY containers/summarization/run /usr/local/bin/run
COPY containers/summarization/bart-summarize.py /usr/local/bin/bart-summarize.py
36 changes: 36 additions & 0 deletions containers/summarization/bart-summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Summarize a text file with the locally baked BART model.

Usage: bart-summarize.py <input_file> <output_dir>

Reads <input_file>, truncates it to the model's input budget, runs the
summarization pipeline, prints the JSON summary to stdout, and writes it to
<output_dir>/<input_file_name>.json. Raises on empty input or on any
unexpected pipeline output (previously a malformed result was silently
ignored and the script exited 0 without writing anything).
"""
import os
import sys
import json
import pathlib

from transformers import pipeline


# Read input file; errors='replace' tolerates files with invalid encodings.
with open(sys.argv[1], mode='r', errors='replace') as f:
    text = f.read()
if len(text) == 0:
    raise Exception("File {} appears to be empty".format(sys.argv[1]))

# facebook/bart-large-cnn accepts input up to 1024 tokens; truncating to
# 1024 characters is a cheap conservative proxy for that limit.
if len(text) > 1024:
    text = text[:1024]

# Run inference against the model baked into the image (see Dockerfile).
summarizer = pipeline("summarization", model="/models/bart-large-cnn")
summary_list = summarizer(text, max_length=130, min_length=30, do_sample=False)

if not summary_list:
    raise Exception("Generated summary is empty.")

result = summary_list[0]
if not (isinstance(result, dict) and 'summary_text' in result):
    # Fail loudly instead of silently producing no output file.
    raise Exception("Unexpected summarizer output: {!r}".format(result))

json_object = json.dumps(result, indent=None)
print(json_object)

# Output path mirrors the input file name with a .json suffix appended.
output_file = os.path.join(sys.argv[2], pathlib.Path(sys.argv[1]).name + ".json")
print(output_file, file=sys.stderr)

with open(output_file, "w") as outfile:
    outfile.write(json_object)
3 changes: 3 additions & 0 deletions containers/summarization/run
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Container entrypoint wrapper. Delegates to run_program.sh, which walks the
# /inputs directory and invokes the quoted command for each file found.
# The single quotes are deliberate: ${input_file} and ${output_dir} must reach
# run_program.sh unexpanded so it can substitute them per input file.
# NOTE(review): exact placeholder semantics are defined in
# containers/scripts/run_program.sh — confirm there.
run_program.sh 'python3 /usr/local/bin/bart-summarize.py ${input_file} ${output_dir}' /inputs /outputs ;
52 changes: 52 additions & 0 deletions containers/summarization/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
# Integration tests for the summarization container image.
# Requires docker and a locally built $IMAGE (see: make test-summarization-image).

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
IMAGE=ghcr.io/bacalhau-project/amplify/summarization:latest

echo "${SCRIPT_DIR}"

# checkError: exit non-zero if the immediately preceding command failed.
# (Must be called directly after the command under test — $? is inspected.)
checkError() {
    if [ $? -ne 0 ]; then
        echo "Failed to run test, there was an error"
        exit 1
    fi
}

# checkFileExists <path>: fail the test run if an expected output is missing.
checkFileExists() {
    if [ ! -f "$1" ]; then
        echo "File $1 does not exist"
        exit 1
    fi
}

# checkFileDoesNotExists <path>: fail if a file that must NOT be produced exists
# (e.g. a summary for an empty input).
checkFileDoesNotExists() {
    if [ -f "$1" ]; then
        echo "File $1 does exist"
        exit 1
    fi
}

main() {
    # Test blob: a single text file mounted directly at /inputs.
    # Note: no -t flag — allocating a TTY breaks non-interactive CI runs.
    rm -rf "${SCRIPT_DIR}/outputs"
    docker run -i --rm -v "${SCRIPT_DIR}/../test/testdata/text_blob/somethoughts:/inputs" -v "${SCRIPT_DIR}/outputs:/outputs" --entrypoint "" "${IMAGE}" run
    checkError
    checkFileExists "${SCRIPT_DIR}/outputs/file.plain.json"

    # Test subdir: a directory tree of text files, summaries mirror the layout.
    rm -rf "${SCRIPT_DIR}/outputs"
    docker run -i --rm -v "${SCRIPT_DIR}/../test/testdata/text_dir:/inputs" -v "${SCRIPT_DIR}/outputs:/outputs" --entrypoint "" "${IMAGE}" run
    checkError
    checkFileExists "${SCRIPT_DIR}/outputs/subdir/codfish.txt.json"
    checkFileExists "${SCRIPT_DIR}/outputs/looneytunes.plain.json"
    # Test empty file: no summary must be produced for an empty input.
    checkFileDoesNotExists "${SCRIPT_DIR}/outputs/empty.plain.json"

    # Test non-textual files: the container must not crash on binary inputs.
    rm -rf "${SCRIPT_DIR}/outputs"
    docker run -i --rm -v "${SCRIPT_DIR}/../test/testdata/images:/inputs" -v "${SCRIPT_DIR}/outputs:/outputs" --entrypoint "" "${IMAGE}" run
    checkError
}

main
Loading

0 comments on commit b6cb278

Please sign in to comment.