Skip to content

Commit

Permalink
[feat] upgrade torchrec to 1.0.0 (#19)
Browse files (browse the repository at this point in the history)
  • Loading branch information
tiankongdeguiji authored Oct 29, 2024
1 parent 55d2874 commit 0d34ca6
Show file tree
Hide file tree
Showing 14 changed files with 74 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/codestyle_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
ci-test:
runs-on: tzrec-codestyle-runner
container:
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.5
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.6
steps:
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pytyping_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
ci-test:
runs-on: tzrec-codestyle-runner
container:
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.5
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.6
steps:
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/unittest_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
ci-test:
runs-on: tzrec-runner
container:
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.5
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.6
options: --gpus all --ipc host
steps:
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
Expand Down
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ repos:
files: \.py$
args: ["--license-filepath", "data/.license_header.txt", "--use-current-year"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.10
rev: v0.7.1
hooks:
- id: ruff
args: [ --fix ]
- id: ruff-format
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-yaml
Expand All @@ -29,7 +29,7 @@ repos:
- id: codespell
args: ["--skip", "*.json"]
- repo: https://github.com/executablebooks/mdformat
rev: 0.7.17
rev: 0.7.18
hooks:
- id: mdformat
additional_dependencies:
Expand Down
38 changes: 30 additions & 8 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,22 +1,44 @@
FROM pytorch/pytorch:2.4.0-cuda12.1-cudnn9-devel
FROM ubuntu:22.04

RUN sed -i "s@http://archive.ubuntu.com@http://mirrors.aliyun.com@g" /etc/apt/sources.list && \
sed -i "s@http://security.ubuntu.com@http://mirrors.aliyun.com@g" /etc/apt/sources.list && \
sed -i "s@http://ports.ubuntu.com@http://mirrors.aliyun.com@g" /etc/apt/sources.list && \
apt-get update && \
apt-get upgrade -y && \
apt-get install -y git vim watchman wget
apt-get install -y --no-install-recommends \
build-essential ca-certificates \
ccache cmake gcc git vim watchman wget curl && \
rm -rf /var/lib/apt/lists/*

RUN wget https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/libidn11_1.33-2.2ubuntu2_amd64.deb && \
apt-get install ./libidn11_1.33-2.2ubuntu2_amd64.deb
apt-get install ./libidn11_1.33-2.2ubuntu2_amd64.deb && rm libidn11_1.33-2.2ubuntu2_amd64.deb

ADD pip.conf /root/.config/pip/pip.conf
RUN curl -fsSL -v -o ~/miniconda.sh -O "https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/Miniforge3-Linux-x86_64.sh" && \
chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda update -y -n base -c defaults conda && \
/opt/conda/bin/conda install -y python=3.11 && \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/bin:$PATH

RUN pip install fbgemm-gpu==0.8.0 --index-url https://download.pytorch.org/whl/cu121 && \
pip install torchmetrics==1.0.3 && \
pip install torchrec==0.8.0 --index-url https://download.pytorch.org/whl/cu121 && \
pip install torch_tensorrt==2.4.0
ARG DEVICE
RUN case ${DEVICE} in \
"cu121") pip install torch==2.5.0 fbgemm-gpu==1.0.0 --index-url https://download.pytorch.org/whl/cu121 && \
pip install torchmetrics==1.0.3 torch_tensorrt==2.5.0 && \
pip install torchrec==1.0.0 --index-url https://download.pytorch.org/whl/cu121 ;; \
* ) pip install torch==2.5.0 fbgemm-gpu==1.0.0 --index-url https://download.pytorch.org/whl/cpu && \
pip install torchmetrics==1.0.3 && \
pip install torchrec==1.0.0 --index-url https://download.pytorch.org/whl/cpu ;; \
esac && \
/opt/conda/bin/conda clean -ya

ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

ADD requirements.txt /root/requirements.txt
ADD requirements /root/requirements
RUN cd /root && pip install -r requirements.txt
RUN cd /root && pip install -r requirements.txt && rm requirements.txt
15 changes: 12 additions & 3 deletions docs/source/quick_start/local_tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ pip index versions tzrec -f http://tzrec.oss-cn-beijing.aliyuncs.com/release/nig
```bash
conda create -n tzrec python=3.11
conda activate tzrec
pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121
pip install fbgemm-gpu==0.8.0 --index-url https://download.pytorch.org/whl/cu121
pip install torch==2.5.0 --index-url https://download.pytorch.org/whl/cu121
pip install fbgemm-gpu==1.0.0 --index-url https://download.pytorch.org/whl/cu121
pip install torchmetrics==1.0.3
pip install torchrec==0.8.0 --index-url https://download.pytorch.org/whl/cu121
pip install torchrec==1.0.0 --index-url https://download.pytorch.org/whl/cu121
pip install tzrec==${TZREC_NIGHTLY_VERSION} -f http://tzrec.oss-cn-beijing.aliyuncs.com/release/nightly/repo.html --trusted-host tzrec.oss-cn-beijing.aliyuncs.com
```

Expand All @@ -30,6 +30,15 @@ docker exec -it <CONTAINER_ID> bash
pip install tzrec==${TZREC_NIGHTLY_VERSION} -f http://tzrec.oss-cn-beijing.aliyuncs.com/release/nightly/repo.html --trusted-host tzrec.oss-cn-beijing.aliyuncs.com
```

注:

```
GPU版本(CUDA 12.1) 镜像地址:
mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:${TZREC_DOCKER_VERSION}-cu121
CPU版本 镜像地址:
mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:${TZREC_DOCKER_VERSION}-cpu
```

## 前置准备

### 数据
Expand Down
6 changes: 3 additions & 3 deletions requirements/runtime.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ alibabacloud_credentials
anytree
common_io @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/common_io-0.4.1%2Btunnel-py2.py3-none-any.whl
faiss-cpu
fbgemm-gpu==0.8.0
fbgemm-gpu==1.0.0
graphlearn @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/graphlearn-1.3.0-cp311-cp311-linux_x86_64.whl ; python_version=="3.11"
graphlearn @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/graphlearn-1.3.0-cp310-cp310-linux_x86_64.whl ; python_version=="3.10"
grpcio-tools<1.63.0
Expand All @@ -11,6 +11,6 @@ pyfg @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-0.3.2-cp311-cp
pyfg @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-0.3.2-cp310-cp310-linux_x86_64.whl ; python_version=="3.10"
scikit-learn
tensorboard
torch==2.4.0
torch==2.5.0
torchmetrics==1.0.3
torchrec==0.8.0
torchrec==1.0.0
14 changes: 10 additions & 4 deletions scripts/build_docker.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
#!/usr/bin/env bash

REGISTRY=mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec
DOCKER_TAG=0.5
DOCKER_TAG=0.6

cp requirements.txt docker/
rm -rf docker/requirements
cp -r requirements/ docker/requirements
cd docker

docker build -t ${REGISTRY}/tzrec-devel:latest .
docker images -q ${REGISTRY}/tzrec-devel:latest | xargs -I {} docker tag {} ${REGISTRY}/tzrec-devel:${DOCKER_TAG}
docker push ${REGISTRY}/tzrec-devel:latest
for DEVICE in cu121 cpu
do
docker build --network host -t ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-${DEVICE} --build-arg DEVICE=${DEVICE} .
docker push ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-${DEVICE}
done

docker images -q ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-cu121 | xargs -I {} docker tag {} ${REGISTRY}/tzrec-devel:${DOCKER_TAG}
docker images -q ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-cu121 | xargs -I {} docker tag {} ${REGISTRY}/tzrec-devel:latest
docker push ${REGISTRY}/tzrec-devel:${DOCKER_TAG}
docker push ${REGISTRY}/tzrec-devel:latest
2 changes: 1 addition & 1 deletion scripts/ci_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
pip install -r requirements.txt
bash scripts/gen_proto.sh

MKL_THREADING_LAYER=GNU PYTHONPATH=. python tzrec/tests/run.py
MKL_THREADING_LAYER=GNU TORCH_DEVICE_BACKEND_AUTOLOAD=0 PYTHONPATH=. python tzrec/tests/run.py
2 changes: 2 additions & 0 deletions tzrec/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

import os as _os

import torch as _torch # NOQA

if "OMP_NUM_THREADS" not in _os.environ:
_os.environ["OMP_NUM_THREADS"] = "1"

Expand Down
2 changes: 1 addition & 1 deletion tzrec/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def to(self, device: torch.device, non_blocking: bool = False) -> "Batch":
batch_size=self.batch_size,
)

def record_stream(self, stream: torch.cuda.streams.Stream) -> None:
def record_stream(self, stream: torch.Stream) -> None:
"""Record which streams have used the tensor."""
for v in self.dense_features.values():
v.record_stream(stream)
Expand Down
14 changes: 7 additions & 7 deletions tzrec/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ def build_mock_input_with_fg(
inputs = defaultdict(dict)
single_id_fields = {user_id, item_id}
for feature in features:
if type(feature) == IdFeature:
if type(feature) is IdFeature:
is_multi = (
random.random() < 0.5 and feature.inputs[0] not in single_id_fields
)
Expand All @@ -594,22 +594,22 @@ def build_mock_input_with_fg(
vocab_list=feature.config.vocab_list,
multival_sep=chr(29),
)
elif type(feature) == RawFeature:
elif type(feature) is RawFeature:
side, name = feature.side_inputs[0]
inputs[side][name] = RawMockInput(
name,
value_dim=feature.config.value_dim,
multival_sep=chr(29),
)
elif type(feature) == ComboFeature:
elif type(feature) is ComboFeature:
for side, input_name in feature.side_inputs:
if input_name in inputs[side]:
continue
is_multi = random.random() < 0.5 and input_name not in single_id_fields
inputs[side][input_name] = IdMockInput(
input_name, is_multi=is_multi, multival_sep=chr(29)
)
elif type(feature) == LookupFeature:
elif type(feature) is LookupFeature:
for i, (side, input_name) in enumerate(feature.side_inputs):
if input_name in inputs[side]:
continue
Expand All @@ -627,7 +627,7 @@ def build_mock_input_with_fg(
inputs[side][input_name] = IdMockInput(
input_name, is_multi=is_multi, multival_sep=chr(29)
)
elif type(feature) == MatchFeature:
elif type(feature) is MatchFeature:
for i, (side, input_name) in enumerate(feature.side_inputs):
if input_name in inputs[side]:
continue
Expand All @@ -637,14 +637,14 @@ def build_mock_input_with_fg(
inputs[side][input_name] = IdMockInput(
input_name, multival_sep=chr(29)
)
elif type(feature) == ExprFeature:
elif type(feature) is ExprFeature:
for side, input_name in feature.side_inputs:
if input_name in inputs[side]:
continue
inputs[side][input_name] = RawMockInput(
input_name, multival_sep=chr(29)
)
elif type(feature) == TokenizeFeature:
elif type(feature) is TokenizeFeature:
side, name = feature.side_inputs[0]
inputs[side][name] = IdMockInput(
name,
Expand Down
2 changes: 1 addition & 1 deletion tzrec/utils/config_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def edit_config(pipeline_config: Message, edit_config_json: Dict[str, Any]) -> M

# pyre-ignore [2, 3]
def _type_convert(proto, val, parent=None):
if type(val) != type(proto):
if type(val) is not type(proto):
try:
if isinstance(proto, bool):
assert val in ["True", "true", "False", "false"]
Expand Down
2 changes: 1 addition & 1 deletion tzrec/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.5.8"
__version__ = "0.6.0"

0 comments on commit 0d34ca6

Please sign in to comment.