Skip to content

Commit 5b0c01b

Browse files
authored
Final test data cache - inside CI docker images (#40689)
* run * build * build * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
1 parent 1f3cc93 commit 5b0c01b

8 files changed

+54
-15
lines changed

.circleci/create_circleci_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,9 @@ def to_dict(self):
177177
"command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt"
178178
}
179179
},
180-
{"run": {"name": "fetch hub objects before pytest", "command": "python3 utils/fetch_hub_objects_for_ci.py"}},
180+
# While building the CircleCI docker images, the test data may or may not have been downloaded already.
181+
# If it's done already, the files are inside the directory `/test_data/`.
182+
{"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
181183
{"run": {
182184
"name": "Run tests",
183185
"command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}

docker/custom-tokenizers.dockerfile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ FROM python:3.9-slim
22
ENV PYTHONDONTWRITEBYTECODE=1
33
ARG REF=main
44
USER root
5-
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler git-lfs
5+
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler git-lfs curl
66
ENV UV_PYTHON=/usr/local/bin/python
77
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
88

@@ -15,12 +15,20 @@ RUN mv catch.hpp ../libs/
1515
RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
1616
RUN make install -j 10
1717

18+
WORKDIR /
1819

1920
RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
2021
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
2122
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
2223
# spacy is not used, so it is not tested. It causes failures. TODO: fix later
2324
RUN uv run python -m unidic download
25+
26+
# fetch test data and hub objects within CircleCI docker images to reduce even more connections
27+
# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
28+
# the data is downloaded to the directory `/test_data`; at CircleCI runtime, it needs to be copied to the root of `transformers`
29+
RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
30+
31+
2432
RUN uv pip uninstall transformers
2533

2634
RUN apt-get clean && rm -rf /var/lib/apt/lists/*

docker/examples-torch.dockerfile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,18 @@ FROM python:3.9-slim
22
ENV PYTHONDONTWRITEBYTECODE=1
33
ARG REF=main
44
USER root
5-
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg
5+
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
66
ENV UV_PYTHON=/usr/local/bin/python
77
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
88
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
99
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
1010
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
11+
12+
# fetch test data and hub objects within CircleCI docker images to reduce even more connections
13+
# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
14+
# the data is downloaded to the directory `/test_data`; at CircleCI runtime, it needs to be copied to the root of `transformers`
15+
RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
16+
17+
1118
RUN uv pip uninstall transformers
1219
RUN apt-get clean && rm -rf /var/lib/apt/lists/*

docker/exotic-models.dockerfile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ FROM python:3.9-slim
22
ENV PYTHONDONTWRITEBYTECODE=1
33
ARG REF=main
44
USER root
5-
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1 g++ tesseract-ocr git-lfs
5+
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1 g++ tesseract-ocr git-lfs curl
66
ENV UV_PYTHON=/usr/local/bin/python
77
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
88
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
@@ -13,5 +13,12 @@ RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transform
1313
# RUN git clone https://github.com/facebookresearch/detectron2.git
1414
# RUN python3 -m pip install --no-cache-dir -e detectron2
1515
RUN uv pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' --no-build-isolation
16+
17+
# fetch test data and hub objects within CircleCI docker images to reduce even more connections
18+
# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
19+
# the data is downloaded to the directory `/test_data`; at CircleCI runtime, it needs to be copied to the root of `transformers`
20+
RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
21+
22+
1623
RUN uv pip uninstall transformers
1724
RUN apt-get clean && rm -rf /var/lib/apt/lists/*

docker/pipeline-torch.dockerfile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,17 @@ FROM python:3.9-slim
22
ENV PYTHONDONTWRITEBYTECODE=1
33
ARG REF=main
44
USER root
5-
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg
5+
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg curl
66
ENV UV_PYTHON=/usr/local/bin/python
77
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
88
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
99
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
1010
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
11+
12+
# fetch test data and hub objects within CircleCI docker images to reduce even more connections
13+
# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
14+
# the data is downloaded to the directory `/test_data`; at CircleCI runtime, it needs to be copied to the root of `transformers`
15+
RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
16+
17+
1118
RUN uv pip uninstall transformers

docker/torch-light.dockerfile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,16 @@ FROM python:3.9-slim
22
ENV PYTHONDONTWRITEBYTECODE=1
33
ARG REF=main
44
USER root
5-
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg
5+
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
66
ENV UV_PYTHON=/usr/local/bin/python
77
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
88
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
99
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
1010
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
11+
12+
# fetch test data and hub objects within CircleCI docker images to reduce even more connections
13+
# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
14+
# the data is downloaded to the directory `/test_data`; at CircleCI runtime, it needs to be copied to the root of `transformers`
15+
RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
16+
1117
RUN uv pip uninstall transformers

tests/test_processing_common.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1212,9 +1212,11 @@ def test_apply_chat_template_video_frame_sampling(self):
12121212
messages[0][0]["content"][0] = {
12131213
"type": "video",
12141214
"url": [
1215-
"https://www.ilankelman.org/stopsigns/australia.jpg",
1216-
"https://www.ilankelman.org/stopsigns/australia.jpg",
1217-
],
1215+
url_to_local_path(
1216+
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
1217+
)
1218+
]
1219+
* 2,
12181220
}
12191221
out_dict_with_video = processor.apply_chat_template(
12201222
messages,

utils/fetch_hub_objects_for_ci.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
"https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4",
3636
"https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
3737
"https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
38+
"https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
3839
]
3940

4041

@@ -59,12 +60,11 @@ def url_to_local_path(url, return_url_if_not_found=True):
5960
# `fatal: could not read Username for 'https://hub-ci.huggingface.co': Success`
6061
# But this repo. is never used in a test decorated by `is_staging_test`.
6162
if not _run_staging:
62-
# Used in as `tests/models/auto/test_modeling_auto.py::AutoModelTest::test_dynamic_saving_from_local_repo --> _ = Repository( ... )`
63-
# TODO: Remove this and the above test when `huggingface_hub v1.0` comes (where `Repository` will be removed).
64-
_ = Repository(
65-
local_dir="tiny-random-custom-architecture",
66-
clone_from="hf-internal-testing/tiny-random-custom-architecture",
67-
)
63+
if not os.path.isdir("tiny-random-custom-architecture"):
64+
_ = Repository(
65+
local_dir="tiny-random-custom-architecture",
66+
clone_from="hf-internal-testing/tiny-random-custom-architecture",
67+
)
6868

6969
# For `tests/test_tokenization_mistral_common.py:TestMistralCommonTokenizer`, which eventually calls
7070
# `mistral_common.tokens.tokenizers.utils.download_tokenizer_from_hf_hub` which (probably) doesn't have the cache.

0 commit comments

Comments
 (0)