"""Helpers and a script entry point for fetching Hub objects and test media files.

NOTE(review): this file reached review with its line structure collapsed and one
span missing (from inside `validate_downloaded_content` up to the first
`local_files_only = ... > 0` statement of the script section). Everything marked
`NOTE(review)` below was reconstructed from the surviving code and must be
confirmed against version control before merging.
"""

import os
import re
import shutil

import httpx
from huggingface_hub import hf_hub_download, snapshot_download

from transformers.testing_utils import _run_pipeline_tests, _run_staging
from transformers.utils.import_utils import is_mistral_common_available


# Entries are kept exactly as found, including the repeated `tennis.mp4` and
# `tiny_video.mp4` URLs.
URLS_FOR_TESTING_DATA = [
    "http://images.cocodataset.org/val2017/000000000139.jpg",
    "http://images.cocodataset.org/val2017/000000000285.jpg",
    "http://images.cocodataset.org/val2017/000000000632.jpg",
    "http://images.cocodataset.org/val2017/000000000724.jpg",
    "http://images.cocodataset.org/val2017/000000000776.jpg",
    "http://images.cocodataset.org/val2017/000000000785.jpg",
    "http://images.cocodataset.org/val2017/000000000802.jpg",
    "http://images.cocodataset.org/val2017/000000000872.jpg",
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
    "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
    "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
    "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png",
    "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
    "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav",
    "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/glass-breaking-151256.mp3",
    "https://huggingface.co/datasets/raushan-testing-hf/images_test/resolve/main/picsum_237_200x300.jpg",
    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4",
    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
    "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png",
    "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/two_dogs.jpg",
    "https://llava-vl.github.io/static/images/view.jpg",
    "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4",
    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
]

# Matches `https://huggingface.co/[datasets/]<namespace>/<name>/resolve/<revision>/<path>`.
_HF_RESOLVE_URL_RE = re.compile(r"https://huggingface\.co/(datasets/)?([^/]+/[^/]+)/resolve/([^/]+)/(.+)")


def url_to_local_path(url, return_url_if_not_found=True):
    """
    Map a URL to the file named after its last path segment in the current working directory.

    Args:
        url: The URL whose last `/`-separated segment is used as the local file name.
        return_url_if_not_found: When `True` (default) and no such local file exists, return `url` unchanged.

    Returns:
        The local file name, or `url` itself when the file is missing and `return_url_if_not_found` is `True`.
    """
    filename = url.split("/")[-1]

    if not os.path.exists(filename) and return_url_if_not_found:
        return url

    return filename


def parse_hf_url(url):
    """
    Parse a HuggingFace Hub URL into components for hf_hub_download.

    Returns dict with (repo_id, filename, repo_type, revision) or None if not a HF URL.
    The revision is reported as `None` when the URL points at `main`.
    """
    match = _HF_RESOLVE_URL_RE.match(url)
    if not match:
        return None

    is_dataset = match.group(1) is not None
    revision = match.group(3)
    return {
        "repo_id": match.group(2),
        "filename": match.group(4),
        "repo_type": "dataset" if is_dataset else "model",
        "revision": revision if revision != "main" else None,
    }


def validate_downloaded_content(filepath):
    """
    Check that the first bytes of a downloaded file do not look like an HTML page.

    NOTE(review): only the header read and the start of the `bad_sig` loop survived; the
    signature list, the comparison and the exception type are reconstructed. Confirm them
    (and any further checks the original performed) against version control.

    Raises:
        ValueError: If the file starts with an HTML signature.
    """
    with open(filepath, "rb") as f:
        header = f.read(32)

    # Compare case-insensitively and ignore leading whitespace.
    normalized_header = header.lstrip().lower()
    for bad_sig in [b"<!doctype", b"<html"]:
        if normalized_header.startswith(bad_sig):
            raise ValueError(f"Downloaded file {filepath} looks like an HTML page, not the expected content.")


def download_test_file(url):
    """
    Download `url` into the current working directory under the name `url_to_local_path` looks for.

    NOTE(review): the original definition was lost; only the call `download_test_file(url)`
    survived. This reconstruction uses the helpers and imports that are present in the file
    (`parse_hf_url`, `hf_hub_download`, `httpx`, `validate_downloaded_content`). Confirm the
    behavior and the return value against version control.

    Returns:
        The local file name (last path segment of `url`).
    """
    filename = url.split("/")[-1]

    # Nothing to do when the file is already there (this also covers repeated URLs).
    if os.path.exists(filename):
        return filename

    # Write to a temporary name first so an interrupted or invalid download never
    # leaves a file that `url_to_local_path` would later pick up.
    partial_path = f"{filename}.part"
    try:
        hf_parts = parse_hf_url(url)
        if hf_parts is not None:
            # Files may live in a sub-directory of the repo: fetch into the cache and copy
            # to a flat name in the working directory.
            cached_path = hf_hub_download(**hf_parts)
            shutil.copyfile(cached_path, partial_path)
        else:
            with httpx.stream("GET", url, follow_redirects=True, timeout=60.0) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_bytes():
                        f.write(chunk)

        validate_downloaded_content(partial_path)
        os.replace(partial_path, filename)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)

    return filename


# NOTE(review): the `__main__` guard is reconstructed (the header was in the lost span).
# It keeps importing this module for its helpers free of downloads.
if __name__ == "__main__":
    # NOTE(review): `_run_pipeline_tests`, `_run_staging` and `snapshot_download` are imported
    # above, but the code that used them was in the lost span and is NOT reconstructed here.
    # Restore those sections from version control.

    if is_mistral_common_available():
        # NOTE(review): these imports were in the lost span; the module paths are inferred
        # from the names used below and from the surviving comments. Confirm.
        from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
        from mistral_common.tokens.tokenizers.utils import list_local_hf_repo_files

        from transformers import AutoTokenizer
        from transformers.tokenization_mistral_common import MistralCommonBackend

        # NOTE(review): the original `repo_id` value was lost; this is a best-effort
        # reconstruction and MUST be confirmed against version control.
        repo_id = "hf-internal-testing/namespace-mistralai-repo_name-Mistral-Small-3.1-24B-Instruct-2503"
        local_files_only = len(list_local_hf_repo_files(repo_id, revision=None)) > 0

        # This will go the path `transformers/tokenization_mistral_common.py::MistralCommonBackend::from_pretrained --> mistral_common.tokens.tokenizers.utils.download_tokenizer_from_hf_hub`.
        # No idea at all why we need the statement below again (`MistralCommonBackend.from_pretrained`).
        AutoTokenizer.from_pretrained(
            repo_id, tokenizer_type="mistral", local_files_only=local_files_only, revision=None
        )
        _ = MistralCommonBackend.from_pretrained(
            repo_id,
            local_files_only=local_files_only,
            # This is a hack as `list_local_hf_repo_files` from `mistral_common` has a bug
            # TODO: Discuss with `mistral-common` maintainers: after a fix being done there, remove this `revision` hack
            revision=None,
        )
        MistralTokenizer.from_hf_hub(repo_id, local_files_only=local_files_only)

        repo_id = "mistralai/Voxtral-Mini-3B-2507"
        local_files_only = len(list_local_hf_repo_files(repo_id, revision=None)) > 0
        AutoTokenizer.from_pretrained(repo_id, local_files_only=local_files_only, revision=None)
        MistralTokenizer.from_hf_hub(repo_id, local_files_only=local_files_only)

    # Download files from URLs to local directory
    for url in URLS_FOR_TESTING_DATA:
        download_test_file(url)