-
Notifications
You must be signed in to change notification settings - Fork 14
Add file integrity checking for TimesFM model downloads #676
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
e720fc4
d807653
e1c29c1
f3800f2
39ea210
073898c
360f66b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,10 +1,14 @@ | ||||||||||||||||||||||||||
| import os | ||||||||||||||||||||||||||
| import warnings | ||||||||||||||||||||||||||
| from typing import Dict | ||||||||||||||||||||||||||
| from typing import List | ||||||||||||||||||||||||||
| from typing import Optional | ||||||||||||||||||||||||||
| from urllib import request | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| import numpy as np | ||||||||||||||||||||||||||
| from sklearn.base import ClassifierMixin | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| from etna.core.utils import get_known_hash, verify_file_hash | ||||||||||||||||||||||||||
| from etna.datasets import TSDataset | ||||||||||||||||||||||||||
| from etna.experimental.classification.classification import TimeSeriesBinaryClassifier | ||||||||||||||||||||||||||
| from etna.experimental.classification.feature_extraction.base import BaseTimeSeriesFeatureExtractor | ||||||||||||||||||||||||||
|
|
@@ -98,7 +102,34 @@ def download_model(model_name: str, dataset_freq: str, path: str): | |||||||||||||||||||||||||
| If the model does not exist in s3. | ||||||||||||||||||||||||||
| """ | ||||||||||||||||||||||||||
| url = f"http://etna-github-prod.cdn-tinkoff.ru/series_classification/22_11_2022/{dataset_freq}/{model_name}.pickle" | ||||||||||||||||||||||||||
| expected_hash = get_known_hash(url) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| # Check if file exists and verify integrity | ||||||||||||||||||||||||||
| if os.path.exists(path): | ||||||||||||||||||||||||||
| if verify_file_hash(path, expected_hash): | ||||||||||||||||||||||||||
| return # File exists and is valid (or no hash to check) | ||||||||||||||||||||||||||
| else: | ||||||||||||||||||||||||||
| # File exists but hash doesn't match, re-download | ||||||||||||||||||||||||||
| if expected_hash is not None: | ||||||||||||||||||||||||||
| warnings.warn( | ||||||||||||||||||||||||||
| f"Local model file hash does not match expected hash. " | ||||||||||||||||||||||||||
| f"This may indicate a corrupted download. Re-downloading {model_name} from {url}" | ||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||
| os.remove(path) | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| # Download the file | ||||||||||||||||||||||||||
| try: | ||||||||||||||||||||||||||
| request.urlretrieve(url=url, filename=path) | ||||||||||||||||||||||||||
| except Exception: | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| # Verify the downloaded file | ||||||||||||||||||||||||||
| if not verify_file_hash(path, expected_hash): | ||||||||||||||||||||||||||
| if expected_hash is not None: | ||||||||||||||||||||||||||
| os.remove(path) | ||||||||||||||||||||||||||
| raise RuntimeError( | ||||||||||||||||||||||||||
| f"Downloaded model file {model_name} from {url} failed integrity check. " | ||||||||||||||||||||||||||
| f"This may indicate a network issue or corrupted download." | ||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||
| except Exception as e: | ||||||||||||||||||||||||||
| if expected_hash is not None and "integrity check" in str(e): | ||||||||||||||||||||||||||
|
Comment on lines
+128
to
+133
|
||||||||||||||||||||||||||
| raise RuntimeError( | |
| f"Downloaded model file {model_name} from {url} failed integrity check. " | |
| f"This may indicate a network issue or corrupted download." | |
| ) | |
| except Exception as e: | |
| if expected_hash is not None and "integrity check" in str(e): | |
| raise IntegrityCheckError( | |
| f"Downloaded model file {model_name} from {url} failed integrity check. " | |
| f"This may indicate a network issue or corrupted download." | |
| ) | |
| except Exception as e: | |
| if expected_hash is not None and isinstance(e, IntegrityCheckError): |
Copilot
AI
Jul 16, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
String matching on exception messages is fragile and error-prone. Consider catching specific exception types or using a custom exception class for integrity check failures instead of parsing the exception message.
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -5,13 +5,15 @@ | |||||
| from pathlib import Path | ||||||
| from typing import Dict | ||||||
| from typing import List | ||||||
| from typing import Optional | ||||||
| from typing import Sequence | ||||||
| from typing import Union | ||||||
| from urllib import request | ||||||
|
|
||||||
| import pandas as pd | ||||||
|
|
||||||
| from etna import SETTINGS | ||||||
| from etna.core.utils import get_known_hash, verify_file_hash | ||||||
| from etna.datasets import TSDataset | ||||||
| from etna.distributions import BaseDistribution | ||||||
| from etna.models.base import PredictionIntervalContextRequiredAbstractModel | ||||||
|
|
@@ -84,18 +86,55 @@ def _is_url(self): | |||||
| return self.path_or_url.startswith("https://") or self.path_or_url.startswith("http://") | ||||||
|
|
||||||
| def _download_model_from_url(self) -> str: | ||||||
| """Download model from url to local cache_dir.""" | ||||||
| """Download model from url to local cache_dir with integrity verification.""" | ||||||
| model_file = self.path_or_url.split("/")[-1] | ||||||
| model_dir = model_file.split(".zip")[0] | ||||||
| full_model_path = f"{self.cache_dir}/{model_dir}" | ||||||
| if not os.path.exists(full_model_path): | ||||||
| try: | ||||||
| request.urlretrieve(url=self.path_or_url, filename=model_file) | ||||||
|
|
||||||
| with zipfile.ZipFile(model_file, "r") as zip_ref: | ||||||
| zip_ref.extractall(self.cache_dir) | ||||||
| finally: | ||||||
| os.remove(model_file) | ||||||
| zip_file_path = f"{self.cache_dir}/{model_file}" | ||||||
| expected_hash = get_known_hash(self.path_or_url) | ||||||
|
|
||||||
| # Check if extracted model directory exists and verify ZIP file integrity if it still exists | ||||||
| if os.path.exists(full_model_path): | ||||||
| if os.path.exists(zip_file_path): | ||||||
| if verify_file_hash(zip_file_path, expected_hash): | ||||||
| return full_model_path | ||||||
| else: | ||||||
| # ZIP file exists but hash doesn't match, re-download | ||||||
| if expected_hash is not None: | ||||||
| warnings.warn( | ||||||
| f"Local model ZIP file hash does not match expected hash. " | ||||||
| f"This may indicate a corrupted download. Re-downloading from {self.path_or_url}" | ||||||
| ) | ||||||
| # Remove both ZIP and extracted directory for clean re-download | ||||||
| os.remove(zip_file_path) | ||||||
| import shutil | ||||||
|
||||||
| import shutil |
Copilot
AI
Jul 16, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Import statements should be placed at the top of the file rather than within function bodies. Move import shutil to the import section at the beginning of the file.
| import shutil |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's not okay to add this here. Now utils in some sense depends on models.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed! Moved all file integrity utilities from
etna.core.utilstoetna.models.utilsto maintain proper separation of concerns. Core utilities now remain general-purpose and domain-agnostic, while model-specific integrity checking stays with the models where it belongs. Commit: 073898c