# MyFSIO/app/storage.py
from __future__ import annotations
import hashlib
import json
import os
import re
import shutil
import stat
import threading
import time
import unicodedata
import uuid
from collections import OrderedDict
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, BinaryIO, Dict, Generator, List, Optional
# Platform-specific file locking
if os.name == "nt":
import msvcrt
@contextmanager
def _file_lock(file_handle) -> Generator[None, None, None]:
"""Acquire an exclusive lock on a file (Windows)."""
try:
msvcrt.locking(file_handle.fileno(), msvcrt.LK_NBLCK, 1)
yield
finally:
try:
file_handle.seek(0)
msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1)
except OSError:
pass
else:
import fcntl # type: ignore
@contextmanager
def _file_lock(file_handle) -> Generator[None, None, None]:
"""Acquire an exclusive lock on a file (Unix)."""
try:
fcntl.flock(file_handle.fileno(), fcntl.LOCK_EX)
yield
finally:
fcntl.flock(file_handle.fileno(), fcntl.LOCK_UN)
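# Usage sketch for _file_lock (illustrative only; mirrors how the multipart
# manifest and per-key locks below use it). Note the platform asymmetry: the
# Windows variant is non-blocking and raises OSError on contention, while the
# POSIX variant blocks until the lock is granted.
#
#     with lock_path.open("w") as lock_file:
#         with _file_lock(lock_file):
#             ...  # critical section: read/update a small JSON file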
WINDOWS_RESERVED_NAMES = {
"CON",
"PRN",
"AUX",
"NUL",
"COM1",
"COM2",
"COM3",
"COM4",
"COM5",
"COM6",
"COM7",
"COM8",
"COM9",
"LPT1",
"LPT2",
"LPT3",
"LPT4",
"LPT5",
"LPT6",
"LPT7",
"LPT8",
"LPT9",
}
class StorageError(RuntimeError):
"""Raised when the storage layer encounters an unrecoverable problem."""
class BucketNotFoundError(StorageError):
"""Raised when the bucket does not exist."""
class ObjectNotFoundError(StorageError):
"""Raised when the object does not exist."""
class QuotaExceededError(StorageError):
"""Raised when an operation would exceed bucket quota limits."""
def __init__(self, message: str, quota: Dict[str, Any], usage: Dict[str, int]):
super().__init__(message)
self.quota = quota
self.usage = usage
@dataclass
class ObjectMeta:
key: str
size: int
last_modified: datetime
etag: Optional[str] = None
metadata: Optional[Dict[str, str]] = None
@dataclass
class BucketMeta:
name: str
created_at: datetime
@dataclass
class ListObjectsResult:
"""Paginated result for object listing."""
objects: List[ObjectMeta]
is_truncated: bool
next_continuation_token: Optional[str]
total_count: Optional[int] = None
def _utcnow() -> datetime:
return datetime.now(timezone.utc)
def _utc_isoformat() -> str:
return _utcnow().isoformat().replace("+00:00", "Z")
class ObjectStorage:
"""Very small filesystem wrapper implementing the bare S3 primitives."""
INTERNAL_FOLDERS = {".meta", ".versions", ".multipart"}
SYSTEM_ROOT = ".myfsio.sys"
SYSTEM_BUCKETS_DIR = "buckets"
SYSTEM_MULTIPART_DIR = "multipart"
SYSTEM_TMP_DIR = "tmp"
BUCKET_META_DIR = "meta"
BUCKET_VERSIONS_DIR = "versions"
MULTIPART_MANIFEST = "manifest.json"
BUCKET_CONFIG_FILE = ".bucket.json"
DEFAULT_CACHE_TTL = 5
OBJECT_CACHE_MAX_SIZE = 100
def __init__(self, root: Path, cache_ttl: int = DEFAULT_CACHE_TTL) -> None:
self.root = Path(root)
self.root.mkdir(parents=True, exist_ok=True)
self._ensure_system_roots()
self._object_cache: OrderedDict[str, tuple[Dict[str, ObjectMeta], float]] = OrderedDict()
self._cache_lock = threading.Lock()
self._bucket_locks: Dict[str, threading.Lock] = {}
self._cache_version: Dict[str, int] = {}
self._bucket_config_cache: Dict[str, tuple[dict[str, Any], float]] = {}
self._bucket_config_cache_ttl = 30.0
self._cache_ttl = cache_ttl
def _get_bucket_lock(self, bucket_id: str) -> threading.Lock:
"""Get or create a lock for a specific bucket. Reduces global lock contention."""
with self._cache_lock:
if bucket_id not in self._bucket_locks:
self._bucket_locks[bucket_id] = threading.Lock()
return self._bucket_locks[bucket_id]
def list_buckets(self) -> List[BucketMeta]:
buckets: List[BucketMeta] = []
for bucket in sorted(self.root.iterdir()):
if bucket.is_dir() and bucket.name != self.SYSTEM_ROOT:
stat = bucket.stat()
buckets.append(
BucketMeta(
name=bucket.name,
created_at=datetime.fromtimestamp(stat.st_ctime, timezone.utc),
)
)
return buckets
def bucket_exists(self, bucket_name: str) -> bool:
return self._bucket_path(bucket_name).exists()
def _require_bucket_exists(self, bucket_path: Path) -> None:
"""Raise BucketNotFoundError if bucket does not exist."""
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
def _validate_bucket_name(self, bucket_name: str) -> None:
if len(bucket_name) < 3 or len(bucket_name) > 63:
raise StorageError("Bucket name must be between 3 and 63 characters")
if not re.match(r"^[a-z0-9][a-z0-9.-]*[a-z0-9]$", bucket_name):
raise StorageError("Bucket name must consist of lowercase letters, numbers, periods, and hyphens, and must start and end with a letter or number")
if ".." in bucket_name:
raise StorageError("Bucket name must not contain consecutive periods")
if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", bucket_name):
raise StorageError("Bucket name must not be formatted as an IP address")
def create_bucket(self, bucket_name: str) -> None:
self._validate_bucket_name(bucket_name)
bucket_path = self._bucket_path(bucket_name)
try:
bucket_path.mkdir(parents=True, exist_ok=False)
except FileExistsError as exc:
raise StorageError("Bucket already exists") from exc
self._system_bucket_root(bucket_path.name).mkdir(parents=True, exist_ok=True)
def bucket_stats(self, bucket_name: str, cache_ttl: int = 60) -> dict[str, int]:
"""Return object count and total size for the bucket (cached).
Args:
bucket_name: Name of the bucket
cache_ttl: Cache time-to-live in seconds (default 60)
"""
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
cache_path = self._system_bucket_root(bucket_name) / "stats.json"
if cache_path.exists():
try:
if time.time() - cache_path.stat().st_mtime < cache_ttl:
return json.loads(cache_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
pass
object_count = 0
total_bytes = 0
version_count = 0
version_bytes = 0
for path in bucket_path.rglob("*"):
if path.is_file():
rel = path.relative_to(bucket_path)
if not rel.parts:
continue
top_folder = rel.parts[0]
if top_folder not in self.INTERNAL_FOLDERS:
stat = path.stat()
object_count += 1
total_bytes += stat.st_size
versions_root = self._bucket_versions_root(bucket_name)
if versions_root.exists():
for path in versions_root.rglob("*.bin"):
if path.is_file():
stat = path.stat()
version_count += 1
version_bytes += stat.st_size
stats = {
"objects": object_count,
"bytes": total_bytes,
"version_count": version_count,
"version_bytes": version_bytes,
"total_objects": object_count + version_count,
"total_bytes": total_bytes + version_bytes,
}
try:
cache_path.parent.mkdir(parents=True, exist_ok=True)
cache_path.write_text(json.dumps(stats), encoding="utf-8")
except OSError:
pass
return stats
def _invalidate_bucket_stats_cache(self, bucket_id: str) -> None:
"""Invalidate the cached bucket statistics."""
cache_path = self._system_bucket_root(bucket_id) / "stats.json"
try:
cache_path.unlink(missing_ok=True)
except OSError:
pass
def delete_bucket(self, bucket_name: str) -> None:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
has_objects, has_versions, has_multipart = self._check_bucket_contents(bucket_path)
if has_objects:
raise StorageError("Bucket not empty")
if has_versions:
raise StorageError("Bucket contains archived object versions")
if has_multipart:
raise StorageError("Bucket has active multipart uploads")
self._remove_tree(bucket_path)
self._remove_tree(self._system_bucket_root(bucket_path.name))
self._remove_tree(self._multipart_bucket_root(bucket_path.name))
def list_objects(
self,
bucket_name: str,
*,
max_keys: int = 1000,
continuation_token: Optional[str] = None,
prefix: Optional[str] = None,
) -> ListObjectsResult:
"""List objects in a bucket with pagination support.
Args:
bucket_name: Name of the bucket
max_keys: Maximum number of objects to return (default 1000)
continuation_token: Token from previous request for pagination
prefix: Filter objects by key prefix
Returns:
ListObjectsResult with objects, truncation status, and continuation token
"""
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
bucket_id = bucket_path.name
object_cache = self._get_object_cache(bucket_id, bucket_path)
all_keys = sorted(object_cache.keys())
if prefix:
all_keys = [k for k in all_keys if k.startswith(prefix)]
total_count = len(all_keys)
start_index = 0
if continuation_token:
try:
import bisect
start_index = bisect.bisect_right(all_keys, continuation_token)
if start_index >= total_count:
return ListObjectsResult(
objects=[],
is_truncated=False,
next_continuation_token=None,
total_count=total_count,
)
except Exception:
pass
end_index = start_index + max_keys
keys_slice = all_keys[start_index:end_index]
is_truncated = end_index < total_count
objects: List[ObjectMeta] = []
for key in keys_slice:
obj = object_cache.get(key)
if obj:
objects.append(obj)
next_token = keys_slice[-1] if is_truncated and keys_slice else None
return ListObjectsResult(
objects=objects,
is_truncated=is_truncated,
next_continuation_token=next_token,
total_count=total_count,
)
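# Pagination sketch (illustrative only; `storage` and the "photos" bucket are
# assumptions, not part of this module):
#
#     token = None
#     while True:
#         page = storage.list_objects("photos", max_keys=500, continuation_token=token)
#         for obj in page.objects:
#             print(obj.key, obj.size)
#         if not page.is_truncated:
#             break
#         token = page.next_continuation_token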
def list_objects_all(self, bucket_name: str) -> List[ObjectMeta]:
"""List all objects in a bucket (no pagination). Use with caution for large buckets."""
result = self.list_objects(bucket_name, max_keys=100000)
return result.objects
def put_object(
self,
bucket_name: str,
object_key: str,
stream: BinaryIO,
*,
metadata: Optional[Dict[str, str]] = None,
enforce_quota: bool = True,
) -> ObjectMeta:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
bucket_id = bucket_path.name
safe_key = self._sanitize_object_key(object_key)
destination = bucket_path / safe_key
destination.parent.mkdir(parents=True, exist_ok=True)
is_overwrite = destination.exists()
existing_size = destination.stat().st_size if is_overwrite else 0
if self._is_versioning_enabled(bucket_path) and is_overwrite:
self._archive_current_version(bucket_id, safe_key, reason="overwrite")
tmp_dir = self._system_root_path() / self.SYSTEM_TMP_DIR
tmp_dir.mkdir(parents=True, exist_ok=True)
tmp_path = tmp_dir / f"{uuid.uuid4().hex}.tmp"
try:
checksum = hashlib.md5()
with tmp_path.open("wb") as target:
shutil.copyfileobj(_HashingReader(stream, checksum), target)
new_size = tmp_path.stat().st_size
if enforce_quota:
size_delta = new_size - existing_size
object_delta = 0 if is_overwrite else 1
quota_check = self.check_quota(
bucket_name,
additional_bytes=max(0, size_delta),
additional_objects=object_delta,
)
if not quota_check["allowed"]:
raise QuotaExceededError(
quota_check["message"] or "Quota exceeded",
quota_check["quota"],
quota_check["usage"],
)
shutil.move(str(tmp_path), str(destination))
finally:
try:
tmp_path.unlink(missing_ok=True)
except OSError:
pass
stat = destination.stat()
etag = checksum.hexdigest()
internal_meta = {"__etag__": etag, "__size__": str(stat.st_size)}
combined_meta = {**internal_meta, **(metadata or {})}
self._write_metadata(bucket_id, safe_key, combined_meta)
self._invalidate_bucket_stats_cache(bucket_id)
obj_meta = ObjectMeta(
key=safe_key.as_posix(),
size=stat.st_size,
last_modified=datetime.fromtimestamp(stat.st_mtime, timezone.utc),
etag=etag,
metadata=metadata,
)
self._update_object_cache_entry(bucket_id, safe_key.as_posix(), obj_meta)
return obj_meta
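# put_object sketch (illustrative only; the bucket, key, payload, and metadata
# values are assumptions):
#
#     import io
#     meta = storage.put_object(
#         "photos",
#         "2024/cat.jpg",
#         io.BytesIO(jpeg_bytes),
#         metadata={"content-type": "image/jpeg"},
#     )
#     print(meta.etag, meta.size)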
def get_object_path(self, bucket_name: str, object_key: str) -> Path:
path = self._object_path(bucket_name, object_key)
if not path.exists():
raise ObjectNotFoundError("Object not found")
return path
def get_object_metadata(self, bucket_name: str, object_key: str) -> Dict[str, str]:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
return {}
safe_key = self._sanitize_object_key(object_key)
return self._read_metadata(bucket_path.name, safe_key) or {}
def _cleanup_empty_parents(self, path: Path, stop_at: Path) -> None:
"""Remove empty parent directories up to (but not including) stop_at.
On Windows/OneDrive, directories may be locked briefly after file deletion.
This method retries with a small delay to handle that case.
"""
for parent in path.parents:
if parent == stop_at:
break
for attempt in range(3):
try:
if parent.exists() and not any(parent.iterdir()):
parent.rmdir()
break
except OSError:
# Retry with a short delay; give up after the final attempt.
if attempt < 2:
time.sleep(0.1)
continue
break
def delete_object(self, bucket_name: str, object_key: str) -> None:
bucket_path = self._bucket_path(bucket_name)
path = self._object_path(bucket_name, object_key)
if not path.exists():
return
safe_key = path.relative_to(bucket_path)
bucket_id = bucket_path.name
if self._is_versioning_enabled(bucket_path):
self._archive_current_version(bucket_id, safe_key, reason="delete")
rel = path.relative_to(bucket_path)
self._safe_unlink(path)
self._delete_metadata(bucket_id, rel)
self._invalidate_bucket_stats_cache(bucket_id)
self._update_object_cache_entry(bucket_id, safe_key.as_posix(), None)
self._cleanup_empty_parents(path, bucket_path)
def purge_object(self, bucket_name: str, object_key: str) -> None:
bucket_path = self._bucket_path(bucket_name)
target = self._object_path(bucket_name, object_key)
bucket_id = bucket_path.name
if target.exists():
rel = target.relative_to(bucket_path)
self._safe_unlink(target)
self._delete_metadata(bucket_id, rel)
else:
rel = self._sanitize_object_key(object_key)
self._delete_metadata(bucket_id, rel)
version_dir = self._version_dir(bucket_id, rel)
if version_dir.exists():
shutil.rmtree(version_dir, ignore_errors=True)
legacy_version_dir = self._legacy_version_dir(bucket_id, rel)
if legacy_version_dir.exists():
shutil.rmtree(legacy_version_dir, ignore_errors=True)
self._invalidate_bucket_stats_cache(bucket_id)
self._update_object_cache_entry(bucket_id, rel.as_posix(), None)
self._cleanup_empty_parents(target, bucket_path)
def is_versioning_enabled(self, bucket_name: str) -> bool:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
return self._is_versioning_enabled(bucket_path)
def set_bucket_versioning(self, bucket_name: str, enabled: bool) -> None:
bucket_path = self._require_bucket_path(bucket_name)
config = self._read_bucket_config(bucket_path.name)
config["versioning_enabled"] = bool(enabled)
self._write_bucket_config(bucket_path.name, config)
def get_bucket_tags(self, bucket_name: str) -> List[Dict[str, str]]:
bucket_path = self._require_bucket_path(bucket_name)
config = self._read_bucket_config(bucket_path.name)
raw_tags = config.get("tags")
if not isinstance(raw_tags, list):
return []
tags: List[Dict[str, str]] = []
for entry in raw_tags:
if not isinstance(entry, dict):
continue
key = str(entry.get("Key", "")).strip()
if not key:
continue
value = str(entry.get("Value", ""))
tags.append({"Key": key, "Value": value})
return tags
def set_bucket_tags(self, bucket_name: str, tags: Optional[List[Dict[str, str]]]) -> None:
bucket_path = self._require_bucket_path(bucket_name)
if not tags:
self._set_bucket_config_entry(bucket_path.name, "tags", None)
return
clean: List[Dict[str, str]] = []
for entry in tags:
if not isinstance(entry, dict):
continue
key = str(entry.get("Key", "")).strip()
if not key:
continue
clean.append({"Key": key, "Value": str(entry.get("Value", ""))})
self._set_bucket_config_entry(bucket_path.name, "tags", clean or None)
def get_bucket_cors(self, bucket_name: str) -> List[Dict[str, Any]]:
bucket_path = self._require_bucket_path(bucket_name)
config = self._read_bucket_config(bucket_path.name)
cors_rules = config.get("cors")
return cors_rules if isinstance(cors_rules, list) else []
def set_bucket_cors(self, bucket_name: str, rules: Optional[List[Dict[str, Any]]]) -> None:
bucket_path = self._require_bucket_path(bucket_name)
self._set_bucket_config_entry(bucket_path.name, "cors", rules or None)
def get_bucket_encryption(self, bucket_name: str) -> Dict[str, Any]:
bucket_path = self._require_bucket_path(bucket_name)
config = self._read_bucket_config(bucket_path.name)
payload = config.get("encryption")
return payload if isinstance(payload, dict) else {}
def set_bucket_encryption(self, bucket_name: str, config_payload: Optional[Dict[str, Any]]) -> None:
bucket_path = self._require_bucket_path(bucket_name)
self._set_bucket_config_entry(bucket_path.name, "encryption", config_payload or None)
def get_bucket_lifecycle(self, bucket_name: str) -> Optional[List[Dict[str, Any]]]:
"""Get lifecycle configuration for bucket."""
bucket_path = self._require_bucket_path(bucket_name)
config = self._read_bucket_config(bucket_path.name)
lifecycle = config.get("lifecycle")
return lifecycle if isinstance(lifecycle, list) else None
def set_bucket_lifecycle(self, bucket_name: str, rules: Optional[List[Dict[str, Any]]]) -> None:
"""Set lifecycle configuration for bucket."""
bucket_path = self._require_bucket_path(bucket_name)
self._set_bucket_config_entry(bucket_path.name, "lifecycle", rules)
def get_bucket_quota(self, bucket_name: str) -> Dict[str, Any]:
"""Get quota configuration for bucket.
Returns:
Dict with 'max_bytes' and 'max_objects' (None if unlimited).
"""
bucket_path = self._require_bucket_path(bucket_name)
config = self._read_bucket_config(bucket_path.name)
quota = config.get("quota")
if isinstance(quota, dict):
return {
"max_bytes": quota.get("max_bytes"),
"max_objects": quota.get("max_objects"),
}
return {"max_bytes": None, "max_objects": None}
def set_bucket_quota(
self,
bucket_name: str,
*,
max_bytes: Optional[int] = None,
max_objects: Optional[int] = None,
) -> None:
"""Set quota limits for a bucket.
Args:
bucket_name: Name of the bucket
max_bytes: Maximum total size in bytes (None to remove limit)
max_objects: Maximum number of objects (None to remove limit)
"""
bucket_path = self._require_bucket_path(bucket_name)
if max_bytes is None and max_objects is None:
self._set_bucket_config_entry(bucket_path.name, "quota", None)
return
quota: Dict[str, Any] = {}
if max_bytes is not None:
if max_bytes < 0:
raise StorageError("max_bytes must be non-negative")
quota["max_bytes"] = max_bytes
if max_objects is not None:
if max_objects < 0:
raise StorageError("max_objects must be non-negative")
quota["max_objects"] = max_objects
self._set_bucket_config_entry(bucket_path.name, "quota", quota)
def check_quota(
self,
bucket_name: str,
additional_bytes: int = 0,
additional_objects: int = 0,
) -> Dict[str, Any]:
"""Check if an operation would exceed bucket quota.
Args:
bucket_name: Name of the bucket
additional_bytes: Bytes that would be added
additional_objects: Objects that would be added
Returns:
Dict with 'allowed' (bool), 'quota' (current limits),
'usage' (current usage), and 'message' (if not allowed).
"""
quota = self.get_bucket_quota(bucket_name)
if quota.get("max_bytes") is None and quota.get("max_objects") is None:
return {
"allowed": True,
"quota": None,
"usage": None,
"message": None,
}
stats = self.bucket_stats(bucket_name)
current_bytes = stats.get("total_bytes", stats.get("bytes", 0))
current_objects = stats.get("total_objects", stats.get("objects", 0))
result = {
"allowed": True,
"quota": quota,
"usage": {
"bytes": current_bytes,
"objects": current_objects,
"version_count": stats.get("version_count", 0),
"version_bytes": stats.get("version_bytes", 0),
},
"message": None,
}
max_bytes_limit = quota.get("max_bytes")
max_objects = quota.get("max_objects")
if max_bytes_limit is not None:
projected_bytes = current_bytes + additional_bytes
if projected_bytes > max_bytes_limit:
result["allowed"] = False
result["message"] = (
f"Quota exceeded: adding {additional_bytes} bytes would result in "
f"{projected_bytes} bytes, exceeding limit of {max_bytes_limit} bytes"
)
return result
if max_objects is not None:
projected_objects = current_objects + additional_objects
if projected_objects > max_objects:
result["allowed"] = False
result["message"] = (
f"Quota exceeded: adding {additional_objects} objects would result in "
f"{projected_objects} objects, exceeding limit of {max_objects} objects"
)
return result
return result
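# Quota sketch (illustrative only; the limits shown are arbitrary example values):
#
#     storage.set_bucket_quota("photos", max_bytes=10 * 1024 * 1024, max_objects=1_000)
#     verdict = storage.check_quota("photos", additional_bytes=512_000, additional_objects=1)
#     if not verdict["allowed"]:
#         print(verdict["message"])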
def get_object_tags(self, bucket_name: str, object_key: str) -> List[Dict[str, str]]:
"""Get tags for an object."""
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
safe_key = self._sanitize_object_key(object_key)
object_path = bucket_path / safe_key
if not object_path.exists():
raise ObjectNotFoundError("Object does not exist")
for meta_file in (self._metadata_file(bucket_path.name, safe_key), self._legacy_metadata_file(bucket_path.name, safe_key)):
if not meta_file.exists():
continue
try:
payload = json.loads(meta_file.read_text(encoding="utf-8"))
tags = payload.get("tags")
if isinstance(tags, list):
return tags
return []
except (OSError, json.JSONDecodeError):
return []
return []
def set_object_tags(self, bucket_name: str, object_key: str, tags: Optional[List[Dict[str, str]]]) -> None:
"""Set tags for an object."""
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
safe_key = self._sanitize_object_key(object_key)
object_path = bucket_path / safe_key
if not object_path.exists():
raise ObjectNotFoundError("Object does not exist")
meta_file = self._metadata_file(bucket_path.name, safe_key)
existing_payload: Dict[str, Any] = {}
if meta_file.exists():
try:
existing_payload = json.loads(meta_file.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
pass
if tags:
existing_payload["tags"] = tags
else:
existing_payload.pop("tags", None)
if existing_payload.get("metadata") or existing_payload.get("tags"):
meta_file.parent.mkdir(parents=True, exist_ok=True)
meta_file.write_text(json.dumps(existing_payload), encoding="utf-8")
elif meta_file.exists():
meta_file.unlink()
parent = meta_file.parent
meta_root = self._bucket_meta_root(bucket_path.name)
while parent != meta_root and parent.exists() and not any(parent.iterdir()):
parent.rmdir()
parent = parent.parent
def delete_object_tags(self, bucket_name: str, object_key: str) -> None:
"""Delete all tags from an object."""
self.set_object_tags(bucket_name, object_key, None)
def list_object_versions(self, bucket_name: str, object_key: str) -> List[Dict[str, Any]]:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
bucket_id = bucket_path.name
safe_key = self._sanitize_object_key(object_key)
version_dir = self._version_dir(bucket_id, safe_key)
if not version_dir.exists():
version_dir = self._legacy_version_dir(bucket_id, safe_key)
if not version_dir.exists():
return []
versions: List[Dict[str, Any]] = []
for meta_file in version_dir.glob("*.json"):
try:
payload = json.loads(meta_file.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
continue
if not isinstance(payload, dict):
continue
payload.setdefault("version_id", meta_file.stem)
versions.append(payload)
versions.sort(key=lambda item: item.get("archived_at") or "1970-01-01T00:00:00Z", reverse=True)
return versions
def restore_object_version(self, bucket_name: str, object_key: str, version_id: str) -> ObjectMeta:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
bucket_id = bucket_path.name
safe_key = self._sanitize_object_key(object_key)
version_dir = self._version_dir(bucket_id, safe_key)
data_path = version_dir / f"{version_id}.bin"
meta_path = version_dir / f"{version_id}.json"
if not data_path.exists() or not meta_path.exists():
raise StorageError("Version not found")
try:
payload = json.loads(meta_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
payload = {}
metadata = payload.get("metadata") if isinstance(payload, dict) else {}
if not isinstance(metadata, dict):
metadata = {}
destination = bucket_path / safe_key
if self._is_versioning_enabled(bucket_path) and destination.exists():
self._archive_current_version(bucket_id, safe_key, reason="restore-overwrite")
destination.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(data_path, destination)
if metadata:
self._write_metadata(bucket_id, safe_key, metadata)
else:
self._delete_metadata(bucket_id, safe_key)
stat = destination.stat()
self._invalidate_bucket_stats_cache(bucket_id)
return ObjectMeta(
key=safe_key.as_posix(),
size=stat.st_size,
last_modified=datetime.fromtimestamp(stat.st_mtime, timezone.utc),
etag=self._compute_etag(destination),
metadata=metadata or None,
)
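# Versioning sketch (illustrative only): with versioning enabled, overwrites and
# deletes archive the previous copy, which can later be listed and restored.
#
#     storage.set_bucket_versioning("photos", True)
#     storage.put_object("photos", "notes.txt", io.BytesIO(b"v2"))  # archives the old copy
#     versions = storage.list_object_versions("photos", "notes.txt")
#     if versions:
#         storage.restore_object_version("photos", "notes.txt", versions[0]["version_id"])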
def delete_object_version(self, bucket_name: str, object_key: str, version_id: str) -> None:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
bucket_id = bucket_path.name
safe_key = self._sanitize_object_key(object_key)
version_dir = self._version_dir(bucket_id, safe_key)
data_path = version_dir / f"{version_id}.bin"
meta_path = version_dir / f"{version_id}.json"
if not data_path.exists() and not meta_path.exists():
legacy_version_dir = self._legacy_version_dir(bucket_id, safe_key)
data_path = legacy_version_dir / f"{version_id}.bin"
meta_path = legacy_version_dir / f"{version_id}.json"
if not data_path.exists() and not meta_path.exists():
raise StorageError(f"Version {version_id} not found")
if data_path.exists():
data_path.unlink()
if meta_path.exists():
meta_path.unlink()
parent = data_path.parent
if parent.exists() and not any(parent.iterdir()):
parent.rmdir()
def list_orphaned_objects(self, bucket_name: str) -> List[Dict[str, Any]]:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
bucket_id = bucket_path.name
version_roots = [self._bucket_versions_root(bucket_id), self._legacy_versions_root(bucket_id)]
if not any(root.exists() for root in version_roots):
return []
aggregated: Dict[str, Dict[str, Any]] = {}
skipped: set[str] = set()
for version_root in version_roots:
if not version_root.exists():
continue
for meta_file in version_root.glob("**/*.json"):
if not meta_file.is_file():
continue
rel = meta_file.parent.relative_to(version_root)
rel_key = rel.as_posix()
if rel_key in skipped:
continue
object_path = bucket_path / rel
if object_path.exists():
skipped.add(rel_key)
continue
try:
payload = json.loads(meta_file.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
payload = {}
version_id = payload.get("version_id") or meta_file.stem
archived_at = payload.get("archived_at") or "1970-01-01T00:00:00Z"
size = int(payload.get("size") or 0)
reason = payload.get("reason") or "update"
record = aggregated.setdefault(
rel_key,
{
"key": rel_key,
"versions": 0,
"total_size": 0,
"latest": None,
"_latest_sort": None,
},
)
record["versions"] += 1
record["total_size"] += size
candidate = {
"version_id": version_id,
"archived_at": archived_at,
"size": size,
"reason": reason,
}
sort_key = (
archived_at,
meta_file.stat().st_mtime,
)
current_sort = record.get("_latest_sort")
if current_sort is None or sort_key > current_sort:
record["_latest_sort"] = sort_key
record["latest"] = candidate
for record in aggregated.values():
record.pop("_latest_sort", None)
return sorted(aggregated.values(), key=lambda item: item["key"])
def initiate_multipart_upload(
self,
bucket_name: str,
object_key: str,
*,
metadata: Optional[Dict[str, str]] = None,
) -> str:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
bucket_id = bucket_path.name
safe_key = self._sanitize_object_key(object_key)
upload_id = uuid.uuid4().hex
upload_root = self._multipart_dir(bucket_id, upload_id)
upload_root.mkdir(parents=True, exist_ok=False)
manifest = {
"upload_id": upload_id,
"object_key": safe_key.as_posix(),
"metadata": self._normalize_metadata(metadata),
"parts": {},
"created_at": _utc_isoformat(),
}
self._write_multipart_manifest(upload_root, manifest)
return upload_id
def upload_multipart_part(
self,
bucket_name: str,
upload_id: str,
part_number: int,
stream: BinaryIO,
) -> str:
"""Upload a part for a multipart upload.
Uses file locking to safely update the manifest and handle concurrent uploads.
"""
if part_number < 1 or part_number > 10000:
raise StorageError("part_number must be between 1 and 10000")
bucket_path = self._bucket_path(bucket_name)
upload_root = self._multipart_dir(bucket_path.name, upload_id)
if not upload_root.exists():
upload_root = self._legacy_multipart_dir(bucket_path.name, upload_id)
if not upload_root.exists():
raise StorageError("Multipart upload not found")
checksum = hashlib.md5()
part_filename = f"part-{part_number:05d}.part"
part_path = upload_root / part_filename
temp_path = upload_root / f".{part_filename}.tmp"
try:
with temp_path.open("wb") as target:
shutil.copyfileobj(_HashingReader(stream, checksum), target)
temp_path.replace(part_path)
except OSError:
try:
temp_path.unlink(missing_ok=True)
except OSError:
pass
raise
record = {
"etag": checksum.hexdigest(),
"size": part_path.stat().st_size,
"filename": part_filename,
}
manifest_path = upload_root / self.MULTIPART_MANIFEST
lock_path = upload_root / ".manifest.lock"
max_retries = 3
for attempt in range(max_retries):
try:
with lock_path.open("w") as lock_file:
with _file_lock(lock_file):
try:
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError) as exc:
if attempt < max_retries - 1:
time.sleep(0.1 * (attempt + 1))
continue
raise StorageError("Multipart manifest unreadable") from exc
parts = manifest.setdefault("parts", {})
parts[str(part_number)] = record
manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
break
except OSError as exc:
if attempt < max_retries - 1:
time.sleep(0.1 * (attempt + 1))
continue
raise StorageError(f"Failed to update multipart manifest: {exc}") from exc
return record["etag"]
def complete_multipart_upload(
self,
bucket_name: str,
upload_id: str,
ordered_parts: List[Dict[str, Any]],
enforce_quota: bool = True,
) -> ObjectMeta:
if not ordered_parts:
raise StorageError("parts list required")
bucket_path = self._bucket_path(bucket_name)
bucket_id = bucket_path.name
manifest, upload_root = self._load_multipart_manifest(bucket_id, upload_id)
parts_map = manifest.get("parts") or {}
if not parts_map:
raise StorageError("No uploaded parts found")
validated: List[tuple[int, Dict[str, Any]]] = []
total_size = 0
for part in ordered_parts:
raw_number = part.get("part_number")
if raw_number is None:
raw_number = part.get("PartNumber")
try:
number = int(raw_number)
except (TypeError, ValueError) as exc:
raise StorageError("Each part must include part_number") from exc
if number < 1:
raise StorageError("part numbers must be >= 1")
key = str(number)
record = parts_map.get(key)
if not record:
raise StorageError(f"Part {number} missing from upload")
raw_etag = part.get("etag", part.get("ETag", ""))
supplied_etag = str(raw_etag).strip() or record.get("etag")
if supplied_etag and record.get("etag") and supplied_etag.strip('"') != record["etag"]:
raise StorageError(f"ETag mismatch for part {number}")
validated.append((number, record))
total_size += record.get("size", 0)
validated.sort(key=lambda entry: entry[0])
safe_key = self._sanitize_object_key(manifest["object_key"])
destination = bucket_path / safe_key
is_overwrite = destination.exists()
existing_size = destination.stat().st_size if is_overwrite else 0
if enforce_quota:
size_delta = total_size - existing_size
object_delta = 0 if is_overwrite else 1
quota_check = self.check_quota(
bucket_name,
additional_bytes=max(0, size_delta),
additional_objects=object_delta,
)
if not quota_check["allowed"]:
raise QuotaExceededError(
quota_check["message"] or "Quota exceeded",
quota_check["quota"],
quota_check["usage"],
)
destination.parent.mkdir(parents=True, exist_ok=True)
lock_file_path = self._system_bucket_root(bucket_id) / "locks" / f"{safe_key.as_posix().replace('/', '_')}.lock"
lock_file_path.parent.mkdir(parents=True, exist_ok=True)
try:
with lock_file_path.open("w") as lock_file:
with _file_lock(lock_file):
if self._is_versioning_enabled(bucket_path) and destination.exists():
self._archive_current_version(bucket_id, safe_key, reason="overwrite")
checksum = hashlib.md5()
with destination.open("wb") as target:
for _, record in validated:
part_path = upload_root / record["filename"]
if not part_path.exists():
raise StorageError(f"Missing part file {record['filename']}")
with part_path.open("rb") as chunk:
while True:
data = chunk.read(1024 * 1024)
if not data:
break
checksum.update(data)
target.write(data)
except BlockingIOError:
raise StorageError("Another upload to this key is in progress")
finally:
try:
lock_file_path.unlink(missing_ok=True)
except OSError:
pass
shutil.rmtree(upload_root, ignore_errors=True)
self._invalidate_bucket_stats_cache(bucket_id)
stat = destination.stat()
etag = checksum.hexdigest()
metadata = manifest.get("metadata")
internal_meta = {"__etag__": etag, "__size__": str(stat.st_size)}
combined_meta = {**internal_meta, **(metadata or {})}
self._write_metadata(bucket_id, safe_key, combined_meta)
obj_meta = ObjectMeta(
key=safe_key.as_posix(),
size=stat.st_size,
last_modified=datetime.fromtimestamp(stat.st_mtime, timezone.utc),
etag=etag,
metadata=metadata,
)
self._update_object_cache_entry(bucket_id, safe_key.as_posix(), obj_meta)
return obj_meta
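# Multipart sketch (illustrative only; part contents and sizes are assumptions):
#
#     upload_id = storage.initiate_multipart_upload("photos", "big.bin")
#     etag1 = storage.upload_multipart_part("photos", upload_id, 1, io.BytesIO(part1_bytes))
#     etag2 = storage.upload_multipart_part("photos", upload_id, 2, io.BytesIO(part2_bytes))
#     storage.complete_multipart_upload(
#         "photos",
#         upload_id,
#         [{"part_number": 1, "etag": etag1}, {"part_number": 2, "etag": etag2}],
#     )
#     # or, to discard the parts: storage.abort_multipart_upload("photos", upload_id)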
def abort_multipart_upload(self, bucket_name: str, upload_id: str) -> None:
bucket_path = self._bucket_path(bucket_name)
upload_root = self._multipart_dir(bucket_path.name, upload_id)
if upload_root.exists():
shutil.rmtree(upload_root, ignore_errors=True)
return
legacy_root = self._legacy_multipart_dir(bucket_path.name, upload_id)
if legacy_root.exists():
shutil.rmtree(legacy_root, ignore_errors=True)
def list_multipart_parts(self, bucket_name: str, upload_id: str) -> List[Dict[str, Any]]:
"""List uploaded parts for a multipart upload."""
bucket_path = self._bucket_path(bucket_name)
manifest, upload_root = self._load_multipart_manifest(bucket_path.name, upload_id)
parts = []
parts_map = manifest.get("parts", {})
for part_num_str, record in parts_map.items():
part_num = int(part_num_str)
part_filename = record.get("filename")
if not part_filename:
continue
part_path = upload_root / part_filename
if not part_path.exists():
continue
stat = part_path.stat()
parts.append({
"PartNumber": part_num,
"Size": stat.st_size,
"ETag": record.get("etag"),
"LastModified": datetime.fromtimestamp(stat.st_mtime, timezone.utc)
})
parts.sort(key=lambda x: x["PartNumber"])
return parts
def list_multipart_uploads(self, bucket_name: str, include_orphaned: bool = False) -> List[Dict[str, Any]]:
"""List all active multipart uploads for a bucket.
Args:
bucket_name: The bucket to list uploads for.
include_orphaned: If True, also include upload directories that have
files but no valid manifest.json (orphaned/interrupted uploads).
"""
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
bucket_id = bucket_path.name
uploads = []
for multipart_root in (
self._multipart_bucket_root(bucket_id),
self._legacy_multipart_bucket_root(bucket_id),
):
if not multipart_root.exists():
continue
for upload_dir in multipart_root.iterdir():
if not upload_dir.is_dir():
continue
manifest_path = upload_dir / self.MULTIPART_MANIFEST
if manifest_path.exists():
try:
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
uploads.append({
"upload_id": manifest.get("upload_id", upload_dir.name),
"object_key": manifest.get("object_key", ""),
"created_at": manifest.get("created_at", ""),
})
except (OSError, json.JSONDecodeError):
if include_orphaned:
has_files = any(upload_dir.rglob("*"))
if has_files:
uploads.append({
"upload_id": upload_dir.name,
"object_key": "(unknown)",
"created_at": "",
"orphaned": True,
})
elif include_orphaned:
has_files = any(f.is_file() for f in upload_dir.rglob("*"))
if has_files:
uploads.append({
"upload_id": upload_dir.name,
"object_key": "(unknown)",
"created_at": "",
"orphaned": True,
})
return uploads
def _bucket_path(self, bucket_name: str) -> Path:
safe_name = self._sanitize_bucket_name(bucket_name)
return self.root / safe_name
def _require_bucket_path(self, bucket_name: str) -> Path:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise BucketNotFoundError("Bucket does not exist")
return bucket_path
def _object_path(self, bucket_name: str, object_key: str) -> Path:
bucket_path = self._bucket_path(bucket_name)
safe_key = self._sanitize_object_key(object_key)
return bucket_path / safe_key
def _system_root_path(self) -> Path:
return self.root / self.SYSTEM_ROOT
def _system_buckets_root(self) -> Path:
return self._system_root_path() / self.SYSTEM_BUCKETS_DIR
def _system_bucket_root(self, bucket_name: str) -> Path:
return self._system_buckets_root() / bucket_name
def _bucket_meta_root(self, bucket_name: str) -> Path:
return self._system_bucket_root(bucket_name) / self.BUCKET_META_DIR
def _bucket_versions_root(self, bucket_name: str) -> Path:
return self._system_bucket_root(bucket_name) / self.BUCKET_VERSIONS_DIR
def _multipart_root(self) -> Path:
return self._system_root_path() / self.SYSTEM_MULTIPART_DIR
def _multipart_bucket_root(self, bucket_name: str) -> Path:
return self._multipart_root() / bucket_name
def _legacy_metadata_file(self, bucket_name: str, key: Path) -> Path:
meta_root = self._legacy_meta_root(bucket_name)
meta_rel = Path(key.as_posix() + ".meta.json")
return meta_root / meta_rel
def _legacy_meta_root(self, bucket_name: str) -> Path:
return self._bucket_path(bucket_name) / ".meta"
def _legacy_versions_root(self, bucket_name: str) -> Path:
return self._bucket_path(bucket_name) / ".versions"
def _legacy_version_dir(self, bucket_name: str, key: Path) -> Path:
return self._legacy_versions_root(bucket_name) / key
def _legacy_multipart_bucket_root(self, bucket_name: str) -> Path:
return self._bucket_path(bucket_name) / ".multipart"
def _legacy_multipart_dir(self, bucket_name: str, upload_id: str) -> Path:
return self._legacy_multipart_bucket_root(bucket_name) / upload_id
def _fast_list_keys(self, bucket_path: Path) -> List[str]:
"""Fast directory walk using os.scandir instead of pathlib.rglob.
This is significantly faster for large directories (10K+ files).
Returns just the keys (for backward compatibility).
"""
return list(self._build_object_cache(bucket_path).keys())
def _build_object_cache(self, bucket_path: Path) -> Dict[str, ObjectMeta]:
"""Build a complete object metadata cache for a bucket.
Uses os.scandir for fast directory walking and a persistent etag index.
"""
from concurrent.futures import ThreadPoolExecutor
bucket_id = bucket_path.name
objects: Dict[str, ObjectMeta] = {}
bucket_str = str(bucket_path)
bucket_len = len(bucket_str) + 1
etag_index_path = self._system_bucket_root(bucket_id) / "etag_index.json"
meta_cache: Dict[str, str] = {}
index_mtime: float = 0
if etag_index_path.exists():
try:
index_mtime = etag_index_path.stat().st_mtime
with open(etag_index_path, 'r', encoding='utf-8') as f:
meta_cache = json.load(f)
except (OSError, json.JSONDecodeError):
meta_cache = {}
meta_root = self._bucket_meta_root(bucket_id)
needs_rebuild = False
if meta_root.exists() and index_mtime > 0:
def check_newer(dir_path: str) -> bool:
try:
with os.scandir(dir_path) as it:
for entry in it:
if entry.is_dir(follow_symlinks=False):
if check_newer(entry.path):
return True
elif entry.is_file(follow_symlinks=False) and entry.name.endswith('.meta.json'):
if entry.stat().st_mtime > index_mtime:
return True
except OSError:
pass
return False
needs_rebuild = check_newer(str(meta_root))
elif not meta_cache:
needs_rebuild = True
if needs_rebuild and meta_root.exists():
meta_str = str(meta_root)
meta_len = len(meta_str) + 1
meta_files: list[tuple[str, str]] = []
def collect_meta_files(dir_path: str) -> None:
try:
with os.scandir(dir_path) as it:
for entry in it:
if entry.is_dir(follow_symlinks=False):
collect_meta_files(entry.path)
elif entry.is_file(follow_symlinks=False) and entry.name.endswith('.meta.json'):
rel = entry.path[meta_len:]
key = rel[:-10].replace(os.sep, '/')
meta_files.append((key, entry.path))
except OSError:
pass
collect_meta_files(meta_str)
def read_meta_file(item: tuple[str, str]) -> tuple[str, str | None]:
key, path = item
try:
with open(path, 'rb') as f:
content = f.read()
etag_marker = b'"__etag__"'
idx = content.find(etag_marker)
if idx != -1:
start = content.find(b'"', idx + len(etag_marker) + 1)
if start != -1:
end = content.find(b'"', start + 1)
if end != -1:
return key, content[start+1:end].decode('utf-8')
return key, None
except (OSError, UnicodeDecodeError):
return key, None
if meta_files:
meta_cache = {}
with ThreadPoolExecutor(max_workers=min(64, len(meta_files))) as executor:
for key, etag in executor.map(read_meta_file, meta_files):
if etag:
meta_cache[key] = etag
try:
etag_index_path.parent.mkdir(parents=True, exist_ok=True)
with open(etag_index_path, 'w', encoding='utf-8') as f:
json.dump(meta_cache, f)
except OSError:
pass
def scan_dir(dir_path: str) -> None:
try:
with os.scandir(dir_path) as it:
for entry in it:
if entry.is_dir(follow_symlinks=False):
rel_start = entry.path[bucket_len:].split(os.sep)[0] if len(entry.path) > bucket_len else entry.name
if rel_start in self.INTERNAL_FOLDERS:
continue
scan_dir(entry.path)
elif entry.is_file(follow_symlinks=False):
rel = entry.path[bucket_len:]
first_part = rel.split(os.sep)[0] if os.sep in rel else rel
if first_part in self.INTERNAL_FOLDERS:
continue
key = rel.replace(os.sep, '/')
try:
stat = entry.stat()
etag = meta_cache.get(key)
objects[key] = ObjectMeta(
key=key,
size=stat.st_size,
last_modified=datetime.fromtimestamp(stat.st_mtime, timezone.utc),
etag=etag,
metadata=None,
)
except OSError:
pass
except OSError:
pass
scan_dir(bucket_str)
return objects
def _get_object_cache(self, bucket_id: str, bucket_path: Path) -> Dict[str, ObjectMeta]:
"""Get cached object metadata for a bucket, refreshing if stale.
Uses LRU eviction to prevent unbounded cache growth.
Thread-safe with per-bucket locks to reduce contention.
"""
now = time.time()
with self._cache_lock:
cached = self._object_cache.get(bucket_id)
if cached:
objects, timestamp = cached
if now - timestamp < self._cache_ttl:
self._object_cache.move_to_end(bucket_id)
return objects
cache_version = self._cache_version.get(bucket_id, 0)
bucket_lock = self._get_bucket_lock(bucket_id)
with bucket_lock:
with self._cache_lock:
cached = self._object_cache.get(bucket_id)
if cached:
objects, timestamp = cached
if now - timestamp < self._cache_ttl:
self._object_cache.move_to_end(bucket_id)
return objects
objects = self._build_object_cache(bucket_path)
with self._cache_lock:
current_version = self._cache_version.get(bucket_id, 0)
if current_version != cache_version:
objects = self._build_object_cache(bucket_path)
while len(self._object_cache) >= self.OBJECT_CACHE_MAX_SIZE:
self._object_cache.popitem(last=False)
self._object_cache[bucket_id] = (objects, time.time())
self._object_cache.move_to_end(bucket_id)
return objects
def _invalidate_object_cache(self, bucket_id: str) -> None:
"""Invalidate the object cache and etag index for a bucket.
Increments version counter to signal stale reads.
"""
with self._cache_lock:
self._object_cache.pop(bucket_id, None)
self._cache_version[bucket_id] = self._cache_version.get(bucket_id, 0) + 1
etag_index_path = self._system_bucket_root(bucket_id) / "etag_index.json"
try:
etag_index_path.unlink(missing_ok=True)
except OSError:
pass
def _update_object_cache_entry(self, bucket_id: str, key: str, meta: Optional[ObjectMeta]) -> None:
"""Update a single entry in the object cache instead of invalidating the whole cache.
This is a performance optimization - lazy update instead of full invalidation.
"""
with self._cache_lock:
cached = self._object_cache.get(bucket_id)
if cached:
objects, timestamp = cached
if meta is None:
objects.pop(key, None)
else:
objects[key] = meta
def warm_cache(self, bucket_names: Optional[List[str]] = None) -> None:
"""Pre-warm the object cache for specified buckets or all buckets.
This is called on startup to ensure the first request is fast.
"""
if bucket_names is None:
bucket_names = [b.name for b in self.list_buckets()]
for bucket_name in bucket_names:
try:
bucket_path = self._bucket_path(bucket_name)
if bucket_path.exists():
self._get_object_cache(bucket_path.name, bucket_path)
except Exception:
pass
def warm_cache_async(self, bucket_names: Optional[List[str]] = None) -> threading.Thread:
"""Start cache warming in a background thread.
Returns the thread object so caller can optionally wait for it.
"""
thread = threading.Thread(
target=self.warm_cache,
args=(bucket_names,),
daemon=True,
name="cache-warmer",
)
thread.start()
return thread
def _ensure_system_roots(self) -> None:
for path in (
self._system_root_path(),
self._system_buckets_root(),
self._multipart_root(),
self._system_root_path() / self.SYSTEM_TMP_DIR,
):
path.mkdir(parents=True, exist_ok=True)
def _multipart_dir(self, bucket_name: str, upload_id: str) -> Path:
return self._multipart_bucket_root(bucket_name) / upload_id
def _version_dir(self, bucket_name: str, key: Path) -> Path:
return self._bucket_versions_root(bucket_name) / key
def _bucket_config_path(self, bucket_name: str) -> Path:
return self._system_bucket_root(bucket_name) / self.BUCKET_CONFIG_FILE
def _read_bucket_config(self, bucket_name: str) -> dict[str, Any]:
now = time.time()
cached = self._bucket_config_cache.get(bucket_name)
if cached:
config, cached_time = cached
if now - cached_time < self._bucket_config_cache_ttl:
return config.copy()
config_path = self._bucket_config_path(bucket_name)
if not config_path.exists():
self._bucket_config_cache[bucket_name] = ({}, now)
return {}
try:
data = json.loads(config_path.read_text(encoding="utf-8"))
config = data if isinstance(data, dict) else {}
self._bucket_config_cache[bucket_name] = (config, now)
return config.copy()
except (OSError, json.JSONDecodeError):
self._bucket_config_cache[bucket_name] = ({}, now)
return {}
def _write_bucket_config(self, bucket_name: str, payload: dict[str, Any]) -> None:
config_path = self._bucket_config_path(bucket_name)
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(json.dumps(payload), encoding="utf-8")
self._bucket_config_cache[bucket_name] = (payload.copy(), time.time())
def _set_bucket_config_entry(self, bucket_name: str, key: str, value: Any | None) -> None:
config = self._read_bucket_config(bucket_name)
if value is None:
config.pop(key, None)
else:
config[key] = value
self._write_bucket_config(bucket_name, config)
def _is_versioning_enabled(self, bucket_path: Path) -> bool:
config = self._read_bucket_config(bucket_path.name)
return bool(config.get("versioning_enabled"))
def _load_multipart_manifest(self, bucket_name: str, upload_id: str) -> tuple[dict[str, Any], Path]:
upload_root = self._multipart_dir(bucket_name, upload_id)
if not upload_root.exists():
upload_root = self._legacy_multipart_dir(bucket_name, upload_id)
manifest_path = upload_root / self.MULTIPART_MANIFEST
if not manifest_path.exists():
raise StorageError("Multipart upload not found")
try:
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError) as exc:
raise StorageError("Multipart manifest unreadable") from exc
return manifest, upload_root
def _write_multipart_manifest(self, upload_root: Path, manifest: dict[str, Any]) -> None:
manifest_path = upload_root / self.MULTIPART_MANIFEST
manifest_path.parent.mkdir(parents=True, exist_ok=True)
manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
def _metadata_file(self, bucket_name: str, key: Path) -> Path:
meta_root = self._bucket_meta_root(bucket_name)
meta_rel = Path(key.as_posix() + ".meta.json")
return meta_root / meta_rel
def _normalize_metadata(self, metadata: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
if not metadata:
return None
clean = {str(k).strip(): str(v) for k, v in metadata.items() if str(k).strip()}
return clean or None
def _write_metadata(self, bucket_name: str, key: Path, metadata: Dict[str, str]) -> None:
clean = self._normalize_metadata(metadata)
if not clean:
self._delete_metadata(bucket_name, key)
return
meta_file = self._metadata_file(bucket_name, key)
meta_file.parent.mkdir(parents=True, exist_ok=True)
meta_file.write_text(json.dumps({"metadata": clean}), encoding="utf-8")
def _archive_current_version(self, bucket_name: str, key: Path, *, reason: str) -> None:
bucket_path = self._bucket_path(bucket_name)
source = bucket_path / key
if not source.exists():
return
version_dir = self._version_dir(bucket_name, key)
version_dir.mkdir(parents=True, exist_ok=True)
now = _utcnow()
version_id = f"{now.strftime('%Y%m%dT%H%M%S%fZ')}-{uuid.uuid4().hex[:8]}"
data_path = version_dir / f"{version_id}.bin"
shutil.copy2(source, data_path)
metadata = self._read_metadata(bucket_name, key)
record = {
"version_id": version_id,
"key": key.as_posix(),
"size": source.stat().st_size,
"archived_at": now.isoformat().replace("+00:00", "Z"),
"etag": self._compute_etag(source),
"metadata": metadata or {},
"reason": reason,
}
manifest_path = version_dir / f"{version_id}.json"
manifest_path.write_text(json.dumps(record), encoding="utf-8")
def _read_metadata(self, bucket_name: str, key: Path) -> Dict[str, str]:
for meta_file in (self._metadata_file(bucket_name, key), self._legacy_metadata_file(bucket_name, key)):
if not meta_file.exists():
continue
try:
payload = json.loads(meta_file.read_text(encoding="utf-8"))
data = payload.get("metadata")
return data if isinstance(data, dict) else {}
except (OSError, json.JSONDecodeError):
return {}
return {}
def _safe_unlink(self, path: Path) -> None:
attempts = 3
last_error: PermissionError | None = None
for attempt in range(attempts):
try:
path.unlink()
return
except FileNotFoundError:
return
except PermissionError as exc:
last_error = exc
if os.name == "nt":
time.sleep(0.15 * (attempt + 1))
except OSError as exc:
raise StorageError(f"Unable to delete object: {exc}") from exc
message = "Object file is currently in use. Close active previews or wait and try again."
raise StorageError(message) from last_error
def _delete_metadata(self, bucket_name: str, key: Path) -> None:
locations = (
(self._metadata_file(bucket_name, key), self._bucket_meta_root(bucket_name)),
(self._legacy_metadata_file(bucket_name, key), self._legacy_meta_root(bucket_name)),
)
for meta_file, meta_root in locations:
try:
if meta_file.exists():
meta_file.unlink()
parent = meta_file.parent
while parent != meta_root and parent.exists() and not any(parent.iterdir()):
parent.rmdir()
parent = parent.parent
except OSError:
continue
def _check_bucket_contents(self, bucket_path: Path) -> tuple[bool, bool, bool]:
"""Check bucket for objects, versions, and multipart uploads in a single pass.
Returns (has_visible_objects, has_archived_versions, has_active_multipart_uploads).
Uses early exit when all three are found.
"""
has_objects = False
has_versions = False
has_multipart = False
bucket_name = bucket_path.name
for path in bucket_path.rglob("*"):
if has_objects:
break
if not path.is_file():
continue
rel = path.relative_to(bucket_path)
if rel.parts and rel.parts[0] in self.INTERNAL_FOLDERS:
continue
has_objects = True
for version_root in (
self._bucket_versions_root(bucket_name),
self._legacy_versions_root(bucket_name),
):
if has_versions:
break
if version_root.exists():
for path in version_root.rglob("*"):
if path.is_file():
has_versions = True
break
for uploads_root in (
self._multipart_bucket_root(bucket_name),
self._legacy_multipart_bucket_root(bucket_name),
):
if has_multipart:
break
if uploads_root.exists():
for path in uploads_root.rglob("*"):
if path.is_file():
has_multipart = True
break
return has_objects, has_versions, has_multipart
def _has_visible_objects(self, bucket_path: Path) -> bool:
has_objects, _, _ = self._check_bucket_contents(bucket_path)
return has_objects
def _has_archived_versions(self, bucket_path: Path) -> bool:
_, has_versions, _ = self._check_bucket_contents(bucket_path)
return has_versions
def _has_active_multipart_uploads(self, bucket_path: Path) -> bool:
_, _, has_multipart = self._check_bucket_contents(bucket_path)
return has_multipart
def _remove_tree(self, path: Path) -> None:
if not path.exists():
return
def _handle_error(func, target_path, exc_info):
try:
os.chmod(target_path, stat.S_IRWXU)
func(target_path)
except Exception as exc:
raise StorageError(f"Unable to delete bucket contents: {exc}") from exc
try:
shutil.rmtree(path, onerror=_handle_error)
except FileNotFoundError:
return
except PermissionError as exc:
raise StorageError("Bucket in use. Close open files and try again") from exc
@staticmethod
def _sanitize_bucket_name(bucket_name: str) -> str:
if not bucket_name:
raise StorageError("Bucket name required")
name = bucket_name.lower()
if len(name) < 3 or len(name) > 63:
raise StorageError("Bucket name must be between 3 and 63 characters")
if name.startswith("-") or name.endswith("-"):
raise StorageError("Bucket name cannot start or end with a hyphen")
if ".." in name:
raise StorageError("Bucket name cannot contain consecutive periods")
if name.startswith("xn--"):
raise StorageError("Bucket name cannot start with 'xn--'")
if re.fullmatch(r"\d+\.\d+\.\d+\.\d+", name):
raise StorageError("Bucket name cannot be formatted like an IP address")
if not re.fullmatch(r"[a-z0-9][a-z0-9.-]+[a-z0-9]", name):
raise StorageError("Bucket name can contain lowercase letters, numbers, dots, and hyphens")
return name
@staticmethod
def _sanitize_object_key(object_key: str) -> Path:
if not object_key:
raise StorageError("Object key required")
if len(object_key.encode("utf-8")) > 1024:
raise StorageError("Object key exceeds maximum length of 1024 bytes")
if "\x00" in object_key:
raise StorageError("Object key contains null bytes")
if object_key.startswith(("/", "\\")):
raise StorageError("Object key cannot start with a slash")
object_key = unicodedata.normalize("NFC", object_key)
candidate = Path(object_key)
if ".." in candidate.parts:
raise StorageError("Object key contains parent directory references")
if candidate.is_absolute():
raise StorageError("Absolute object keys are not allowed")
if getattr(candidate, "drive", ""):
raise StorageError("Object key cannot include a drive letter")
parts = []
for part in candidate.parts:
if part in ("", ".", ".."):
raise StorageError("Object key contains invalid segments")
if any(ord(ch) < 32 for ch in part):
raise StorageError("Object key contains control characters")
if os.name == "nt":
if any(ch in part for ch in "<>:\"/\\|?*"):
raise StorageError("Object key contains characters not supported on Windows filesystems")
if part.endswith((" ", ".")):
raise StorageError("Object key segments cannot end with spaces or periods on Windows")
# Windows reserves these names even when an extension follows (e.g. "CON.txt").
stem = part.split(".", 1)[0].rstrip(" ").upper()
if stem in WINDOWS_RESERVED_NAMES:
raise StorageError(f"Invalid filename segment: {part}")
parts.append(part)
if parts:
top_level = parts[0]
if top_level in ObjectStorage.INTERNAL_FOLDERS or top_level == ObjectStorage.SYSTEM_ROOT:
raise StorageError("Object key uses a reserved prefix")
return Path(*parts)
@staticmethod
def _compute_etag(path: Path) -> str:
checksum = hashlib.md5()
with path.open("rb") as handle:
for chunk in iter(lambda: handle.read(8192), b""):
checksum.update(chunk)
return checksum.hexdigest()
class _HashingReader:
"""Wraps a binary stream, updating the checksum as it is read."""
def __init__(self, stream: BinaryIO, checksum: Any) -> None:
self.stream = stream
self.checksum = checksum
def read(self, size: int = -1) -> bytes:
data = self.stream.read(size)
if data:
self.checksum.update(data)
return data
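if __name__ == "__main__":
    # Minimal smoke test (illustrative only): stores, lists, and removes a
    # single object in a throwaway directory.
    import io
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        storage = ObjectStorage(Path(tmp))
        storage.create_bucket("demo-bucket")
        meta = storage.put_object("demo-bucket", "hello.txt", io.BytesIO(b"hello world"))
        print("stored", meta.key, meta.size, meta.etag)
        print("objects:", [obj.key for obj in storage.list_objects("demo-bucket").objects])
        storage.delete_object("demo-bucket", "hello.txt")
        storage.delete_bucket("demo-bucket")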