23 Commits

Author SHA1 Message Date
4d90ead816 Merge pull request 'Fix incorrect Upgrading & Updates section in Docs' (#24) from next into main
Reviewed-on: #24
2026-02-26 09:50:17 +00:00
b37a51ed1d MyFSIO v0.3.1 Release
Reviewed-on: #23
2026-02-26 09:42:37 +00:00
0462a7b62e MyFSIO v0.3.0 Release
Reviewed-on: #22
2026-02-22 10:22:35 +00:00
52660570c1 Merge pull request 'MyFSIO v0.2.9 Release' (#21) from next into main
Reviewed-on: #21
2026-02-15 14:24:14 +00:00
35f61313e0 MyFSIO v0.2.8 Release
Reviewed-on: #20
2026-02-10 14:16:22 +00:00
c470cfb576 MyFSIO v0.2.7 Release
Reviewed-on: #19
2026-02-09 12:22:37 +00:00
jun
d96955deee MyFSIO v0.2.6 Release
Reviewed-on: #18
2026-02-05 16:18:03 +00:00
85181f0be6 Merge pull request 'MyFSIO v0.2.5 Release' (#17) from next into main
Reviewed-on: #17
2026-02-02 05:32:02 +00:00
d5ca7a8be1 Merge pull request 'MyFSIO v0.2.4 Release' (#16) from next into main
Reviewed-on: #16
2026-02-01 10:27:11 +00:00
476dc79e42 MyFSIO v0.2.3 Release
Reviewed-on: #15
2026-01-25 06:05:53 +00:00
bb6590fc5e Merge pull request 'MyFSIO v0.2.2 Release' (#14) from next into main
Reviewed-on: #14
2026-01-19 07:12:15 +00:00
899db3421b Merge pull request 'MyFSIO v0.2.1 Release' (#13) from next into main
Reviewed-on: #13
2026-01-12 08:03:29 +00:00
caf01d6ada Merge pull request 'MyFSIO v0.2.0 Release' (#12) from next into main
Reviewed-on: #12
2026-01-05 15:48:03 +00:00
bb366cb4cd Merge pull request 'MyFSIO v0.1.9 Release' (#10) from next into main
Reviewed-on: #10
2025-12-29 06:49:48 +00:00
a2745ff2ee Merge pull request 'MyFSIO v0.1.8 Release' (#9) from next into main
Reviewed-on: #9
2025-12-23 06:01:32 +00:00
28cb656d94 Merge pull request 'MyFSIO v0.1.7 Release' (#8) from next into main
Reviewed-on: #8
2025-12-22 03:10:35 +00:00
3c44152fc6 Merge pull request 'MyFSIO v0.1.6 Release' (#7) from next into main
Reviewed-on: #7
2025-12-21 06:30:21 +00:00
397515edce Merge pull request 'MyFSIO v0.1.5 Release' (#6) from next into main
Reviewed-on: #6
2025-12-13 15:41:03 +00:00
980fced7e4 Merge pull request 'MyFSIO v0.1.4 Release' (#5) from next into main
Reviewed-on: #5
2025-12-13 08:22:43 +00:00
bae5009ec4 Merge pull request 'Release v0.1.3' (#4) from next into main
Reviewed-on: #4
2025-12-03 04:14:57 +00:00
233780617f Merge pull request 'Release V0.1.2' (#3) from next into main
Reviewed-on: #3
2025-11-26 04:59:15 +00:00
fd8fb21517 Merge pull request 'Prepare for binary release' (#2) from next into main
Reviewed-on: #2
2025-11-22 12:33:38 +00:00
c6cbe822e1 Merge pull request 'Release v0.1.1' (#1) from next into main
Reviewed-on: #1
2025-11-22 12:31:27 +00:00
5 changed files with 137 additions and 1073 deletions

View File

@@ -293,7 +293,9 @@ def _verify_sigv4_header(req: Any, auth_header: str) -> Principal | None:
raise IamError("Required headers not signed")
canonical_uri = _get_canonical_uri(req)
payload_hash = req.headers.get("X-Amz-Content-Sha256") or "UNSIGNED-PAYLOAD"
payload_hash = req.headers.get("X-Amz-Content-Sha256")
if not payload_hash:
payload_hash = hashlib.sha256(req.get_data()).hexdigest()
if _HAS_RUST:
query_params = list(req.args.items(multi=True))
@@ -303,10 +305,16 @@ def _verify_sigv4_header(req: Any, auth_header: str) -> Principal | None:
header_values, payload_hash, amz_date, date_stamp, region,
service, secret_key, signature,
):
if current_app.config.get("DEBUG_SIGV4"):
logger.warning("SigV4 signature mismatch for %s %s", req.method, req.path)
raise IamError("SignatureDoesNotMatch")
else:
method = req.method
query_args = sorted(req.args.items(multi=True), key=lambda x: (x[0], x[1]))
query_args = []
for key, value in req.args.items(multi=True):
query_args.append((key, value))
query_args.sort(key=lambda x: (x[0], x[1]))
canonical_query_parts = []
for k, v in query_args:
canonical_query_parts.append(f"{quote(k, safe='-_.~')}={quote(v, safe='-_.~')}")
@@ -331,6 +339,8 @@ def _verify_sigv4_header(req: Any, auth_header: str) -> Principal | None:
string_to_sign = f"AWS4-HMAC-SHA256\n{amz_date}\n{credential_scope}\n{hashlib.sha256(canonical_request.encode('utf-8')).hexdigest()}"
calculated_signature = hmac.new(signing_key, string_to_sign.encode("utf-8"), hashlib.sha256).hexdigest()
if not hmac.compare_digest(calculated_signature, signature):
if current_app.config.get("DEBUG_SIGV4"):
logger.warning("SigV4 signature mismatch for %s %s", method, req.path)
raise IamError("SignatureDoesNotMatch")
session_token = req.headers.get("X-Amz-Security-Token")
@@ -672,7 +682,7 @@ def _extract_request_metadata() -> Dict[str, str]:
for header, value in request.headers.items():
if header.lower().startswith("x-amz-meta-"):
key = header[11:]
if key and not (key.startswith("__") and key.endswith("__")):
if key:
metadata[key] = value
return metadata
@@ -1029,8 +1039,6 @@ def _apply_object_headers(
response.headers["ETag"] = f'"{etag}"'
response.headers["Accept-Ranges"] = "bytes"
for key, value in (metadata or {}).items():
if key.startswith("__") and key.endswith("__"):
continue
safe_value = _sanitize_header_value(str(value))
response.headers[f"X-Amz-Meta-{key}"] = safe_value
@@ -2459,7 +2467,7 @@ def _post_object(bucket_name: str) -> Response:
for field_name, value in request.form.items():
if field_name.lower().startswith("x-amz-meta-"):
key = field_name[11:]
if key and not (key.startswith("__") and key.endswith("__")):
if key:
metadata[key] = value
try:
meta = storage.put_object(bucket_name, object_key, file.stream, metadata=metadata or None)
@@ -3437,7 +3445,7 @@ def _copy_object(dest_bucket: str, dest_key: str, copy_source: str) -> Response:
if validation_error:
return _error_response("InvalidArgument", validation_error, 400)
else:
metadata = {k: v for k, v in source_metadata.items() if not (k.startswith("__") and k.endswith("__"))}
metadata = source_metadata
try:
with source_path.open("rb") as stream:

View File

@@ -16,7 +16,7 @@ from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path, PurePosixPath
from pathlib import Path
from typing import Any, BinaryIO, Dict, Generator, List, Optional
try:
@@ -292,43 +292,37 @@ class ObjectStorage:
bucket_str = str(bucket_path)
try:
if _HAS_RUST:
versions_root = str(self._bucket_versions_root(bucket_name))
object_count, total_bytes, version_count, version_bytes = _rc.bucket_stats_scan(
bucket_str, versions_root
)
else:
stack = [bucket_str]
while stack:
current = stack.pop()
stack = [bucket_str]
while stack:
current = stack.pop()
try:
with os.scandir(current) as it:
for entry in it:
if current == bucket_str and entry.name in internal:
continue
if entry.is_dir(follow_symlinks=False):
stack.append(entry.path)
elif entry.is_file(follow_symlinks=False):
object_count += 1
total_bytes += entry.stat(follow_symlinks=False).st_size
except PermissionError:
continue
versions_root = self._bucket_versions_root(bucket_name)
if versions_root.exists():
v_stack = [str(versions_root)]
while v_stack:
v_current = v_stack.pop()
try:
with os.scandir(current) as it:
with os.scandir(v_current) as it:
for entry in it:
if current == bucket_str and entry.name in internal:
continue
if entry.is_dir(follow_symlinks=False):
stack.append(entry.path)
elif entry.is_file(follow_symlinks=False):
object_count += 1
total_bytes += entry.stat(follow_symlinks=False).st_size
v_stack.append(entry.path)
elif entry.is_file(follow_symlinks=False) and entry.name.endswith(".bin"):
version_count += 1
version_bytes += entry.stat(follow_symlinks=False).st_size
except PermissionError:
continue
versions_root = self._bucket_versions_root(bucket_name)
if versions_root.exists():
v_stack = [str(versions_root)]
while v_stack:
v_current = v_stack.pop()
try:
with os.scandir(v_current) as it:
for entry in it:
if entry.is_dir(follow_symlinks=False):
v_stack.append(entry.path)
elif entry.is_file(follow_symlinks=False) and entry.name.endswith(".bin"):
version_count += 1
version_bytes += entry.stat(follow_symlinks=False).st_size
except PermissionError:
continue
except OSError:
if cached_stats is not None:
return cached_stats
@@ -565,69 +559,47 @@ class ObjectStorage:
entries_files: list[tuple[str, int, float, Optional[str]]] = []
entries_dirs: list[str] = []
if _HAS_RUST:
try:
raw = _rc.shallow_scan(str(target_dir), prefix, json.dumps(meta_cache))
entries_files = []
for key, size, mtime, etag in raw["files"]:
if etag is None:
safe_key = PurePosixPath(key)
meta = self._read_metadata(bucket_id, Path(safe_key))
etag = meta.get("__etag__") if meta else None
entries_files.append((key, size, mtime, etag))
entries_dirs = raw["dirs"]
all_items = raw["merged_keys"]
except OSError:
return ShallowListResult(
objects=[], common_prefixes=[],
is_truncated=False, next_continuation_token=None,
)
else:
try:
with os.scandir(str(target_dir)) as it:
for entry in it:
name = entry.name
if name in self.INTERNAL_FOLDERS:
continue
if entry.is_dir(follow_symlinks=False):
cp = prefix + name + delimiter
entries_dirs.append(cp)
elif entry.is_file(follow_symlinks=False):
key = prefix + name
try:
st = entry.stat()
etag = meta_cache.get(key)
if etag is None:
safe_key = PurePosixPath(key)
meta = self._read_metadata(bucket_id, Path(safe_key))
etag = meta.get("__etag__") if meta else None
entries_files.append((key, st.st_size, st.st_mtime, etag))
except OSError:
pass
except OSError:
return ShallowListResult(
objects=[], common_prefixes=[],
is_truncated=False, next_continuation_token=None,
)
try:
with os.scandir(str(target_dir)) as it:
for entry in it:
name = entry.name
if name in self.INTERNAL_FOLDERS:
continue
if entry.is_dir(follow_symlinks=False):
cp = prefix + name + delimiter
entries_dirs.append(cp)
elif entry.is_file(follow_symlinks=False):
key = prefix + name
try:
st = entry.stat()
etag = meta_cache.get(key)
entries_files.append((key, st.st_size, st.st_mtime, etag))
except OSError:
pass
except OSError:
return ShallowListResult(
objects=[], common_prefixes=[],
is_truncated=False, next_continuation_token=None,
)
entries_dirs.sort()
entries_files.sort(key=lambda x: x[0])
entries_dirs.sort()
entries_files.sort(key=lambda x: x[0])
all_items: list[tuple[str, bool]] = []
fi, di = 0, 0
while fi < len(entries_files) and di < len(entries_dirs):
if entries_files[fi][0] <= entries_dirs[di]:
all_items.append((entries_files[fi][0], False))
fi += 1
else:
all_items.append((entries_dirs[di], True))
di += 1
while fi < len(entries_files):
all_items: list[tuple[str, bool]] = []
fi, di = 0, 0
while fi < len(entries_files) and di < len(entries_dirs):
if entries_files[fi][0] <= entries_dirs[di]:
all_items.append((entries_files[fi][0], False))
fi += 1
while di < len(entries_dirs):
else:
all_items.append((entries_dirs[di], True))
di += 1
while fi < len(entries_files):
all_items.append((entries_files[fi][0], False))
fi += 1
while di < len(entries_dirs):
all_items.append((entries_dirs[di], True))
di += 1
files_map = {e[0]: e for e in entries_files}
@@ -742,22 +714,6 @@ class ObjectStorage:
else:
search_root = bucket_path
if _HAS_RUST:
raw = _rc.search_objects_scan(
str(bucket_path), str(search_root), query, limit
)
results = [
{
"key": k,
"size": s,
"last_modified": datetime.fromtimestamp(
m, tz=timezone.utc
).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
}
for k, s, m in raw["results"]
]
return {"results": results, "truncated": raw["truncated"]}
query_lower = query.lower()
results: list[Dict[str, Any]] = []
internal = self.INTERNAL_FOLDERS
@@ -844,8 +800,9 @@ class ObjectStorage:
tmp_path = tmp_dir / f"{uuid.uuid4().hex}.tmp"
try:
checksum = hashlib.md5()
with tmp_path.open("wb") as target:
shutil.copyfileobj(stream, target)
shutil.copyfileobj(_HashingReader(stream, checksum), target)
new_size = tmp_path.stat().st_size
size_delta = new_size - existing_size
@@ -864,18 +821,6 @@ class ObjectStorage:
quota_check["usage"],
)
if _HAS_RUST:
etag = _rc.md5_file(str(tmp_path))
else:
checksum = hashlib.md5()
with tmp_path.open("rb") as f:
while True:
chunk = f.read(1048576)
if not chunk:
break
checksum.update(chunk)
etag = checksum.hexdigest()
shutil.move(str(tmp_path), str(destination))
finally:
@@ -885,6 +830,7 @@ class ObjectStorage:
pass
stat = destination.stat()
etag = checksum.hexdigest()
internal_meta = {"__etag__": etag, "__size__": str(stat.st_size)}
combined_meta = {**internal_meta, **(metadata or {})}
@@ -1323,19 +1269,13 @@ class ObjectStorage:
version_bytes_delta=archived_version_size,
version_count_delta=1 if archived_version_size > 0 else 0,
)
etag = self._compute_etag(destination)
internal_meta = {"__etag__": etag, "__size__": str(stat.st_size)}
combined_meta = {**internal_meta, **(metadata or {})}
self._write_metadata(bucket_id, safe_key, combined_meta)
obj_meta = ObjectMeta(
return ObjectMeta(
key=safe_key.as_posix(),
size=stat.st_size,
last_modified=datetime.fromtimestamp(stat.st_mtime, timezone.utc),
etag=etag,
etag=self._compute_etag(destination),
metadata=metadata or None,
)
self._update_object_cache_entry(bucket_id, safe_key.as_posix(), obj_meta)
return obj_meta
def delete_object_version(self, bucket_name: str, object_key: str, version_id: str) -> None:
bucket_path = self._bucket_path(bucket_name)
@@ -1475,24 +1415,14 @@ class ObjectStorage:
if not upload_root.exists():
raise StorageError("Multipart upload not found")
checksum = hashlib.md5()
part_filename = f"part-{part_number:05d}.part"
part_path = upload_root / part_filename
temp_path = upload_root / f".{part_filename}.tmp"
try:
with temp_path.open("wb") as target:
shutil.copyfileobj(stream, target)
if _HAS_RUST:
part_etag = _rc.md5_file(str(temp_path))
else:
checksum = hashlib.md5()
with temp_path.open("rb") as f:
while True:
chunk = f.read(1048576)
if not chunk:
break
checksum.update(chunk)
part_etag = checksum.hexdigest()
shutil.copyfileobj(_HashingReader(stream, checksum), target)
temp_path.replace(part_path)
except OSError:
try:
@@ -1502,7 +1432,7 @@ class ObjectStorage:
raise
record = {
"etag": part_etag,
"etag": checksum.hexdigest(),
"size": part_path.stat().st_size,
"filename": part_filename,
}
@@ -1902,6 +1832,10 @@ class ObjectStorage:
return list(self._build_object_cache(bucket_path).keys())
def _build_object_cache(self, bucket_path: Path) -> Dict[str, ObjectMeta]:
"""Build a complete object metadata cache for a bucket.
Uses os.scandir for fast directory walking and a persistent etag index.
"""
from concurrent.futures import ThreadPoolExecutor
bucket_id = bucket_path.name
@@ -1909,30 +1843,6 @@ class ObjectStorage:
bucket_str = str(bucket_path)
bucket_len = len(bucket_str) + 1
if _HAS_RUST:
etag_index_path = self._system_bucket_root(bucket_id) / "etag_index.json"
raw = _rc.build_object_cache(
bucket_str,
str(self._bucket_meta_root(bucket_id)),
str(etag_index_path),
)
if raw["etag_cache_changed"] and raw["etag_cache"]:
try:
etag_index_path.parent.mkdir(parents=True, exist_ok=True)
with open(etag_index_path, 'w', encoding='utf-8') as f:
json.dump(raw["etag_cache"], f)
except OSError:
pass
for key, size, mtime, etag in raw["objects"]:
objects[key] = ObjectMeta(
key=key,
size=size,
last_modified=datetime.fromtimestamp(mtime, timezone.utc),
etag=etag,
metadata=None,
)
return objects
etag_index_path = self._system_bucket_root(bucket_id) / "etag_index.json"
meta_cache: Dict[str, str] = {}
index_mtime: float = 0
@@ -2163,6 +2073,11 @@ class ObjectStorage:
return 0
def _update_object_cache_entry(self, bucket_id: str, key: str, meta: Optional[ObjectMeta]) -> None:
"""Update a single entry in the object cache instead of invalidating the whole cache.
This is a performance optimization - lazy update instead of full invalidation.
Cross-process invalidation is handled by checking stats.json mtime.
"""
with self._cache_lock:
cached = self._object_cache.get(bucket_id)
if cached:
@@ -2174,24 +2089,6 @@ class ObjectStorage:
self._cache_version[bucket_id] = self._cache_version.get(bucket_id, 0) + 1
self._sorted_key_cache.pop(bucket_id, None)
self._update_etag_index(bucket_id, key, meta.etag if meta else None)
def _update_etag_index(self, bucket_id: str, key: str, etag: Optional[str]) -> None:
etag_index_path = self._system_bucket_root(bucket_id) / "etag_index.json"
if not etag_index_path.exists():
return
try:
with open(etag_index_path, 'r', encoding='utf-8') as f:
index = json.load(f)
if etag is None:
index.pop(key, None)
else:
index[key] = etag
with open(etag_index_path, 'w', encoding='utf-8') as f:
json.dump(index, f)
except (OSError, json.JSONDecodeError):
pass
def warm_cache(self, bucket_names: Optional[List[str]] = None) -> None:
"""Pre-warm the object cache for specified buckets or all buckets.
@@ -2364,18 +2261,15 @@ class ObjectStorage:
index_path, entry_name = self._index_file_for_key(bucket_name, key)
lock = self._get_meta_index_lock(str(index_path))
with lock:
if _HAS_RUST:
_rc.write_index_entry(str(index_path), entry_name, json.dumps(entry))
else:
index_path.parent.mkdir(parents=True, exist_ok=True)
index_data: Dict[str, Any] = {}
if index_path.exists():
try:
index_data = json.loads(index_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
pass
index_data[entry_name] = entry
index_path.write_text(json.dumps(index_data), encoding="utf-8")
index_path.parent.mkdir(parents=True, exist_ok=True)
index_data: Dict[str, Any] = {}
if index_path.exists():
try:
index_data = json.loads(index_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
pass
index_data[entry_name] = entry
index_path.write_text(json.dumps(index_data), encoding="utf-8")
self._invalidate_meta_read_cache(bucket_name, key)
def _delete_index_entry(self, bucket_name: str, key: Path) -> None:
@@ -2385,23 +2279,20 @@ class ObjectStorage:
return
lock = self._get_meta_index_lock(str(index_path))
with lock:
if _HAS_RUST:
_rc.delete_index_entry(str(index_path), entry_name)
else:
try:
index_data = json.loads(index_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
self._invalidate_meta_read_cache(bucket_name, key)
return
if entry_name in index_data:
del index_data[entry_name]
if index_data:
index_path.write_text(json.dumps(index_data), encoding="utf-8")
else:
try:
index_path.unlink()
except OSError:
pass
try:
index_data = json.loads(index_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
self._invalidate_meta_read_cache(bucket_name, key)
return
if entry_name in index_data:
del index_data[entry_name]
if index_data:
index_path.write_text(json.dumps(index_data), encoding="utf-8")
else:
try:
index_path.unlink()
except OSError:
pass
self._invalidate_meta_read_cache(bucket_name, key)
def _normalize_metadata(self, metadata: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
@@ -2499,24 +2390,15 @@ class ObjectStorage:
continue
def _check_bucket_contents(self, bucket_path: Path) -> tuple[bool, bool, bool]:
bucket_name = bucket_path.name
if _HAS_RUST:
return _rc.check_bucket_contents(
str(bucket_path),
[
str(self._bucket_versions_root(bucket_name)),
str(self._legacy_versions_root(bucket_name)),
],
[
str(self._multipart_bucket_root(bucket_name)),
str(self._legacy_multipart_bucket_root(bucket_name)),
],
)
"""Check bucket for objects, versions, and multipart uploads in a single pass.
Returns (has_visible_objects, has_archived_versions, has_active_multipart_uploads).
Uses early exit when all three are found.
"""
has_objects = False
has_versions = False
has_multipart = False
bucket_name = bucket_path.name
for path in bucket_path.rglob("*"):
if has_objects:

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
APP_VERSION = "0.3.4"
APP_VERSION = "0.3.1"
def get_version() -> str:

View File

@@ -1,7 +1,6 @@
mod hashing;
mod metadata;
mod sigv4;
mod storage;
mod validation;
use pyo3::prelude::*;
@@ -30,14 +29,6 @@ mod myfsio_core {
m.add_function(wrap_pyfunction!(metadata::read_index_entry, m)?)?;
m.add_function(wrap_pyfunction!(storage::write_index_entry, m)?)?;
m.add_function(wrap_pyfunction!(storage::delete_index_entry, m)?)?;
m.add_function(wrap_pyfunction!(storage::check_bucket_contents, m)?)?;
m.add_function(wrap_pyfunction!(storage::shallow_scan, m)?)?;
m.add_function(wrap_pyfunction!(storage::bucket_stats_scan, m)?)?;
m.add_function(wrap_pyfunction!(storage::search_objects_scan, m)?)?;
m.add_function(wrap_pyfunction!(storage::build_object_cache, m)?)?;
Ok(())
}
}

View File

@@ -1,817 +0,0 @@
use pyo3::exceptions::PyIOError;
use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList, PyString, PyTuple};
use serde_json::Value;
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use std::time::SystemTime;
const INTERNAL_FOLDERS: &[&str] = &[".meta", ".versions", ".multipart"];
fn system_time_to_epoch(t: SystemTime) -> f64 {
t.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_secs_f64())
.unwrap_or(0.0)
}
fn extract_etag_from_meta_bytes(content: &[u8]) -> Option<String> {
let marker = b"\"__etag__\"";
let idx = content.windows(marker.len()).position(|w| w == marker)?;
let after = &content[idx + marker.len()..];
let start = after.iter().position(|&b| b == b'"')? + 1;
let rest = &after[start..];
let end = rest.iter().position(|&b| b == b'"')?;
std::str::from_utf8(&rest[..end]).ok().map(|s| s.to_owned())
}
fn has_any_file(root: &str) -> bool {
let root_path = Path::new(root);
if !root_path.is_dir() {
return false;
}
let mut stack = vec![root_path.to_path_buf()];
while let Some(current) = stack.pop() {
let entries = match fs::read_dir(&current) {
Ok(e) => e,
Err(_) => continue,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_file() {
return true;
}
if ft.is_dir() && !ft.is_symlink() {
stack.push(entry.path());
}
}
}
false
}
#[pyfunction]
pub fn write_index_entry(
py: Python<'_>,
path: &str,
entry_name: &str,
entry_data_json: &str,
) -> PyResult<()> {
let path_owned = path.to_owned();
let entry_owned = entry_name.to_owned();
let data_owned = entry_data_json.to_owned();
py.detach(move || -> PyResult<()> {
let entry_value: Value = serde_json::from_str(&data_owned)
.map_err(|e| PyIOError::new_err(format!("Failed to parse entry data: {}", e)))?;
if let Some(parent) = Path::new(&path_owned).parent() {
let _ = fs::create_dir_all(parent);
}
let mut index_data: serde_json::Map<String, Value> = match fs::read_to_string(&path_owned)
{
Ok(content) => serde_json::from_str(&content).unwrap_or_default(),
Err(_) => serde_json::Map::new(),
};
index_data.insert(entry_owned, entry_value);
let serialized = serde_json::to_string(&Value::Object(index_data))
.map_err(|e| PyIOError::new_err(format!("Failed to serialize index: {}", e)))?;
fs::write(&path_owned, serialized)
.map_err(|e| PyIOError::new_err(format!("Failed to write index: {}", e)))?;
Ok(())
})
}
#[pyfunction]
pub fn delete_index_entry(py: Python<'_>, path: &str, entry_name: &str) -> PyResult<bool> {
let path_owned = path.to_owned();
let entry_owned = entry_name.to_owned();
py.detach(move || -> PyResult<bool> {
let content = match fs::read_to_string(&path_owned) {
Ok(c) => c,
Err(_) => return Ok(false),
};
let mut index_data: serde_json::Map<String, Value> =
match serde_json::from_str(&content) {
Ok(v) => v,
Err(_) => return Ok(false),
};
if index_data.remove(&entry_owned).is_none() {
return Ok(false);
}
if index_data.is_empty() {
let _ = fs::remove_file(&path_owned);
return Ok(true);
}
let serialized = serde_json::to_string(&Value::Object(index_data))
.map_err(|e| PyIOError::new_err(format!("Failed to serialize index: {}", e)))?;
fs::write(&path_owned, serialized)
.map_err(|e| PyIOError::new_err(format!("Failed to write index: {}", e)))?;
Ok(false)
})
}
#[pyfunction]
pub fn check_bucket_contents(
py: Python<'_>,
bucket_path: &str,
version_roots: Vec<String>,
multipart_roots: Vec<String>,
) -> PyResult<(bool, bool, bool)> {
let bucket_owned = bucket_path.to_owned();
py.detach(move || -> PyResult<(bool, bool, bool)> {
let mut has_objects = false;
let bucket_p = Path::new(&bucket_owned);
if bucket_p.is_dir() {
let mut stack = vec![bucket_p.to_path_buf()];
'obj_scan: while let Some(current) = stack.pop() {
let is_root = current == bucket_p;
let entries = match fs::read_dir(&current) {
Ok(e) => e,
Err(_) => continue,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if is_root {
if let Some(name) = entry.file_name().to_str() {
if INTERNAL_FOLDERS.contains(&name) {
continue;
}
}
}
if ft.is_file() && !ft.is_symlink() {
has_objects = true;
break 'obj_scan;
}
if ft.is_dir() && !ft.is_symlink() {
stack.push(entry.path());
}
}
}
}
let mut has_versions = false;
for root in &version_roots {
if has_versions {
break;
}
has_versions = has_any_file(root);
}
let mut has_multipart = false;
for root in &multipart_roots {
if has_multipart {
break;
}
has_multipart = has_any_file(root);
}
Ok((has_objects, has_versions, has_multipart))
})
}
#[pyfunction]
pub fn shallow_scan(
py: Python<'_>,
target_dir: &str,
prefix: &str,
meta_cache_json: &str,
) -> PyResult<Py<PyAny>> {
let target_owned = target_dir.to_owned();
let prefix_owned = prefix.to_owned();
let cache_owned = meta_cache_json.to_owned();
let result: (
Vec<(String, u64, f64, Option<String>)>,
Vec<String>,
Vec<(String, bool)>,
) = py.detach(move || -> PyResult<(
Vec<(String, u64, f64, Option<String>)>,
Vec<String>,
Vec<(String, bool)>,
)> {
let meta_cache: HashMap<String, String> =
serde_json::from_str(&cache_owned).unwrap_or_default();
let mut files: Vec<(String, u64, f64, Option<String>)> = Vec::new();
let mut dirs: Vec<String> = Vec::new();
let entries = match fs::read_dir(&target_owned) {
Ok(e) => e,
Err(_) => return Ok((files, dirs, Vec::new())),
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let name = match entry.file_name().into_string() {
Ok(n) => n,
Err(_) => continue,
};
if INTERNAL_FOLDERS.contains(&name.as_str()) {
continue;
}
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
let cp = format!("{}{}/", prefix_owned, name);
dirs.push(cp);
} else if ft.is_file() && !ft.is_symlink() {
let key = format!("{}{}", prefix_owned, name);
let md = match entry.metadata() {
Ok(m) => m,
Err(_) => continue,
};
let size = md.len();
let mtime = md
.modified()
.map(system_time_to_epoch)
.unwrap_or(0.0);
let etag = meta_cache.get(&key).cloned();
files.push((key, size, mtime, etag));
}
}
files.sort_by(|a, b| a.0.cmp(&b.0));
dirs.sort();
let mut merged: Vec<(String, bool)> = Vec::with_capacity(files.len() + dirs.len());
let mut fi = 0;
let mut di = 0;
while fi < files.len() && di < dirs.len() {
if files[fi].0 <= dirs[di] {
merged.push((files[fi].0.clone(), false));
fi += 1;
} else {
merged.push((dirs[di].clone(), true));
di += 1;
}
}
while fi < files.len() {
merged.push((files[fi].0.clone(), false));
fi += 1;
}
while di < dirs.len() {
merged.push((dirs[di].clone(), true));
di += 1;
}
Ok((files, dirs, merged))
})?;
let (files, dirs, merged) = result;
let dict = PyDict::new(py);
let files_list = PyList::empty(py);
for (key, size, mtime, etag) in &files {
let etag_py: Py<PyAny> = match etag {
Some(e) => PyString::new(py, e).into_any().unbind(),
None => py.None(),
};
let tuple = PyTuple::new(py, &[
PyString::new(py, key).into_any().unbind(),
size.into_pyobject(py)?.into_any().unbind(),
mtime.into_pyobject(py)?.into_any().unbind(),
etag_py,
])?;
files_list.append(tuple)?;
}
dict.set_item("files", files_list)?;
let dirs_list = PyList::empty(py);
for d in &dirs {
dirs_list.append(PyString::new(py, d))?;
}
dict.set_item("dirs", dirs_list)?;
let merged_list = PyList::empty(py);
for (key, is_dir) in &merged {
let bool_obj: Py<PyAny> = if *is_dir {
true.into_pyobject(py)?.to_owned().into_any().unbind()
} else {
false.into_pyobject(py)?.to_owned().into_any().unbind()
};
let tuple = PyTuple::new(py, &[
PyString::new(py, key).into_any().unbind(),
bool_obj,
])?;
merged_list.append(tuple)?;
}
dict.set_item("merged_keys", merged_list)?;
Ok(dict.into_any().unbind())
}
#[pyfunction]
pub fn bucket_stats_scan(
py: Python<'_>,
bucket_path: &str,
versions_root: &str,
) -> PyResult<(u64, u64, u64, u64)> {
let bucket_owned = bucket_path.to_owned();
let versions_owned = versions_root.to_owned();
py.detach(move || -> PyResult<(u64, u64, u64, u64)> {
let mut object_count: u64 = 0;
let mut total_bytes: u64 = 0;
let bucket_p = Path::new(&bucket_owned);
if bucket_p.is_dir() {
let mut stack = vec![bucket_p.to_path_buf()];
while let Some(current) = stack.pop() {
let is_root = current == bucket_p;
let entries = match fs::read_dir(&current) {
Ok(e) => e,
Err(_) => continue,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
if is_root {
if let Some(name) = entry.file_name().to_str() {
if INTERNAL_FOLDERS.contains(&name) {
continue;
}
}
}
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
stack.push(entry.path());
} else if ft.is_file() && !ft.is_symlink() {
object_count += 1;
if let Ok(md) = entry.metadata() {
total_bytes += md.len();
}
}
}
}
}
let mut version_count: u64 = 0;
let mut version_bytes: u64 = 0;
let versions_p = Path::new(&versions_owned);
if versions_p.is_dir() {
let mut stack = vec![versions_p.to_path_buf()];
while let Some(current) = stack.pop() {
let entries = match fs::read_dir(&current) {
Ok(e) => e,
Err(_) => continue,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
stack.push(entry.path());
} else if ft.is_file() && !ft.is_symlink() {
if let Some(name) = entry.file_name().to_str() {
if name.ends_with(".bin") {
version_count += 1;
if let Ok(md) = entry.metadata() {
version_bytes += md.len();
}
}
}
}
}
}
}
Ok((object_count, total_bytes, version_count, version_bytes))
})
}
#[pyfunction]
#[pyo3(signature = (bucket_path, search_root, query, limit))]
pub fn search_objects_scan(
py: Python<'_>,
bucket_path: &str,
search_root: &str,
query: &str,
limit: usize,
) -> PyResult<Py<PyAny>> {
let bucket_owned = bucket_path.to_owned();
let search_owned = search_root.to_owned();
let query_owned = query.to_owned();
let result: (Vec<(String, u64, f64)>, bool) = py.detach(
move || -> PyResult<(Vec<(String, u64, f64)>, bool)> {
let query_lower = query_owned.to_lowercase();
let bucket_len = bucket_owned.len() + 1;
let scan_limit = limit * 4;
let mut matched: usize = 0;
let mut results: Vec<(String, u64, f64)> = Vec::new();
let search_p = Path::new(&search_owned);
if !search_p.is_dir() {
return Ok((results, false));
}
let bucket_p = Path::new(&bucket_owned);
let mut stack = vec![search_p.to_path_buf()];
'scan: while let Some(current) = stack.pop() {
let is_bucket_root = current == bucket_p;
let entries = match fs::read_dir(&current) {
Ok(e) => e,
Err(_) => continue,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
if is_bucket_root {
if let Some(name) = entry.file_name().to_str() {
if INTERNAL_FOLDERS.contains(&name) {
continue;
}
}
}
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
stack.push(entry.path());
} else if ft.is_file() && !ft.is_symlink() {
let full_path = entry.path();
let full_str = full_path.to_string_lossy();
if full_str.len() <= bucket_len {
continue;
}
let key = full_str[bucket_len..].replace('\\', "/");
if key.to_lowercase().contains(&query_lower) {
if let Ok(md) = entry.metadata() {
let size = md.len();
let mtime = md
.modified()
.map(system_time_to_epoch)
.unwrap_or(0.0);
results.push((key, size, mtime));
matched += 1;
}
}
if matched >= scan_limit {
break 'scan;
}
}
}
}
results.sort_by(|a, b| a.0.cmp(&b.0));
let truncated = results.len() > limit;
results.truncate(limit);
Ok((results, truncated))
},
)?;
let (results, truncated) = result;
let dict = PyDict::new(py);
let results_list = PyList::empty(py);
for (key, size, mtime) in &results {
let tuple = PyTuple::new(py, &[
PyString::new(py, key).into_any().unbind(),
size.into_pyobject(py)?.into_any().unbind(),
mtime.into_pyobject(py)?.into_any().unbind(),
])?;
results_list.append(tuple)?;
}
dict.set_item("results", results_list)?;
dict.set_item("truncated", truncated)?;
Ok(dict.into_any().unbind())
}
#[pyfunction]
pub fn build_object_cache(
py: Python<'_>,
bucket_path: &str,
meta_root: &str,
etag_index_path: &str,
) -> PyResult<Py<PyAny>> {
let bucket_owned = bucket_path.to_owned();
let meta_owned = meta_root.to_owned();
let index_path_owned = etag_index_path.to_owned();
let result: (HashMap<String, String>, Vec<(String, u64, f64, Option<String>)>, bool) =
py.detach(move || -> PyResult<(
HashMap<String, String>,
Vec<(String, u64, f64, Option<String>)>,
bool,
)> {
let mut meta_cache: HashMap<String, String> = HashMap::new();
let mut index_mtime: f64 = 0.0;
let mut etag_cache_changed = false;
let index_p = Path::new(&index_path_owned);
if index_p.is_file() {
if let Ok(md) = fs::metadata(&index_path_owned) {
index_mtime = md
.modified()
.map(system_time_to_epoch)
.unwrap_or(0.0);
}
if let Ok(content) = fs::read_to_string(&index_path_owned) {
if let Ok(parsed) = serde_json::from_str::<HashMap<String, String>>(&content) {
meta_cache = parsed;
}
}
}
let meta_p = Path::new(&meta_owned);
let mut needs_rebuild = false;
if meta_p.is_dir() && index_mtime > 0.0 {
fn check_newer(dir: &Path, index_mtime: f64) -> bool {
let entries = match fs::read_dir(dir) {
Ok(e) => e,
Err(_) => return false,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
if check_newer(&entry.path(), index_mtime) {
return true;
}
} else if ft.is_file() {
if let Some(name) = entry.file_name().to_str() {
if name.ends_with(".meta.json") || name == "_index.json" {
if let Ok(md) = entry.metadata() {
let mt = md
.modified()
.map(system_time_to_epoch)
.unwrap_or(0.0);
if mt > index_mtime {
return true;
}
}
}
}
}
}
false
}
needs_rebuild = check_newer(meta_p, index_mtime);
} else if meta_cache.is_empty() {
needs_rebuild = true;
}
if needs_rebuild && meta_p.is_dir() {
let meta_str = meta_owned.clone();
let meta_len = meta_str.len() + 1;
let mut index_files: Vec<String> = Vec::new();
let mut legacy_meta_files: Vec<(String, String)> = Vec::new();
fn collect_meta(
dir: &Path,
meta_len: usize,
index_files: &mut Vec<String>,
legacy_meta_files: &mut Vec<(String, String)>,
) {
let entries = match fs::read_dir(dir) {
Ok(e) => e,
Err(_) => return,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
collect_meta(&entry.path(), meta_len, index_files, legacy_meta_files);
} else if ft.is_file() {
if let Some(name) = entry.file_name().to_str() {
let full = entry.path().to_string_lossy().to_string();
if name == "_index.json" {
index_files.push(full);
} else if name.ends_with(".meta.json") {
if full.len() > meta_len {
let rel = &full[meta_len..];
let key = if rel.len() > 10 {
rel[..rel.len() - 10].replace('\\', "/")
} else {
continue;
};
legacy_meta_files.push((key, full));
}
}
}
}
}
}
collect_meta(
meta_p,
meta_len,
&mut index_files,
&mut legacy_meta_files,
);
meta_cache.clear();
for idx_path in &index_files {
if let Ok(content) = fs::read_to_string(idx_path) {
if let Ok(idx_data) = serde_json::from_str::<HashMap<String, Value>>(&content) {
let rel_dir = if idx_path.len() > meta_len {
let r = &idx_path[meta_len..];
r.replace('\\', "/")
} else {
String::new()
};
let dir_prefix = if rel_dir.ends_with("/_index.json") {
&rel_dir[..rel_dir.len() - "/_index.json".len()]
} else {
""
};
for (entry_name, entry_data) in &idx_data {
let key = if dir_prefix.is_empty() {
entry_name.clone()
} else {
format!("{}/{}", dir_prefix, entry_name)
};
if let Some(meta_obj) = entry_data.get("metadata") {
if let Some(etag) = meta_obj.get("__etag__") {
if let Some(etag_str) = etag.as_str() {
meta_cache.insert(key, etag_str.to_owned());
}
}
}
}
}
}
}
for (key, path) in &legacy_meta_files {
if meta_cache.contains_key(key) {
continue;
}
if let Ok(content) = fs::read(path) {
if let Some(etag) = extract_etag_from_meta_bytes(&content) {
meta_cache.insert(key.clone(), etag);
}
}
}
etag_cache_changed = true;
}
let bucket_p = Path::new(&bucket_owned);
let bucket_len = bucket_owned.len() + 1;
let mut objects: Vec<(String, u64, f64, Option<String>)> = Vec::new();
if bucket_p.is_dir() {
let mut stack = vec![bucket_p.to_path_buf()];
while let Some(current) = stack.pop() {
let entries = match fs::read_dir(&current) {
Ok(e) => e,
Err(_) => continue,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
let full = entry.path();
let full_str = full.to_string_lossy();
if full_str.len() > bucket_len {
let first_part: &str = if let Some(sep_pos) =
full_str[bucket_len..].find(|c: char| c == '\\' || c == '/')
{
&full_str[bucket_len..bucket_len + sep_pos]
} else {
&full_str[bucket_len..]
};
if INTERNAL_FOLDERS.contains(&first_part) {
continue;
}
} else if let Some(name) = entry.file_name().to_str() {
if INTERNAL_FOLDERS.contains(&name) {
continue;
}
}
stack.push(full);
} else if ft.is_file() && !ft.is_symlink() {
let full = entry.path();
let full_str = full.to_string_lossy();
if full_str.len() <= bucket_len {
continue;
}
let rel = &full_str[bucket_len..];
let first_part: &str =
if let Some(sep_pos) = rel.find(|c: char| c == '\\' || c == '/') {
&rel[..sep_pos]
} else {
rel
};
if INTERNAL_FOLDERS.contains(&first_part) {
continue;
}
let key = rel.replace('\\', "/");
if let Ok(md) = entry.metadata() {
let size = md.len();
let mtime = md
.modified()
.map(system_time_to_epoch)
.unwrap_or(0.0);
let etag = meta_cache.get(&key).cloned();
objects.push((key, size, mtime, etag));
}
}
}
}
}
Ok((meta_cache, objects, etag_cache_changed))
})?;
let (meta_cache, objects, etag_cache_changed) = result;
let dict = PyDict::new(py);
let cache_dict = PyDict::new(py);
for (k, v) in &meta_cache {
cache_dict.set_item(k, v)?;
}
dict.set_item("etag_cache", cache_dict)?;
let objects_list = PyList::empty(py);
for (key, size, mtime, etag) in &objects {
let etag_py: Py<PyAny> = match etag {
Some(e) => PyString::new(py, e).into_any().unbind(),
None => py.None(),
};
let tuple = PyTuple::new(py, &[
PyString::new(py, key).into_any().unbind(),
size.into_pyobject(py)?.into_any().unbind(),
mtime.into_pyobject(py)?.into_any().unbind(),
etag_py,
])?;
objects_list.append(tuple)?;
}
dict.set_item("objects", objects_list)?;
dict.set_item("etag_cache_changed", etag_cache_changed)?;
Ok(dict.into_any().unbind())
}