Add intra-bucket cursor tracking to integrity scanner for progressive full coverage; Optimize integrity scanner: early batch exit, lazy sorted walk, cursor-aware index reads
This commit is contained in:
255
app/integrity.py
255
app/integrity.py
@@ -192,10 +192,26 @@ class IntegrityCursorStore:
|
|||||||
except OSError as e:
|
except OSError as e:
|
||||||
logger.error("Failed to save integrity cursor: %s", e)
|
logger.error("Failed to save integrity cursor: %s", e)
|
||||||
|
|
||||||
def update_bucket(self, bucket_name: str, timestamp: float) -> None:
|
def update_bucket(
|
||||||
|
self,
|
||||||
|
bucket_name: str,
|
||||||
|
timestamp: float,
|
||||||
|
last_key: Optional[str] = None,
|
||||||
|
completed: bool = False,
|
||||||
|
) -> None:
|
||||||
with self._lock:
|
with self._lock:
|
||||||
data = self.load()
|
data = self.load()
|
||||||
data["buckets"][bucket_name] = {"last_scanned": timestamp}
|
entry = data["buckets"].get(bucket_name, {})
|
||||||
|
if completed:
|
||||||
|
entry["last_scanned"] = timestamp
|
||||||
|
entry.pop("last_key", None)
|
||||||
|
entry["completed"] = True
|
||||||
|
else:
|
||||||
|
entry["last_scanned"] = timestamp
|
||||||
|
if last_key is not None:
|
||||||
|
entry["last_key"] = last_key
|
||||||
|
entry["completed"] = False
|
||||||
|
data["buckets"][bucket_name] = entry
|
||||||
self.save(data)
|
self.save(data)
|
||||||
|
|
||||||
def clean_stale(self, existing_buckets: List[str]) -> None:
|
def clean_stale(self, existing_buckets: List[str]) -> None:
|
||||||
@@ -208,17 +224,32 @@ class IntegrityCursorStore:
|
|||||||
del data["buckets"][k]
|
del data["buckets"][k]
|
||||||
self.save(data)
|
self.save(data)
|
||||||
|
|
||||||
|
def get_last_key(self, bucket_name: str) -> Optional[str]:
|
||||||
|
data = self.load()
|
||||||
|
entry = data.get("buckets", {}).get(bucket_name)
|
||||||
|
if entry is None:
|
||||||
|
return None
|
||||||
|
return entry.get("last_key")
|
||||||
|
|
||||||
def get_bucket_order(self, bucket_names: List[str]) -> List[str]:
|
def get_bucket_order(self, bucket_names: List[str]) -> List[str]:
|
||||||
data = self.load()
|
data = self.load()
|
||||||
buckets_info = data.get("buckets", {})
|
buckets_info = data.get("buckets", {})
|
||||||
|
|
||||||
def sort_key(name: str) -> float:
|
incomplete = []
|
||||||
|
complete = []
|
||||||
|
for name in bucket_names:
|
||||||
entry = buckets_info.get(name)
|
entry = buckets_info.get(name)
|
||||||
if entry is None:
|
if entry is None:
|
||||||
return 0.0
|
incomplete.append((name, 0.0))
|
||||||
return entry.get("last_scanned", 0.0)
|
elif entry.get("last_key") is not None:
|
||||||
|
incomplete.append((name, entry.get("last_scanned", 0.0)))
|
||||||
|
else:
|
||||||
|
complete.append((name, entry.get("last_scanned", 0.0)))
|
||||||
|
|
||||||
return sorted(bucket_names, key=sort_key)
|
incomplete.sort(key=lambda x: x[1])
|
||||||
|
complete.sort(key=lambda x: x[1])
|
||||||
|
|
||||||
|
return [n for n, _ in incomplete] + [n for n, _ in complete]
|
||||||
|
|
||||||
def get_info(self) -> Dict[str, Any]:
|
def get_info(self) -> Dict[str, Any]:
|
||||||
data = self.load()
|
data = self.load()
|
||||||
@@ -226,7 +257,11 @@ class IntegrityCursorStore:
|
|||||||
return {
|
return {
|
||||||
"tracked_buckets": len(buckets),
|
"tracked_buckets": len(buckets),
|
||||||
"buckets": {
|
"buckets": {
|
||||||
name: info.get("last_scanned")
|
name: {
|
||||||
|
"last_scanned": info.get("last_scanned"),
|
||||||
|
"last_key": info.get("last_key"),
|
||||||
|
"completed": info.get("completed", False),
|
||||||
|
}
|
||||||
for name, info in buckets.items()
|
for name, info in buckets.items()
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@@ -325,13 +360,19 @@ class IntegrityChecker:
|
|||||||
if self._batch_exhausted(result):
|
if self._batch_exhausted(result):
|
||||||
break
|
break
|
||||||
result.buckets_scanned += 1
|
result.buckets_scanned += 1
|
||||||
self._check_corrupted_objects(bucket_name, result, effective_auto_heal, effective_dry_run)
|
cursor_key = self.cursor_store.get_last_key(bucket_name)
|
||||||
self._check_orphaned_objects(bucket_name, result, effective_auto_heal, effective_dry_run)
|
key_corrupted = self._check_corrupted_objects(bucket_name, result, effective_auto_heal, effective_dry_run, cursor_key)
|
||||||
self._check_phantom_metadata(bucket_name, result, effective_auto_heal, effective_dry_run)
|
key_orphaned = self._check_orphaned_objects(bucket_name, result, effective_auto_heal, effective_dry_run, cursor_key)
|
||||||
|
key_phantom = self._check_phantom_metadata(bucket_name, result, effective_auto_heal, effective_dry_run, cursor_key)
|
||||||
self._check_stale_versions(bucket_name, result, effective_auto_heal, effective_dry_run)
|
self._check_stale_versions(bucket_name, result, effective_auto_heal, effective_dry_run)
|
||||||
self._check_etag_cache(bucket_name, result, effective_auto_heal, effective_dry_run)
|
self._check_etag_cache(bucket_name, result, effective_auto_heal, effective_dry_run)
|
||||||
self._check_legacy_metadata(bucket_name, result, effective_auto_heal, effective_dry_run)
|
self._check_legacy_metadata(bucket_name, result, effective_auto_heal, effective_dry_run)
|
||||||
self.cursor_store.update_bucket(bucket_name, time.time())
|
returned_keys = [k for k in (key_corrupted, key_orphaned, key_phantom) if k is not None]
|
||||||
|
bucket_exhausted = self._batch_exhausted(result)
|
||||||
|
if bucket_exhausted and returned_keys:
|
||||||
|
self.cursor_store.update_bucket(bucket_name, time.time(), last_key=min(returned_keys))
|
||||||
|
else:
|
||||||
|
self.cursor_store.update_bucket(bucket_name, time.time(), completed=True)
|
||||||
|
|
||||||
result.execution_time_seconds = time.time() - start
|
result.execution_time_seconds = time.time() - start
|
||||||
|
|
||||||
@@ -399,45 +440,107 @@ class IntegrityChecker:
|
|||||||
if len(result.issues) < MAX_ISSUES:
|
if len(result.issues) < MAX_ISSUES:
|
||||||
result.issues.append(issue)
|
result.issues.append(issue)
|
||||||
|
|
||||||
def _check_corrupted_objects(
|
def _collect_index_keys(
|
||||||
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
|
self, meta_root: Path, cursor_key: Optional[str] = None,
|
||||||
) -> None:
|
) -> Dict[str, Dict[str, Any]]:
|
||||||
bucket_path = self.storage_root / bucket_name
|
all_keys: Dict[str, Dict[str, Any]] = {}
|
||||||
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
|
|
||||||
|
|
||||||
if not meta_root.exists():
|
if not meta_root.exists():
|
||||||
return
|
return all_keys
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for index_file in meta_root.rglob("_index.json"):
|
for index_file in meta_root.rglob("_index.json"):
|
||||||
if self._throttle():
|
|
||||||
return
|
|
||||||
if self._batch_exhausted(result):
|
|
||||||
return
|
|
||||||
if not index_file.is_file():
|
if not index_file.is_file():
|
||||||
continue
|
continue
|
||||||
|
rel_dir = index_file.parent.relative_to(meta_root)
|
||||||
|
dir_prefix = "" if rel_dir == Path(".") else rel_dir.as_posix()
|
||||||
|
if cursor_key is not None and dir_prefix:
|
||||||
|
full_prefix = dir_prefix + "/"
|
||||||
|
if not cursor_key.startswith(full_prefix) and cursor_key > full_prefix:
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
index_data = json.loads(index_file.read_text(encoding="utf-8"))
|
index_data = json.loads(index_file.read_text(encoding="utf-8"))
|
||||||
except (OSError, json.JSONDecodeError):
|
except (OSError, json.JSONDecodeError):
|
||||||
continue
|
continue
|
||||||
|
for key_name, entry in index_data.items():
|
||||||
|
full_key = (dir_prefix + "/" + key_name) if dir_prefix else key_name
|
||||||
|
if cursor_key is not None and full_key <= cursor_key:
|
||||||
|
continue
|
||||||
|
all_keys[full_key] = {
|
||||||
|
"entry": entry,
|
||||||
|
"index_file": index_file,
|
||||||
|
"key_name": key_name,
|
||||||
|
}
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
return all_keys
|
||||||
|
|
||||||
for key_name, entry in list(index_data.items()):
|
def _walk_bucket_files_sorted(
|
||||||
if self._throttle():
|
self, bucket_path: Path, cursor_key: Optional[str] = None,
|
||||||
|
):
|
||||||
|
def _walk(dir_path: Path, prefix: str):
|
||||||
|
try:
|
||||||
|
entries = list(os.scandir(dir_path))
|
||||||
|
except OSError:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def _sort_key(e):
|
||||||
|
if e.is_dir(follow_symlinks=False):
|
||||||
|
return e.name + "/"
|
||||||
|
return e.name
|
||||||
|
|
||||||
|
entries.sort(key=_sort_key)
|
||||||
|
|
||||||
|
for entry in entries:
|
||||||
|
if entry.is_dir(follow_symlinks=False):
|
||||||
|
if not prefix and entry.name in self.INTERNAL_FOLDERS:
|
||||||
|
continue
|
||||||
|
new_prefix = (prefix + "/" + entry.name) if prefix else entry.name
|
||||||
|
if cursor_key is not None:
|
||||||
|
full_prefix = new_prefix + "/"
|
||||||
|
if not cursor_key.startswith(full_prefix) and cursor_key > full_prefix:
|
||||||
|
continue
|
||||||
|
yield from _walk(Path(entry.path), new_prefix)
|
||||||
|
elif entry.is_file(follow_symlinks=False):
|
||||||
|
full_key = (prefix + "/" + entry.name) if prefix else entry.name
|
||||||
|
if cursor_key is not None and full_key <= cursor_key:
|
||||||
|
continue
|
||||||
|
yield full_key
|
||||||
|
|
||||||
|
yield from _walk(bucket_path, "")
|
||||||
|
|
||||||
|
def _check_corrupted_objects(
|
||||||
|
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool,
|
||||||
|
cursor_key: Optional[str] = None,
|
||||||
|
) -> Optional[str]:
|
||||||
if self._batch_exhausted(result):
|
if self._batch_exhausted(result):
|
||||||
return
|
return None
|
||||||
|
bucket_path = self.storage_root / bucket_name
|
||||||
|
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
|
||||||
|
|
||||||
rel_dir = index_file.parent.relative_to(meta_root)
|
if not meta_root.exists():
|
||||||
if rel_dir == Path("."):
|
return None
|
||||||
full_key = key_name
|
|
||||||
else:
|
last_key = None
|
||||||
full_key = rel_dir.as_posix() + "/" + key_name
|
try:
|
||||||
|
all_keys = self._collect_index_keys(meta_root, cursor_key)
|
||||||
|
sorted_keys = sorted(all_keys.keys())
|
||||||
|
|
||||||
|
for full_key in sorted_keys:
|
||||||
|
if self._throttle():
|
||||||
|
return last_key
|
||||||
|
if self._batch_exhausted(result):
|
||||||
|
return last_key
|
||||||
|
|
||||||
|
info = all_keys[full_key]
|
||||||
|
entry = info["entry"]
|
||||||
|
index_file = info["index_file"]
|
||||||
|
key_name = info["key_name"]
|
||||||
|
|
||||||
object_path = bucket_path / full_key
|
object_path = bucket_path / full_key
|
||||||
if not object_path.exists():
|
if not object_path.exists():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
result.objects_scanned += 1
|
result.objects_scanned += 1
|
||||||
|
last_key = full_key
|
||||||
|
|
||||||
meta = entry.get("metadata", {}) if isinstance(entry, dict) else {}
|
meta = entry.get("metadata", {}) if isinstance(entry, dict) else {}
|
||||||
stored_etag = meta.get("__etag__")
|
stored_etag = meta.get("__etag__")
|
||||||
@@ -464,6 +567,10 @@ class IntegrityChecker:
|
|||||||
meta["__etag__"] = actual_etag
|
meta["__etag__"] = actual_etag
|
||||||
meta["__size__"] = str(stat.st_size)
|
meta["__size__"] = str(stat.st_size)
|
||||||
meta["__last_modified__"] = str(stat.st_mtime)
|
meta["__last_modified__"] = str(stat.st_mtime)
|
||||||
|
try:
|
||||||
|
index_data = json.loads(index_file.read_text(encoding="utf-8"))
|
||||||
|
except (OSError, json.JSONDecodeError):
|
||||||
|
index_data = {}
|
||||||
index_data[key_name] = {"metadata": meta}
|
index_data[key_name] = {"metadata": meta}
|
||||||
self._atomic_write_index(index_file, index_data)
|
self._atomic_write_index(index_file, index_data)
|
||||||
issue.healed = True
|
issue.healed = True
|
||||||
@@ -475,32 +582,30 @@ class IntegrityChecker:
|
|||||||
self._add_issue(result, issue)
|
self._add_issue(result, issue)
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
result.errors.append(f"check corrupted {bucket_name}: {e}")
|
result.errors.append(f"check corrupted {bucket_name}: {e}")
|
||||||
|
return last_key
|
||||||
|
|
||||||
def _check_orphaned_objects(
|
def _check_orphaned_objects(
|
||||||
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
|
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool,
|
||||||
) -> None:
|
cursor_key: Optional[str] = None,
|
||||||
|
) -> Optional[str]:
|
||||||
|
if self._batch_exhausted(result):
|
||||||
|
return None
|
||||||
bucket_path = self.storage_root / bucket_name
|
bucket_path = self.storage_root / bucket_name
|
||||||
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
|
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
|
||||||
|
|
||||||
|
last_key = None
|
||||||
try:
|
try:
|
||||||
for entry in bucket_path.rglob("*"):
|
for full_key in self._walk_bucket_files_sorted(bucket_path, cursor_key):
|
||||||
if self._throttle():
|
if self._throttle():
|
||||||
return
|
return last_key
|
||||||
if self._batch_exhausted(result):
|
if self._batch_exhausted(result):
|
||||||
return
|
return last_key
|
||||||
if not entry.is_file():
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
rel = entry.relative_to(bucket_path)
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
if rel.parts and rel.parts[0] in self.INTERNAL_FOLDERS:
|
|
||||||
continue
|
|
||||||
|
|
||||||
result.objects_scanned += 1
|
result.objects_scanned += 1
|
||||||
full_key = rel.as_posix()
|
last_key = full_key
|
||||||
key_name = rel.name
|
key_path = Path(full_key)
|
||||||
parent = rel.parent
|
key_name = key_path.name
|
||||||
|
parent = key_path.parent
|
||||||
|
|
||||||
if parent == Path("."):
|
if parent == Path("."):
|
||||||
index_path = meta_root / "_index.json"
|
index_path = meta_root / "_index.json"
|
||||||
@@ -526,8 +631,9 @@ class IntegrityChecker:
|
|||||||
|
|
||||||
if auto_heal and not dry_run:
|
if auto_heal and not dry_run:
|
||||||
try:
|
try:
|
||||||
etag = _compute_etag(entry)
|
object_path = bucket_path / full_key
|
||||||
stat = entry.stat()
|
etag = _compute_etag(object_path)
|
||||||
|
stat = object_path.stat()
|
||||||
meta = {
|
meta = {
|
||||||
"__etag__": etag,
|
"__etag__": etag,
|
||||||
"__size__": str(stat.st_size),
|
"__size__": str(stat.st_size),
|
||||||
@@ -550,43 +656,38 @@ class IntegrityChecker:
|
|||||||
self._add_issue(result, issue)
|
self._add_issue(result, issue)
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
result.errors.append(f"check orphaned {bucket_name}: {e}")
|
result.errors.append(f"check orphaned {bucket_name}: {e}")
|
||||||
|
return last_key
|
||||||
|
|
||||||
def _check_phantom_metadata(
|
def _check_phantom_metadata(
|
||||||
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
|
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool,
|
||||||
) -> None:
|
cursor_key: Optional[str] = None,
|
||||||
|
) -> Optional[str]:
|
||||||
|
if self._batch_exhausted(result):
|
||||||
|
return None
|
||||||
bucket_path = self.storage_root / bucket_name
|
bucket_path = self.storage_root / bucket_name
|
||||||
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
|
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
|
||||||
|
|
||||||
if not meta_root.exists():
|
if not meta_root.exists():
|
||||||
return
|
return None
|
||||||
|
|
||||||
|
last_key = None
|
||||||
try:
|
try:
|
||||||
for index_file in meta_root.rglob("_index.json"):
|
all_keys = self._collect_index_keys(meta_root, cursor_key)
|
||||||
if self._throttle():
|
sorted_keys = sorted(all_keys.keys())
|
||||||
return
|
|
||||||
if self._batch_exhausted(result):
|
|
||||||
return
|
|
||||||
if not index_file.is_file():
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
index_data = json.loads(index_file.read_text(encoding="utf-8"))
|
|
||||||
except (OSError, json.JSONDecodeError):
|
|
||||||
continue
|
|
||||||
|
|
||||||
keys_to_remove = []
|
heal_by_index: Dict[Path, List[str]] = {}
|
||||||
for key_name in list(index_data.keys()):
|
|
||||||
|
for full_key in sorted_keys:
|
||||||
if self._batch_exhausted(result):
|
if self._batch_exhausted(result):
|
||||||
break
|
break
|
||||||
|
|
||||||
result.objects_scanned += 1
|
result.objects_scanned += 1
|
||||||
rel_dir = index_file.parent.relative_to(meta_root)
|
last_key = full_key
|
||||||
if rel_dir == Path("."):
|
|
||||||
full_key = key_name
|
|
||||||
else:
|
|
||||||
full_key = rel_dir.as_posix() + "/" + key_name
|
|
||||||
|
|
||||||
object_path = bucket_path / full_key
|
object_path = bucket_path / full_key
|
||||||
if not object_path.exists():
|
if not object_path.exists():
|
||||||
result.phantom_metadata += 1
|
result.phantom_metadata += 1
|
||||||
|
info = all_keys[full_key]
|
||||||
issue = IntegrityIssue(
|
issue = IntegrityIssue(
|
||||||
issue_type="phantom_metadata",
|
issue_type="phantom_metadata",
|
||||||
bucket=bucket_name,
|
bucket=bucket_name,
|
||||||
@@ -594,14 +695,17 @@ class IntegrityChecker:
|
|||||||
detail="metadata entry without file on disk",
|
detail="metadata entry without file on disk",
|
||||||
)
|
)
|
||||||
if auto_heal and not dry_run:
|
if auto_heal and not dry_run:
|
||||||
keys_to_remove.append(key_name)
|
index_file = info["index_file"]
|
||||||
|
heal_by_index.setdefault(index_file, []).append(info["key_name"])
|
||||||
issue.healed = True
|
issue.healed = True
|
||||||
issue.heal_action = "removed stale index entry"
|
issue.heal_action = "removed stale index entry"
|
||||||
result.issues_healed += 1
|
result.issues_healed += 1
|
||||||
self._add_issue(result, issue)
|
self._add_issue(result, issue)
|
||||||
|
|
||||||
if keys_to_remove and auto_heal and not dry_run:
|
if heal_by_index and auto_heal and not dry_run:
|
||||||
|
for index_file, keys_to_remove in heal_by_index.items():
|
||||||
try:
|
try:
|
||||||
|
index_data = json.loads(index_file.read_text(encoding="utf-8"))
|
||||||
for k in keys_to_remove:
|
for k in keys_to_remove:
|
||||||
index_data.pop(k, None)
|
index_data.pop(k, None)
|
||||||
if index_data:
|
if index_data:
|
||||||
@@ -612,10 +716,13 @@ class IntegrityChecker:
|
|||||||
result.errors.append(f"heal phantom {bucket_name}: {e}")
|
result.errors.append(f"heal phantom {bucket_name}: {e}")
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
result.errors.append(f"check phantom {bucket_name}: {e}")
|
result.errors.append(f"check phantom {bucket_name}: {e}")
|
||||||
|
return last_key
|
||||||
|
|
||||||
def _check_stale_versions(
|
def _check_stale_versions(
|
||||||
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
|
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if self._batch_exhausted(result):
|
||||||
|
return
|
||||||
versions_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_VERSIONS_DIR
|
versions_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_VERSIONS_DIR
|
||||||
|
|
||||||
if not versions_root.exists():
|
if not versions_root.exists():
|
||||||
@@ -682,6 +789,8 @@ class IntegrityChecker:
|
|||||||
def _check_etag_cache(
|
def _check_etag_cache(
|
||||||
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
|
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if self._batch_exhausted(result):
|
||||||
|
return
|
||||||
etag_index_path = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / "etag_index.json"
|
etag_index_path = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / "etag_index.json"
|
||||||
|
|
||||||
if not etag_index_path.exists():
|
if not etag_index_path.exists():
|
||||||
@@ -751,6 +860,8 @@ class IntegrityChecker:
|
|||||||
def _check_legacy_metadata(
|
def _check_legacy_metadata(
|
||||||
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
|
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if self._batch_exhausted(result):
|
||||||
|
return
|
||||||
legacy_meta_root = self.storage_root / bucket_name / ".meta"
|
legacy_meta_root = self.storage_root / bucket_name / ".meta"
|
||||||
if not legacy_meta_root.exists():
|
if not legacy_meta_root.exists():
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -644,5 +644,145 @@ class TestCursorRotation:
|
|||||||
after = time.time()
|
after = time.time()
|
||||||
|
|
||||||
cursor_info = checker.cursor_store.get_info()
|
cursor_info = checker.cursor_store.get_info()
|
||||||
ts = cursor_info["buckets"]["mybucket"]
|
entry = cursor_info["buckets"]["mybucket"]
|
||||||
assert before <= ts <= after
|
assert before <= entry["last_scanned"] <= after
|
||||||
|
assert entry["completed"] is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestIntraBucketCursor:
|
||||||
|
def test_resumes_from_cursor_key(self, storage_root):
|
||||||
|
objects = {f"file_{chr(ord('a') + i)}.txt": f"data{i}".encode() for i in range(10)}
|
||||||
|
_setup_bucket(storage_root, "mybucket", objects)
|
||||||
|
|
||||||
|
checker = IntegrityChecker(storage_root=storage_root, batch_size=3)
|
||||||
|
result1 = checker.run_now()
|
||||||
|
assert result1.objects_scanned == 3
|
||||||
|
|
||||||
|
cursor_info = checker.cursor_store.get_info()
|
||||||
|
entry = cursor_info["buckets"]["mybucket"]
|
||||||
|
assert entry["last_key"] is not None
|
||||||
|
assert entry["completed"] is False
|
||||||
|
|
||||||
|
result2 = checker.run_now()
|
||||||
|
assert result2.objects_scanned == 3
|
||||||
|
|
||||||
|
cursor_after = checker.cursor_store.get_info()["buckets"]["mybucket"]
|
||||||
|
assert cursor_after["last_key"] > entry["last_key"]
|
||||||
|
|
||||||
|
def test_cursor_resets_after_full_pass(self, storage_root):
|
||||||
|
objects = {f"file_{i}.txt": f"data{i}".encode() for i in range(3)}
|
||||||
|
_setup_bucket(storage_root, "mybucket", objects)
|
||||||
|
|
||||||
|
checker = IntegrityChecker(storage_root=storage_root, batch_size=100)
|
||||||
|
checker.run_now()
|
||||||
|
|
||||||
|
cursor_info = checker.cursor_store.get_info()
|
||||||
|
entry = cursor_info["buckets"]["mybucket"]
|
||||||
|
assert entry["last_key"] is None
|
||||||
|
assert entry["completed"] is True
|
||||||
|
|
||||||
|
def test_full_coverage_across_cycles(self, storage_root):
|
||||||
|
objects = {f"obj_{chr(ord('a') + i)}.txt": f"data{i}".encode() for i in range(10)}
|
||||||
|
_setup_bucket(storage_root, "mybucket", objects)
|
||||||
|
|
||||||
|
checker = IntegrityChecker(storage_root=storage_root, batch_size=4)
|
||||||
|
all_scanned = 0
|
||||||
|
for _ in range(10):
|
||||||
|
result = checker.run_now()
|
||||||
|
all_scanned += result.objects_scanned
|
||||||
|
if checker.cursor_store.get_info()["buckets"]["mybucket"]["completed"]:
|
||||||
|
break
|
||||||
|
|
||||||
|
assert all_scanned >= 10
|
||||||
|
|
||||||
|
def test_deleted_cursor_key_skips_gracefully(self, storage_root):
|
||||||
|
objects = {f"file_{chr(ord('a') + i)}.txt": f"data{i}".encode() for i in range(6)}
|
||||||
|
_setup_bucket(storage_root, "mybucket", objects)
|
||||||
|
|
||||||
|
checker = IntegrityChecker(storage_root=storage_root, batch_size=3)
|
||||||
|
checker.run_now()
|
||||||
|
|
||||||
|
cursor_info = checker.cursor_store.get_info()
|
||||||
|
cursor_key = cursor_info["buckets"]["mybucket"]["last_key"]
|
||||||
|
assert cursor_key is not None
|
||||||
|
|
||||||
|
obj_path = storage_root / "mybucket" / cursor_key
|
||||||
|
meta_root = storage_root / ".myfsio.sys" / "buckets" / "mybucket" / "meta"
|
||||||
|
key_path = Path(cursor_key)
|
||||||
|
index_path = meta_root / key_path.parent / "_index.json" if key_path.parent != Path(".") else meta_root / "_index.json"
|
||||||
|
if key_path.parent == Path("."):
|
||||||
|
index_path = meta_root / "_index.json"
|
||||||
|
else:
|
||||||
|
index_path = meta_root / key_path.parent / "_index.json"
|
||||||
|
if obj_path.exists():
|
||||||
|
obj_path.unlink()
|
||||||
|
if index_path.exists():
|
||||||
|
index_data = json.loads(index_path.read_text())
|
||||||
|
index_data.pop(key_path.name, None)
|
||||||
|
index_path.write_text(json.dumps(index_data))
|
||||||
|
|
||||||
|
result2 = checker.run_now()
|
||||||
|
assert result2.objects_scanned > 0
|
||||||
|
|
||||||
|
def test_incomplete_buckets_prioritized(self, storage_root):
|
||||||
|
_setup_bucket(storage_root, "bucket-a", {f"a{i}.txt": b"a" for i in range(5)})
|
||||||
|
_setup_bucket(storage_root, "bucket-b", {f"b{i}.txt": b"b" for i in range(5)})
|
||||||
|
|
||||||
|
checker = IntegrityChecker(storage_root=storage_root, batch_size=3)
|
||||||
|
checker.run_now()
|
||||||
|
|
||||||
|
cursor_info = checker.cursor_store.get_info()
|
||||||
|
incomplete = [
|
||||||
|
name for name, info in cursor_info["buckets"].items()
|
||||||
|
if info.get("last_key") is not None
|
||||||
|
]
|
||||||
|
assert len(incomplete) >= 1
|
||||||
|
|
||||||
|
result2 = checker.run_now()
|
||||||
|
assert result2.objects_scanned > 0
|
||||||
|
|
||||||
|
def test_cursor_skips_nested_directories(self, storage_root):
|
||||||
|
objects = {
|
||||||
|
"aaa/file1.txt": b"a1",
|
||||||
|
"aaa/file2.txt": b"a2",
|
||||||
|
"bbb/file1.txt": b"b1",
|
||||||
|
"bbb/file2.txt": b"b2",
|
||||||
|
"ccc/file1.txt": b"c1",
|
||||||
|
"ccc/file2.txt": b"c2",
|
||||||
|
}
|
||||||
|
_setup_bucket(storage_root, "mybucket", objects)
|
||||||
|
|
||||||
|
checker = IntegrityChecker(storage_root=storage_root, batch_size=4)
|
||||||
|
result1 = checker.run_now()
|
||||||
|
assert result1.objects_scanned == 4
|
||||||
|
|
||||||
|
cursor_info = checker.cursor_store.get_info()
|
||||||
|
cursor_key = cursor_info["buckets"]["mybucket"]["last_key"]
|
||||||
|
assert cursor_key is not None
|
||||||
|
assert cursor_key.startswith("aaa/") or cursor_key.startswith("bbb/")
|
||||||
|
|
||||||
|
result2 = checker.run_now()
|
||||||
|
assert result2.objects_scanned >= 2
|
||||||
|
|
||||||
|
all_scanned = result1.objects_scanned + result2.objects_scanned
|
||||||
|
for _ in range(10):
|
||||||
|
if checker.cursor_store.get_info()["buckets"]["mybucket"]["completed"]:
|
||||||
|
break
|
||||||
|
r = checker.run_now()
|
||||||
|
all_scanned += r.objects_scanned
|
||||||
|
|
||||||
|
assert all_scanned >= 6
|
||||||
|
|
||||||
|
def test_sorted_walk_order(self, storage_root):
|
||||||
|
objects = {
|
||||||
|
"bar.txt": b"bar",
|
||||||
|
"bar/inner.txt": b"inner",
|
||||||
|
"abc.txt": b"abc",
|
||||||
|
"zzz/deep.txt": b"deep",
|
||||||
|
}
|
||||||
|
_setup_bucket(storage_root, "mybucket", objects)
|
||||||
|
|
||||||
|
checker = IntegrityChecker(storage_root=storage_root, batch_size=100)
|
||||||
|
result = checker.run_now()
|
||||||
|
assert result.objects_scanned >= 4
|
||||||
|
assert result.total_issues == 0
|
||||||
|
|||||||
Reference in New Issue
Block a user