Add garbage collection: background cleanup of orphaned temp files, multipart uploads, lock files, metadata, versions, and empty directories

This commit is contained in:
2026-03-09 17:34:21 +08:00
parent 80b77b64eb
commit de5377e5ac
7 changed files with 1149 additions and 18 deletions

View File

@@ -29,6 +29,7 @@ from .encryption import EncryptionManager
from .extensions import limiter, csrf
from .iam import IamService
from .kms import KMSManager
from .gc import GarbageCollector
from .lifecycle import LifecycleManager
from .notifications import NotificationService
from .object_lock import ObjectLockService
@@ -221,6 +222,18 @@ def create_app(
)
lifecycle_manager.start()
gc_collector = None
if app.config.get("GC_ENABLED", False):
gc_collector = GarbageCollector(
storage_root=storage_root,
interval_hours=app.config.get("GC_INTERVAL_HOURS", 6.0),
temp_file_max_age_hours=app.config.get("GC_TEMP_FILE_MAX_AGE_HOURS", 24.0),
multipart_max_age_days=app.config.get("GC_MULTIPART_MAX_AGE_DAYS", 7),
lock_file_max_age_hours=app.config.get("GC_LOCK_FILE_MAX_AGE_HOURS", 1.0),
dry_run=app.config.get("GC_DRY_RUN", False),
)
gc_collector.start()
app.extensions["object_storage"] = storage
app.extensions["iam"] = iam
app.extensions["bucket_policies"] = bucket_policies
@@ -232,6 +245,7 @@ def create_app(
app.extensions["kms"] = kms_manager
app.extensions["acl"] = acl_service
app.extensions["lifecycle"] = lifecycle_manager
app.extensions["gc"] = gc_collector
app.extensions["object_lock"] = object_lock_service
app.extensions["notifications"] = notification_service
app.extensions["access_logging"] = access_logging_service

View File

@@ -14,6 +14,7 @@ from flask import Blueprint, Response, current_app, jsonify, request
from .connections import ConnectionStore
from .extensions import limiter
from .gc import GarbageCollector
from .iam import IamError, Principal
from .replication import ReplicationManager
from .site_registry import PeerSite, SiteInfo, SiteRegistry
@@ -776,3 +777,55 @@ def delete_website_domain(domain: str):
return _json_error("NotFound", f"No mapping found for domain '{domain}'", 404)
logger.info("Website domain mapping deleted: %s", domain)
return Response(status=204)
def _gc() -> Optional[GarbageCollector]:
return current_app.extensions.get("gc")
@admin_api_bp.route("/gc/status", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def gc_status():
principal, error = _require_admin()
if error:
return error
gc = _gc()
if not gc:
return jsonify({"enabled": False, "message": "GC is not enabled. Set GC_ENABLED=true to enable."})
return jsonify(gc.get_status())
@admin_api_bp.route("/gc/run", methods=["POST"])
@limiter.limit(lambda: _get_admin_rate_limit())
def gc_run_now():
principal, error = _require_admin()
if error:
return error
gc = _gc()
if not gc:
return _json_error("InvalidRequest", "GC is not enabled", 400)
payload = request.get_json(silent=True) or {}
original_dry_run = gc.dry_run
if "dry_run" in payload:
gc.dry_run = bool(payload["dry_run"])
try:
result = gc.run_now()
finally:
gc.dry_run = original_dry_run
logger.info("GC manual run by %s", principal.access_key)
return jsonify(result.to_dict())
@admin_api_bp.route("/gc/history", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def gc_history():
principal, error = _require_admin()
if error:
return error
gc = _gc()
if not gc:
return jsonify({"executions": []})
limit = min(int(request.args.get("limit", 50)), 200)
offset = int(request.args.get("offset", 0))
records = gc.get_history(limit=limit, offset=offset)
return jsonify({"executions": records})

View File

@@ -150,6 +150,12 @@ class AppConfig:
allowed_redirect_hosts: list[str]
allow_internal_endpoints: bool
website_hosting_enabled: bool
gc_enabled: bool
gc_interval_hours: float
gc_temp_file_max_age_hours: float
gc_multipart_max_age_days: int
gc_lock_file_max_age_hours: float
gc_dry_run: bool
@classmethod
def from_env(cls, overrides: Optional[Dict[str, Any]] = None) -> "AppConfig":
@@ -319,6 +325,12 @@ class AppConfig:
allowed_redirect_hosts = [h.strip() for h in str(allowed_redirect_hosts_raw).split(",") if h.strip()]
allow_internal_endpoints = str(_get("ALLOW_INTERNAL_ENDPOINTS", "0")).lower() in {"1", "true", "yes", "on"}
website_hosting_enabled = str(_get("WEBSITE_HOSTING_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
gc_enabled = str(_get("GC_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
gc_interval_hours = float(_get("GC_INTERVAL_HOURS", 6.0))
gc_temp_file_max_age_hours = float(_get("GC_TEMP_FILE_MAX_AGE_HOURS", 24.0))
gc_multipart_max_age_days = int(_get("GC_MULTIPART_MAX_AGE_DAYS", 7))
gc_lock_file_max_age_hours = float(_get("GC_LOCK_FILE_MAX_AGE_HOURS", 1.0))
gc_dry_run = str(_get("GC_DRY_RUN", "0")).lower() in {"1", "true", "yes", "on"}
return cls(storage_root=storage_root,
max_upload_size=max_upload_size,
@@ -406,7 +418,13 @@ class AppConfig:
num_trusted_proxies=num_trusted_proxies,
allowed_redirect_hosts=allowed_redirect_hosts,
allow_internal_endpoints=allow_internal_endpoints,
website_hosting_enabled=website_hosting_enabled)
website_hosting_enabled=website_hosting_enabled,
gc_enabled=gc_enabled,
gc_interval_hours=gc_interval_hours,
gc_temp_file_max_age_hours=gc_temp_file_max_age_hours,
gc_multipart_max_age_days=gc_multipart_max_age_days,
gc_lock_file_max_age_hours=gc_lock_file_max_age_hours,
gc_dry_run=gc_dry_run)
def validate_and_report(self) -> list[str]:
"""Validate configuration and return a list of warnings/issues.
@@ -617,4 +635,10 @@ class AppConfig:
"ALLOWED_REDIRECT_HOSTS": self.allowed_redirect_hosts,
"ALLOW_INTERNAL_ENDPOINTS": self.allow_internal_endpoints,
"WEBSITE_HOSTING_ENABLED": self.website_hosting_enabled,
"GC_ENABLED": self.gc_enabled,
"GC_INTERVAL_HOURS": self.gc_interval_hours,
"GC_TEMP_FILE_MAX_AGE_HOURS": self.gc_temp_file_max_age_hours,
"GC_MULTIPART_MAX_AGE_DAYS": self.gc_multipart_max_age_days,
"GC_LOCK_FILE_MAX_AGE_HOURS": self.gc_lock_file_max_age_hours,
"GC_DRY_RUN": self.gc_dry_run,
}

531
app/gc.py Normal file
View File

@@ -0,0 +1,531 @@
from __future__ import annotations
import json
import logging
import os
import shutil
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
@dataclass
class GCResult:
temp_files_deleted: int = 0
temp_bytes_freed: int = 0
multipart_uploads_deleted: int = 0
multipart_bytes_freed: int = 0
lock_files_deleted: int = 0
orphaned_metadata_deleted: int = 0
orphaned_versions_deleted: int = 0
orphaned_version_bytes_freed: int = 0
empty_dirs_removed: int = 0
errors: List[str] = field(default_factory=list)
execution_time_seconds: float = 0.0
def to_dict(self) -> dict:
return {
"temp_files_deleted": self.temp_files_deleted,
"temp_bytes_freed": self.temp_bytes_freed,
"multipart_uploads_deleted": self.multipart_uploads_deleted,
"multipart_bytes_freed": self.multipart_bytes_freed,
"lock_files_deleted": self.lock_files_deleted,
"orphaned_metadata_deleted": self.orphaned_metadata_deleted,
"orphaned_versions_deleted": self.orphaned_versions_deleted,
"orphaned_version_bytes_freed": self.orphaned_version_bytes_freed,
"empty_dirs_removed": self.empty_dirs_removed,
"errors": self.errors,
"execution_time_seconds": self.execution_time_seconds,
}
@property
def total_bytes_freed(self) -> int:
return self.temp_bytes_freed + self.multipart_bytes_freed + self.orphaned_version_bytes_freed
@property
def has_work(self) -> bool:
return (
self.temp_files_deleted > 0
or self.multipart_uploads_deleted > 0
or self.lock_files_deleted > 0
or self.orphaned_metadata_deleted > 0
or self.orphaned_versions_deleted > 0
or self.empty_dirs_removed > 0
)
@dataclass
class GCExecutionRecord:
timestamp: float
result: dict
dry_run: bool
def to_dict(self) -> dict:
return {
"timestamp": self.timestamp,
"result": self.result,
"dry_run": self.dry_run,
}
@classmethod
def from_dict(cls, data: dict) -> GCExecutionRecord:
return cls(
timestamp=data["timestamp"],
result=data["result"],
dry_run=data.get("dry_run", False),
)
class GCHistoryStore:
def __init__(self, storage_root: Path, max_records: int = 50) -> None:
self.storage_root = storage_root
self.max_records = max_records
self._lock = threading.Lock()
def _get_path(self) -> Path:
return self.storage_root / ".myfsio.sys" / "config" / "gc_history.json"
def load(self) -> List[GCExecutionRecord]:
path = self._get_path()
if not path.exists():
return []
try:
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
return [GCExecutionRecord.from_dict(d) for d in data.get("executions", [])]
except (OSError, ValueError, KeyError) as e:
logger.error("Failed to load GC history: %s", e)
return []
def save(self, records: List[GCExecutionRecord]) -> None:
path = self._get_path()
path.parent.mkdir(parents=True, exist_ok=True)
data = {"executions": [r.to_dict() for r in records[: self.max_records]]}
try:
with open(path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2)
except OSError as e:
logger.error("Failed to save GC history: %s", e)
def add(self, record: GCExecutionRecord) -> None:
with self._lock:
records = self.load()
records.insert(0, record)
self.save(records)
def get_history(self, limit: int = 50, offset: int = 0) -> List[GCExecutionRecord]:
return self.load()[offset : offset + limit]
def _dir_size(path: Path) -> int:
total = 0
try:
for f in path.rglob("*"):
if f.is_file():
try:
total += f.stat().st_size
except OSError:
pass
except OSError:
pass
return total
def _file_age_hours(path: Path) -> float:
try:
mtime = path.stat().st_mtime
return (time.time() - mtime) / 3600.0
except OSError:
return 0.0
class GarbageCollector:
SYSTEM_ROOT = ".myfsio.sys"
SYSTEM_TMP_DIR = "tmp"
SYSTEM_MULTIPART_DIR = "multipart"
SYSTEM_BUCKETS_DIR = "buckets"
BUCKET_META_DIR = "meta"
BUCKET_VERSIONS_DIR = "versions"
INTERNAL_FOLDERS = {".meta", ".versions", ".multipart"}
def __init__(
self,
storage_root: Path,
interval_hours: float = 6.0,
temp_file_max_age_hours: float = 24.0,
multipart_max_age_days: int = 7,
lock_file_max_age_hours: float = 1.0,
dry_run: bool = False,
max_history: int = 50,
) -> None:
self.storage_root = Path(storage_root)
self.interval_seconds = interval_hours * 3600.0
self.temp_file_max_age_hours = temp_file_max_age_hours
self.multipart_max_age_days = multipart_max_age_days
self.lock_file_max_age_hours = lock_file_max_age_hours
self.dry_run = dry_run
self._timer: Optional[threading.Timer] = None
self._shutdown = False
self._lock = threading.Lock()
self.history_store = GCHistoryStore(storage_root, max_records=max_history)
def start(self) -> None:
if self._timer is not None:
return
self._shutdown = False
self._schedule_next()
logger.info(
"GC started: interval=%.1fh, temp_max_age=%.1fh, multipart_max_age=%dd, lock_max_age=%.1fh, dry_run=%s",
self.interval_seconds / 3600.0,
self.temp_file_max_age_hours,
self.multipart_max_age_days,
self.lock_file_max_age_hours,
self.dry_run,
)
def stop(self) -> None:
self._shutdown = True
if self._timer:
self._timer.cancel()
self._timer = None
logger.info("GC stopped")
def _schedule_next(self) -> None:
if self._shutdown:
return
self._timer = threading.Timer(self.interval_seconds, self._run_cycle)
self._timer.daemon = True
self._timer.start()
def _run_cycle(self) -> None:
if self._shutdown:
return
try:
self.run_now()
except Exception as e:
logger.error("GC cycle failed: %s", e)
finally:
self._schedule_next()
def run_now(self) -> GCResult:
start = time.time()
result = GCResult()
self._clean_temp_files(result)
self._clean_orphaned_multipart(result)
self._clean_stale_locks(result)
self._clean_orphaned_metadata(result)
self._clean_orphaned_versions(result)
self._clean_empty_dirs(result)
result.execution_time_seconds = time.time() - start
if result.has_work or result.errors:
logger.info(
"GC completed in %.2fs: temp=%d (%.1f MB), multipart=%d (%.1f MB), "
"locks=%d, meta=%d, versions=%d (%.1f MB), dirs=%d, errors=%d%s",
result.execution_time_seconds,
result.temp_files_deleted,
result.temp_bytes_freed / (1024 * 1024),
result.multipart_uploads_deleted,
result.multipart_bytes_freed / (1024 * 1024),
result.lock_files_deleted,
result.orphaned_metadata_deleted,
result.orphaned_versions_deleted,
result.orphaned_version_bytes_freed / (1024 * 1024),
result.empty_dirs_removed,
len(result.errors),
" (dry run)" if self.dry_run else "",
)
record = GCExecutionRecord(
timestamp=time.time(),
result=result.to_dict(),
dry_run=self.dry_run,
)
self.history_store.add(record)
return result
def _system_path(self) -> Path:
return self.storage_root / self.SYSTEM_ROOT
def _list_bucket_names(self) -> List[str]:
names = []
try:
for entry in self.storage_root.iterdir():
if entry.is_dir() and entry.name != self.SYSTEM_ROOT:
names.append(entry.name)
except OSError:
pass
return names
def _clean_temp_files(self, result: GCResult) -> None:
tmp_dir = self._system_path() / self.SYSTEM_TMP_DIR
if not tmp_dir.exists():
return
try:
for entry in tmp_dir.iterdir():
if not entry.is_file():
continue
age = _file_age_hours(entry)
if age < self.temp_file_max_age_hours:
continue
try:
size = entry.stat().st_size
if not self.dry_run:
entry.unlink()
result.temp_files_deleted += 1
result.temp_bytes_freed += size
except OSError as e:
result.errors.append(f"temp file {entry.name}: {e}")
except OSError as e:
result.errors.append(f"scan tmp dir: {e}")
def _clean_orphaned_multipart(self, result: GCResult) -> None:
cutoff_hours = self.multipart_max_age_days * 24.0
bucket_names = self._list_bucket_names()
for bucket_name in bucket_names:
for multipart_root in (
self._system_path() / self.SYSTEM_MULTIPART_DIR / bucket_name,
self.storage_root / bucket_name / ".multipart",
):
if not multipart_root.exists():
continue
try:
for upload_dir in multipart_root.iterdir():
if not upload_dir.is_dir():
continue
self._maybe_clean_upload(upload_dir, cutoff_hours, result)
except OSError as e:
result.errors.append(f"scan multipart {bucket_name}: {e}")
def _maybe_clean_upload(self, upload_dir: Path, cutoff_hours: float, result: GCResult) -> None:
manifest_path = upload_dir / "manifest.json"
age = _file_age_hours(manifest_path) if manifest_path.exists() else _file_age_hours(upload_dir)
if age < cutoff_hours:
return
dir_bytes = _dir_size(upload_dir)
try:
if not self.dry_run:
shutil.rmtree(upload_dir, ignore_errors=True)
result.multipart_uploads_deleted += 1
result.multipart_bytes_freed += dir_bytes
except OSError as e:
result.errors.append(f"multipart {upload_dir.name}: {e}")
def _clean_stale_locks(self, result: GCResult) -> None:
buckets_root = self._system_path() / self.SYSTEM_BUCKETS_DIR
if not buckets_root.exists():
return
try:
for bucket_dir in buckets_root.iterdir():
if not bucket_dir.is_dir():
continue
locks_dir = bucket_dir / "locks"
if not locks_dir.exists():
continue
try:
for lock_file in locks_dir.iterdir():
if not lock_file.is_file() or not lock_file.name.endswith(".lock"):
continue
age = _file_age_hours(lock_file)
if age < self.lock_file_max_age_hours:
continue
try:
if not self.dry_run:
lock_file.unlink(missing_ok=True)
result.lock_files_deleted += 1
except OSError as e:
result.errors.append(f"lock {lock_file.name}: {e}")
except OSError as e:
result.errors.append(f"scan locks {bucket_dir.name}: {e}")
except OSError as e:
result.errors.append(f"scan buckets for locks: {e}")
def _clean_orphaned_metadata(self, result: GCResult) -> None:
bucket_names = self._list_bucket_names()
for bucket_name in bucket_names:
legacy_meta = self.storage_root / bucket_name / ".meta"
if legacy_meta.exists():
self._clean_legacy_metadata(bucket_name, legacy_meta, result)
new_meta = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
if new_meta.exists():
self._clean_index_metadata(bucket_name, new_meta, result)
def _clean_legacy_metadata(self, bucket_name: str, meta_root: Path, result: GCResult) -> None:
bucket_path = self.storage_root / bucket_name
try:
for meta_file in meta_root.rglob("*.meta.json"):
if not meta_file.is_file():
continue
try:
rel = meta_file.relative_to(meta_root)
object_key = rel.as_posix().removesuffix(".meta.json")
object_path = bucket_path / object_key
if not object_path.exists():
if not self.dry_run:
meta_file.unlink(missing_ok=True)
result.orphaned_metadata_deleted += 1
except (OSError, ValueError) as e:
result.errors.append(f"legacy meta {bucket_name}/{meta_file.name}: {e}")
except OSError as e:
result.errors.append(f"scan legacy meta {bucket_name}: {e}")
def _clean_index_metadata(self, bucket_name: str, meta_root: Path, result: GCResult) -> None:
bucket_path = self.storage_root / bucket_name
try:
for index_file in meta_root.rglob("_index.json"):
if not index_file.is_file():
continue
try:
with open(index_file, "r", encoding="utf-8") as f:
index_data = json.load(f)
except (OSError, json.JSONDecodeError):
continue
keys_to_remove = []
for key in index_data:
rel_dir = index_file.parent.relative_to(meta_root)
if rel_dir == Path("."):
full_key = key
else:
full_key = rel_dir.as_posix() + "/" + key
object_path = bucket_path / full_key
if not object_path.exists():
keys_to_remove.append(key)
if keys_to_remove:
if not self.dry_run:
for k in keys_to_remove:
index_data.pop(k, None)
if index_data:
try:
with open(index_file, "w", encoding="utf-8") as f:
json.dump(index_data, f)
except OSError as e:
result.errors.append(f"write index {bucket_name}: {e}")
continue
else:
try:
index_file.unlink(missing_ok=True)
except OSError:
pass
result.orphaned_metadata_deleted += len(keys_to_remove)
except OSError as e:
result.errors.append(f"scan index meta {bucket_name}: {e}")
def _clean_orphaned_versions(self, result: GCResult) -> None:
bucket_names = self._list_bucket_names()
for bucket_name in bucket_names:
bucket_path = self.storage_root / bucket_name
for versions_root in (
self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_VERSIONS_DIR,
self.storage_root / bucket_name / ".versions",
):
if not versions_root.exists():
continue
try:
for key_dir in versions_root.iterdir():
if not key_dir.is_dir():
continue
self._clean_versions_for_key(bucket_path, versions_root, key_dir, result)
except OSError as e:
result.errors.append(f"scan versions {bucket_name}: {e}")
def _clean_versions_for_key(
self, bucket_path: Path, versions_root: Path, key_dir: Path, result: GCResult
) -> None:
try:
rel = key_dir.relative_to(versions_root)
except ValueError:
return
object_path = bucket_path / rel
if object_path.exists():
return
version_files = list(key_dir.glob("*.bin")) + list(key_dir.glob("*.json"))
if not version_files:
return
for vf in version_files:
try:
size = vf.stat().st_size if vf.suffix == ".bin" else 0
if not self.dry_run:
vf.unlink(missing_ok=True)
if vf.suffix == ".bin":
result.orphaned_version_bytes_freed += size
result.orphaned_versions_deleted += 1
except OSError as e:
result.errors.append(f"version file {vf.name}: {e}")
def _clean_empty_dirs(self, result: GCResult) -> None:
targets = [
self._system_path() / self.SYSTEM_TMP_DIR,
self._system_path() / self.SYSTEM_MULTIPART_DIR,
self._system_path() / self.SYSTEM_BUCKETS_DIR,
]
for bucket_name in self._list_bucket_names():
targets.append(self.storage_root / bucket_name / ".meta")
targets.append(self.storage_root / bucket_name / ".versions")
targets.append(self.storage_root / bucket_name / ".multipart")
for root in targets:
if not root.exists():
continue
self._remove_empty_dirs_recursive(root, root, result)
def _remove_empty_dirs_recursive(self, path: Path, stop_at: Path, result: GCResult) -> bool:
if not path.is_dir():
return False
try:
children = list(path.iterdir())
except OSError:
return False
all_empty = True
for child in children:
if child.is_dir():
if not self._remove_empty_dirs_recursive(child, stop_at, result):
all_empty = False
else:
all_empty = False
if all_empty and path != stop_at:
try:
if not self.dry_run:
path.rmdir()
result.empty_dirs_removed += 1
return True
except OSError:
return False
return all_empty
def get_history(self, limit: int = 50, offset: int = 0) -> List[dict]:
records = self.history_store.get_history(limit, offset)
return [r.to_dict() for r in records]
def get_status(self) -> dict:
return {
"enabled": not self._shutdown or self._timer is not None,
"running": self._timer is not None and not self._shutdown,
"interval_hours": self.interval_seconds / 3600.0,
"temp_file_max_age_hours": self.temp_file_max_age_hours,
"multipart_max_age_days": self.multipart_max_age_days,
"lock_file_max_age_hours": self.lock_file_max_age_hours,
"dry_run": self.dry_run,
}

54
docs.md
View File

@@ -252,6 +252,60 @@ Once enabled, configure lifecycle rules via:
</LifecycleConfiguration>
```
## Garbage Collection
The garbage collector (GC) automatically cleans up orphaned data that accumulates over time: stale temporary files from failed uploads, abandoned multipart uploads, stale lock files, orphaned metadata entries, orphaned version files, and empty directories.
### Enabling GC
By default, GC is disabled. Enable it by setting:
```bash
GC_ENABLED=true python run.py
```
Or in your `myfsio.env` file:
```
GC_ENABLED=true
GC_INTERVAL_HOURS=6 # Run every 6 hours (default)
GC_TEMP_FILE_MAX_AGE_HOURS=24 # Delete temp files older than 24h
GC_MULTIPART_MAX_AGE_DAYS=7 # Delete orphaned multipart uploads older than 7 days
GC_LOCK_FILE_MAX_AGE_HOURS=1 # Delete stale lock files older than 1h
GC_DRY_RUN=false # Set to true to log without deleting
```
### What Gets Cleaned
| Type | Location | Condition |
|------|----------|-----------|
| **Temp files** | `.myfsio.sys/tmp/` | Older than `GC_TEMP_FILE_MAX_AGE_HOURS` |
| **Orphaned multipart uploads** | `.myfsio.sys/multipart/` and `<bucket>/.multipart/` | Older than `GC_MULTIPART_MAX_AGE_DAYS` |
| **Stale lock files** | `.myfsio.sys/buckets/<bucket>/locks/` | Older than `GC_LOCK_FILE_MAX_AGE_HOURS` |
| **Orphaned metadata** | `.myfsio.sys/buckets/<bucket>/meta/` and `<bucket>/.meta/` | Object file no longer exists |
| **Orphaned versions** | `.myfsio.sys/buckets/<bucket>/versions/` and `<bucket>/.versions/` | Main object no longer exists |
| **Empty directories** | Various internal directories | Directory is empty after cleanup |
### Admin API
All GC endpoints require admin (`iam:*`) permissions.
| Method | Route | Description |
|--------|-------|-------------|
| `GET` | `/admin/gc/status` | Get GC status and configuration |
| `POST` | `/admin/gc/run` | Trigger a manual GC run (body: `{"dry_run": true}` for preview) |
| `GET` | `/admin/gc/history` | Get GC execution history (query: `?limit=50&offset=0`) |
### Dry Run Mode
Set `GC_DRY_RUN=true` to log what would be deleted without actually removing anything. You can also trigger a one-time dry run via the admin API:
```bash
curl -X POST "http://localhost:5000/admin/gc/run" \
-H "X-Access-Key: <key>" -H "X-Secret-Key: <secret>" \
-H "Content-Type: application/json" \
-d '{"dry_run": true}'
```
### Performance Tuning
| Variable | Default | Notes |

View File

@@ -40,6 +40,7 @@
<li><a href="#quotas">Bucket Quotas</a></li>
<li><a href="#encryption">Encryption</a></li>
<li><a href="#lifecycle">Lifecycle Rules</a></li>
<li><a href="#garbage-collection">Garbage Collection</a></li>
<li><a href="#metrics">Metrics History</a></li>
<li><a href="#operation-metrics">Operation Metrics</a></li>
<li><a href="#troubleshooting">Troubleshooting</a></li>
@@ -1627,10 +1628,113 @@ curl "{{ api_base }}/&lt;bucket&gt;?lifecycle" \
</div>
</div>
</article>
<article id="metrics" class="card shadow-sm docs-section">
<article id="garbage-collection" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">14</span>
<h2 class="h4 mb-0">Garbage Collection</h2>
</div>
<p class="text-muted">Automatically clean up orphaned data that accumulates over time: stale temp files, abandoned multipart uploads, stale lock files, orphaned metadata, orphaned versions, and empty directories.</p>
<h3 class="h6 text-uppercase text-muted mt-4">Enabling GC</h3>
<p class="small text-muted">Disabled by default. Enable via environment variable:</p>
<pre class="mb-3"><code class="language-bash">GC_ENABLED=true python run.py</code></pre>
<h3 class="h6 text-uppercase text-muted mt-4">Configuration</h3>
<div class="table-responsive mb-3">
<table class="table table-sm table-bordered small">
<thead class="table-light">
<tr>
<th>Variable</th>
<th>Default</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr><td><code>GC_ENABLED</code></td><td><code>false</code></td><td>Enable garbage collection</td></tr>
<tr><td><code>GC_INTERVAL_HOURS</code></td><td><code>6</code></td><td>Hours between GC cycles</td></tr>
<tr><td><code>GC_TEMP_FILE_MAX_AGE_HOURS</code></td><td><code>24</code></td><td>Delete temp files older than this</td></tr>
<tr><td><code>GC_MULTIPART_MAX_AGE_DAYS</code></td><td><code>7</code></td><td>Delete orphaned multipart uploads older than this</td></tr>
<tr><td><code>GC_LOCK_FILE_MAX_AGE_HOURS</code></td><td><code>1</code></td><td>Delete stale lock files older than this</td></tr>
<tr><td><code>GC_DRY_RUN</code></td><td><code>false</code></td><td>Log what would be deleted without removing</td></tr>
</tbody>
</table>
</div>
<h3 class="h6 text-uppercase text-muted mt-4">What Gets Cleaned</h3>
<div class="table-responsive mb-3">
<table class="table table-sm table-bordered small">
<thead class="table-light">
<tr>
<th>Type</th>
<th>Location</th>
<th>Condition</th>
</tr>
</thead>
<tbody>
<tr><td><strong>Temp files</strong></td><td><code>.myfsio.sys/tmp/</code></td><td>Older than configured max age</td></tr>
<tr><td><strong>Orphaned multipart</strong></td><td><code>.myfsio.sys/multipart/</code></td><td>Older than configured max age</td></tr>
<tr><td><strong>Stale lock files</strong></td><td><code>.myfsio.sys/buckets/&lt;bucket&gt;/locks/</code></td><td>Older than configured max age</td></tr>
<tr><td><strong>Orphaned metadata</strong></td><td><code>.myfsio.sys/buckets/&lt;bucket&gt;/meta/</code></td><td>Object file no longer exists</td></tr>
<tr><td><strong>Orphaned versions</strong></td><td><code>.myfsio.sys/buckets/&lt;bucket&gt;/versions/</code></td><td>Main object no longer exists</td></tr>
<tr><td><strong>Empty directories</strong></td><td>Various internal dirs</td><td>Directory is empty after cleanup</td></tr>
</tbody>
</table>
</div>
<h3 class="h6 text-uppercase text-muted mt-4">Admin API</h3>
<div class="table-responsive mb-3">
<table class="table table-sm table-bordered small">
<thead class="table-light">
<tr>
<th>Method</th>
<th>Route</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr><td><code>GET</code></td><td><code>/admin/gc/status</code></td><td>Get GC status and configuration</td></tr>
<tr><td><code>POST</code></td><td><code>/admin/gc/run</code></td><td>Trigger manual GC run</td></tr>
<tr><td><code>GET</code></td><td><code>/admin/gc/history</code></td><td>Get execution history</td></tr>
</tbody>
</table>
</div>
<pre class="mb-3"><code class="language-bash"># Trigger a dry run (preview what would be cleaned)
curl -X POST "{{ api_base }}/admin/gc/run" \
-H "X-Access-Key: &lt;key&gt;" -H "X-Secret-Key: &lt;secret&gt;" \
-H "Content-Type: application/json" \
-d '{"dry_run": true}'
# Trigger actual GC
curl -X POST "{{ api_base }}/admin/gc/run" \
-H "X-Access-Key: &lt;key&gt;" -H "X-Secret-Key: &lt;secret&gt;"
# Check status
curl "{{ api_base }}/admin/gc/status" \
-H "X-Access-Key: &lt;key&gt;" -H "X-Secret-Key: &lt;secret&gt;"
# View history
curl "{{ api_base }}/admin/gc/history?limit=10" \
-H "X-Access-Key: &lt;key&gt;" -H "X-Secret-Key: &lt;secret&gt;"</code></pre>
<div class="alert alert-light border mb-0">
<div class="d-flex gap-2">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-info-circle text-muted mt-1 flex-shrink-0" viewBox="0 0 16 16">
<path d="M8 15A7 7 0 1 1 8 1a7 7 0 0 1 0 14zm0 1A8 8 0 1 0 8 0a8 8 0 0 0 0 16z"/>
<path d="m8.93 6.588-2.29.287-.082.38.45.083c.294.07.352.176.288.469l-.738 3.468c-.194.897.105 1.319.808 1.319.545 0 1.178-.252 1.465-.598l.088-.416c-.2.176-.492.246-.686.246-.275 0-.375-.193-.304-.533L8.93 6.588zM9 4.5a1 1 0 1 1-2 0 1 1 0 0 1 2 0z"/>
</svg>
<div>
<strong>Dry Run:</strong> Use <code>GC_DRY_RUN=true</code> or pass <code>{"dry_run": true}</code> to the API to preview what would be deleted without actually removing anything. Check the logs or API response for details.
</div>
</div>
</div>
</div>
</article>
<article id="metrics" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">15</span>
<h2 class="h4 mb-0">Metrics History</h2>
</div>
<p class="text-muted">Track CPU, memory, and disk usage over time with optional metrics history. Disabled by default to minimize overhead.</p>
@@ -1714,7 +1818,7 @@ curl -X PUT "{{ api_base | replace('/api', '/ui') }}/metrics/settings" \
<article id="operation-metrics" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">15</span>
<span class="docs-section-kicker">16</span>
<h2 class="h4 mb-0">Operation Metrics</h2>
</div>
<p class="text-muted">Track API request statistics including request counts, latency, error rates, and bandwidth usage. Provides real-time visibility into API operations.</p>
@@ -1821,7 +1925,7 @@ curl "{{ api_base | replace('/api', '/ui') }}/metrics/operations/history?hours=6
<article id="troubleshooting" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">16</span>
<span class="docs-section-kicker">17</span>
<h2 class="h4 mb-0">Troubleshooting &amp; tips</h2>
</div>
<div class="table-responsive">
@@ -1872,7 +1976,7 @@ curl "{{ api_base | replace('/api', '/ui') }}/metrics/operations/history?hours=6
<article id="health-check" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">17</span>
<span class="docs-section-kicker">18</span>
<h2 class="h4 mb-0">Health Check Endpoint</h2>
</div>
<p class="text-muted">The API exposes a health check endpoint for monitoring and load balancer integration.</p>
@@ -1894,7 +1998,7 @@ curl {{ api_base }}/myfsio/health
<article id="object-lock" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">18</span>
<span class="docs-section-kicker">19</span>
<h2 class="h4 mb-0">Object Lock &amp; Retention</h2>
</div>
<p class="text-muted">Object Lock prevents objects from being deleted or overwritten for a specified retention period.</p>
@@ -1954,7 +2058,7 @@ curl "{{ api_base }}/&lt;bucket&gt;/&lt;key&gt;?legal-hold" \
<article id="access-logging" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">19</span>
<span class="docs-section-kicker">20</span>
<h2 class="h4 mb-0">Access Logging</h2>
</div>
<p class="text-muted">Enable S3-style access logging to track all requests to your buckets for audit and analysis.</p>
@@ -1981,7 +2085,7 @@ curl "{{ api_base }}/&lt;bucket&gt;?logging" \
<article id="notifications" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">20</span>
<span class="docs-section-kicker">21</span>
<h2 class="h4 mb-0">Notifications &amp; Webhooks</h2>
</div>
<p class="text-muted">Configure event notifications to trigger webhooks when objects are created or deleted.</p>
@@ -2044,7 +2148,7 @@ curl -X PUT "{{ api_base }}/&lt;bucket&gt;?notification" \
<article id="select-content" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">21</span>
<span class="docs-section-kicker">22</span>
<h2 class="h4 mb-0">SelectObjectContent (SQL)</h2>
</div>
<p class="text-muted">Query CSV, JSON, or Parquet files directly using SQL without downloading the entire object.</p>
@@ -2089,7 +2193,7 @@ curl -X POST "{{ api_base }}/&lt;bucket&gt;/data.csv?select" \
<article id="advanced-ops" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">22</span>
<span class="docs-section-kicker">23</span>
<h2 class="h4 mb-0">Advanced S3 Operations</h2>
</div>
<p class="text-muted">Copy, move, and partially download objects using advanced S3 operations.</p>
@@ -2163,7 +2267,7 @@ curl "{{ api_base }}/&lt;bucket&gt;/&lt;key&gt;" \
<article id="acls" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">23</span>
<span class="docs-section-kicker">24</span>
<h2 class="h4 mb-0">Access Control Lists (ACLs)</h2>
</div>
<p class="text-muted">ACLs provide legacy-style permission management for buckets and objects.</p>
@@ -2217,7 +2321,7 @@ curl -X PUT "{{ api_base }}/&lt;bucket&gt;/&lt;key&gt;" \
<article id="tagging" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">24</span>
<span class="docs-section-kicker">25</span>
<h2 class="h4 mb-0">Object &amp; Bucket Tagging</h2>
</div>
<p class="text-muted">Add metadata tags to buckets and objects for organization, cost allocation, or lifecycle rule filtering.</p>
@@ -2276,7 +2380,7 @@ curl -X PUT "{{ api_base }}/&lt;bucket&gt;?tagging" \
<article id="website-hosting" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">25</span>
<span class="docs-section-kicker">26</span>
<h2 class="h4 mb-0">Static Website Hosting</h2>
</div>
<p class="text-muted">Host static websites directly from S3 buckets with custom index and error pages, served via custom domain mapping.</p>
@@ -2369,7 +2473,7 @@ server {
<article id="cors-config" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">26</span>
<span class="docs-section-kicker">27</span>
<h2 class="h4 mb-0">CORS Configuration</h2>
</div>
<p class="text-muted">Configure per-bucket Cross-Origin Resource Sharing rules to control which origins can access your bucket from a browser.</p>
@@ -2436,7 +2540,7 @@ curl -X DELETE "{{ api_base }}/&lt;bucket&gt;?cors" \
<article id="post-object" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">27</span>
<span class="docs-section-kicker">28</span>
<h2 class="h4 mb-0">PostObject (HTML Form Upload)</h2>
</div>
<p class="text-muted">Upload objects directly from an HTML form using browser-based POST uploads with policy-based authorization.</p>
@@ -2478,7 +2582,7 @@ curl -X DELETE "{{ api_base }}/&lt;bucket&gt;?cors" \
<article id="list-objects-v2" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">28</span>
<span class="docs-section-kicker">29</span>
<h2 class="h4 mb-0">List Objects API v2</h2>
</div>
<p class="text-muted">Use the v2 list API for improved pagination with continuation tokens instead of markers.</p>
@@ -2522,7 +2626,7 @@ curl "{{ api_base }}/&lt;bucket&gt;?list-type=2&amp;start-after=photos/2025/" \
<article id="upgrading" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">29</span>
<span class="docs-section-kicker">30</span>
<h2 class="h4 mb-0">Upgrading &amp; Updates</h2>
</div>
<p class="text-muted">How to safely update MyFSIO to a new version.</p>
@@ -2555,7 +2659,7 @@ cp -r logs/ logs-backup/</code></pre>
<article id="api-matrix" class="card shadow-sm docs-section">
<div class="card-body">
<div class="d-flex align-items-center gap-2 mb-3">
<span class="docs-section-kicker">30</span>
<span class="docs-section-kicker">31</span>
<h2 class="h4 mb-0">Full API Reference</h2>
</div>
<p class="text-muted">Complete list of all S3-compatible, admin, and KMS endpoints.</p>
@@ -2653,6 +2757,7 @@ POST /kms/generate-random # Generate random bytes</code></pre>
<li><a href="#quotas">Bucket Quotas</a></li>
<li><a href="#encryption">Encryption</a></li>
<li><a href="#lifecycle">Lifecycle Rules</a></li>
<li><a href="#garbage-collection">Garbage Collection</a></li>
<li><a href="#metrics">Metrics History</a></li>
<li><a href="#operation-metrics">Operation Metrics</a></li>
<li><a href="#troubleshooting">Troubleshooting</a></li>

350
tests/test_gc.py Normal file
View File

@@ -0,0 +1,350 @@
import json
import os
import time
from pathlib import Path
import pytest
from app.gc import GarbageCollector, GCResult
@pytest.fixture
def storage_root(tmp_path):
root = tmp_path / "data"
root.mkdir()
sys_root = root / ".myfsio.sys"
sys_root.mkdir()
(sys_root / "config").mkdir(parents=True)
(sys_root / "tmp").mkdir()
(sys_root / "multipart").mkdir()
(sys_root / "buckets").mkdir()
return root
@pytest.fixture
def gc(storage_root):
return GarbageCollector(
storage_root=storage_root,
interval_hours=1.0,
temp_file_max_age_hours=1.0,
multipart_max_age_days=1,
lock_file_max_age_hours=0.5,
dry_run=False,
)
def _make_old(path, hours=48):
old_time = time.time() - hours * 3600
os.utime(path, (old_time, old_time))
class TestTempFileCleanup:
def test_old_temp_files_deleted(self, storage_root, gc):
tmp_dir = storage_root / ".myfsio.sys" / "tmp"
old_file = tmp_dir / "abc123.tmp"
old_file.write_bytes(b"x" * 1000)
_make_old(old_file, hours=48)
result = gc.run_now()
assert result.temp_files_deleted == 1
assert result.temp_bytes_freed == 1000
assert not old_file.exists()
def test_recent_temp_files_kept(self, storage_root, gc):
tmp_dir = storage_root / ".myfsio.sys" / "tmp"
new_file = tmp_dir / "recent.tmp"
new_file.write_bytes(b"data")
result = gc.run_now()
assert result.temp_files_deleted == 0
assert new_file.exists()
def test_dry_run_keeps_files(self, storage_root, gc):
gc.dry_run = True
tmp_dir = storage_root / ".myfsio.sys" / "tmp"
old_file = tmp_dir / "stale.tmp"
old_file.write_bytes(b"x" * 500)
_make_old(old_file, hours=48)
result = gc.run_now()
assert result.temp_files_deleted == 1
assert result.temp_bytes_freed == 500
assert old_file.exists()
class TestMultipartCleanup:
def test_old_orphaned_multipart_deleted(self, storage_root, gc):
bucket = storage_root / "test-bucket"
bucket.mkdir()
mp_root = storage_root / ".myfsio.sys" / "multipart" / "test-bucket"
mp_root.mkdir(parents=True)
upload_dir = mp_root / "upload-123"
upload_dir.mkdir()
manifest = upload_dir / "manifest.json"
manifest.write_text(json.dumps({"upload_id": "upload-123", "object_key": "foo.txt"}))
part = upload_dir / "part-00001.part"
part.write_bytes(b"x" * 2000)
_make_old(manifest, hours=200)
_make_old(part, hours=200)
_make_old(upload_dir, hours=200)
result = gc.run_now()
assert result.multipart_uploads_deleted == 1
assert result.multipart_bytes_freed > 0
assert not upload_dir.exists()
def test_recent_multipart_kept(self, storage_root, gc):
bucket = storage_root / "test-bucket"
bucket.mkdir()
mp_root = storage_root / ".myfsio.sys" / "multipart" / "test-bucket"
mp_root.mkdir(parents=True)
upload_dir = mp_root / "upload-new"
upload_dir.mkdir()
manifest = upload_dir / "manifest.json"
manifest.write_text(json.dumps({"upload_id": "upload-new", "object_key": "bar.txt"}))
result = gc.run_now()
assert result.multipart_uploads_deleted == 0
assert upload_dir.exists()
def test_legacy_multipart_cleaned(self, storage_root, gc):
bucket = storage_root / "test-bucket"
bucket.mkdir()
legacy_mp = bucket / ".multipart" / "upload-old"
legacy_mp.mkdir(parents=True)
part = legacy_mp / "part-00001.part"
part.write_bytes(b"y" * 500)
_make_old(part, hours=200)
_make_old(legacy_mp, hours=200)
result = gc.run_now()
assert result.multipart_uploads_deleted == 1
class TestLockFileCleanup:
def test_stale_lock_files_deleted(self, storage_root, gc):
locks_dir = storage_root / ".myfsio.sys" / "buckets" / "test-bucket" / "locks"
locks_dir.mkdir(parents=True)
lock = locks_dir / "some_key.lock"
lock.write_text("")
_make_old(lock, hours=2)
result = gc.run_now()
assert result.lock_files_deleted == 1
assert not lock.exists()
def test_recent_lock_kept(self, storage_root, gc):
locks_dir = storage_root / ".myfsio.sys" / "buckets" / "test-bucket" / "locks"
locks_dir.mkdir(parents=True)
lock = locks_dir / "active.lock"
lock.write_text("")
result = gc.run_now()
assert result.lock_files_deleted == 0
assert lock.exists()
class TestOrphanedMetadataCleanup:
def test_legacy_orphaned_metadata_deleted(self, storage_root, gc):
bucket = storage_root / "test-bucket"
bucket.mkdir()
meta_dir = bucket / ".meta"
meta_dir.mkdir()
orphan = meta_dir / "deleted_file.txt.meta.json"
orphan.write_text(json.dumps({"etag": "abc"}))
result = gc.run_now()
assert result.orphaned_metadata_deleted == 1
assert not orphan.exists()
def test_valid_metadata_kept(self, storage_root, gc):
bucket = storage_root / "test-bucket"
bucket.mkdir()
obj = bucket / "exists.txt"
obj.write_text("hello")
meta_dir = bucket / ".meta"
meta_dir.mkdir()
meta = meta_dir / "exists.txt.meta.json"
meta.write_text(json.dumps({"etag": "abc"}))
result = gc.run_now()
assert result.orphaned_metadata_deleted == 0
assert meta.exists()
def test_index_orphaned_entries_cleaned(self, storage_root, gc):
bucket = storage_root / "test-bucket"
bucket.mkdir()
obj = bucket / "keep.txt"
obj.write_text("hello")
meta_dir = storage_root / ".myfsio.sys" / "buckets" / "test-bucket" / "meta"
meta_dir.mkdir(parents=True)
index = meta_dir / "_index.json"
index.write_text(json.dumps({"keep.txt": {"etag": "a"}, "gone.txt": {"etag": "b"}}))
result = gc.run_now()
assert result.orphaned_metadata_deleted == 1
updated = json.loads(index.read_text())
assert "keep.txt" in updated
assert "gone.txt" not in updated
class TestOrphanedVersionsCleanup:
def test_orphaned_versions_deleted(self, storage_root, gc):
bucket = storage_root / "test-bucket"
bucket.mkdir()
versions_dir = storage_root / ".myfsio.sys" / "buckets" / "test-bucket" / "versions" / "deleted_obj.txt"
versions_dir.mkdir(parents=True)
v_bin = versions_dir / "v1.bin"
v_json = versions_dir / "v1.json"
v_bin.write_bytes(b"old data" * 100)
v_json.write_text(json.dumps({"version_id": "v1", "size": 800}))
result = gc.run_now()
assert result.orphaned_versions_deleted == 2
assert result.orphaned_version_bytes_freed == 800
def test_active_versions_kept(self, storage_root, gc):
bucket = storage_root / "test-bucket"
bucket.mkdir()
obj = bucket / "active.txt"
obj.write_text("current")
versions_dir = storage_root / ".myfsio.sys" / "buckets" / "test-bucket" / "versions" / "active.txt"
versions_dir.mkdir(parents=True)
v_bin = versions_dir / "v1.bin"
v_bin.write_bytes(b"old version")
result = gc.run_now()
assert result.orphaned_versions_deleted == 0
assert v_bin.exists()
class TestEmptyDirCleanup:
def test_empty_dirs_removed(self, storage_root, gc):
empty = storage_root / ".myfsio.sys" / "buckets" / "test-bucket" / "locks" / "sub"
empty.mkdir(parents=True)
result = gc.run_now()
assert result.empty_dirs_removed > 0
assert not empty.exists()
class TestHistory:
def test_history_recorded(self, storage_root, gc):
gc.run_now()
history = gc.get_history()
assert len(history) == 1
assert "result" in history[0]
assert "timestamp" in history[0]
def test_multiple_runs(self, storage_root, gc):
gc.run_now()
gc.run_now()
gc.run_now()
history = gc.get_history()
assert len(history) == 3
assert history[0]["timestamp"] >= history[1]["timestamp"]
class TestStatus:
def test_get_status(self, storage_root, gc):
status = gc.get_status()
assert status["interval_hours"] == 1.0
assert status["dry_run"] is False
assert status["temp_file_max_age_hours"] == 1.0
assert status["multipart_max_age_days"] == 1
assert status["lock_file_max_age_hours"] == 0.5
class TestGCResult:
def test_total_bytes_freed(self):
r = GCResult(temp_bytes_freed=100, multipart_bytes_freed=200, orphaned_version_bytes_freed=300)
assert r.total_bytes_freed == 600
def test_has_work(self):
assert not GCResult().has_work
assert GCResult(temp_files_deleted=1).has_work
assert GCResult(lock_files_deleted=1).has_work
assert GCResult(empty_dirs_removed=1).has_work
class TestAdminAPI:
@pytest.fixture
def gc_app(self, tmp_path):
from app import create_api_app
storage_root = tmp_path / "data"
iam_config = tmp_path / "iam.json"
bucket_policies = tmp_path / "bucket_policies.json"
iam_payload = {
"users": [
{
"access_key": "admin",
"secret_key": "adminsecret",
"display_name": "Admin",
"policies": [{"bucket": "*", "actions": ["list", "read", "write", "delete", "policy", "iam:*"]}],
}
]
}
iam_config.write_text(json.dumps(iam_payload))
flask_app = create_api_app({
"TESTING": True,
"SECRET_KEY": "testing",
"STORAGE_ROOT": storage_root,
"IAM_CONFIG": iam_config,
"BUCKET_POLICY_PATH": bucket_policies,
"GC_ENABLED": True,
"GC_INTERVAL_HOURS": 1.0,
})
yield flask_app
gc = flask_app.extensions.get("gc")
if gc:
gc.stop()
def test_gc_status(self, gc_app):
client = gc_app.test_client()
resp = client.get("/admin/gc/status", headers={"X-Access-Key": "admin", "X-Secret-Key": "adminsecret"})
assert resp.status_code == 200
data = resp.get_json()
assert data["enabled"] is True
def test_gc_run(self, gc_app):
client = gc_app.test_client()
resp = client.post(
"/admin/gc/run",
headers={"X-Access-Key": "admin", "X-Secret-Key": "adminsecret"},
content_type="application/json",
)
assert resp.status_code == 200
data = resp.get_json()
assert "temp_files_deleted" in data
def test_gc_dry_run(self, gc_app):
client = gc_app.test_client()
resp = client.post(
"/admin/gc/run",
headers={"X-Access-Key": "admin", "X-Secret-Key": "adminsecret"},
data=json.dumps({"dry_run": True}),
content_type="application/json",
)
assert resp.status_code == 200
data = resp.get_json()
assert "temp_files_deleted" in data
def test_gc_history(self, gc_app):
client = gc_app.test_client()
client.post("/admin/gc/run", headers={"X-Access-Key": "admin", "X-Secret-Key": "adminsecret"})
resp = client.get("/admin/gc/history", headers={"X-Access-Key": "admin", "X-Secret-Key": "adminsecret"})
assert resp.status_code == 200
data = resp.get_json()
assert len(data["executions"]) >= 1
def test_gc_requires_admin(self, gc_app):
iam = gc_app.extensions["iam"]
user = iam.create_user(display_name="Regular")
client = gc_app.test_client()
resp = client.get(
"/admin/gc/status",
headers={"X-Access-Key": user["access_key"], "X-Secret-Key": user["secret_key"]},
)
assert resp.status_code == 403