Add replication failure tracking and lifecycle execution history

2026-01-05 00:18:08 +08:00
parent 9ab750650c
commit 93a5aa6618
6 changed files with 720 additions and 9 deletions

View File

@@ -124,7 +124,7 @@ def create_app(
    )
    connections = ConnectionStore(connections_path)
-    replication = ReplicationManager(storage, connections, replication_rules_path)
+    replication = ReplicationManager(storage, connections, replication_rules_path, storage_root)
    encryption_config = {
        "encryption_enabled": app.config.get("ENCRYPTION_ENABLED", False),
@@ -156,6 +156,7 @@ def create_app(
    lifecycle_manager = LifecycleManager(
        base_storage,
        interval_seconds=app.config.get("LIFECYCLE_INTERVAL_SECONDS", 3600),
+        storage_root=storage_root,
    )
    lifecycle_manager.start()

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import json
import logging
import threading
import time
@@ -23,13 +24,104 @@ class LifecycleResult:
    execution_time_seconds: float = 0.0


@dataclass
class LifecycleExecutionRecord:
    timestamp: float
    bucket_name: str
    objects_deleted: int
    versions_deleted: int
    uploads_aborted: int
    errors: List[str]
    execution_time_seconds: float

    def to_dict(self) -> dict:
        return {
            "timestamp": self.timestamp,
            "bucket_name": self.bucket_name,
            "objects_deleted": self.objects_deleted,
            "versions_deleted": self.versions_deleted,
            "uploads_aborted": self.uploads_aborted,
            "errors": self.errors,
            "execution_time_seconds": self.execution_time_seconds,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "LifecycleExecutionRecord":
        return cls(
            timestamp=data["timestamp"],
            bucket_name=data["bucket_name"],
            objects_deleted=data["objects_deleted"],
            versions_deleted=data["versions_deleted"],
            uploads_aborted=data["uploads_aborted"],
            errors=data.get("errors", []),
            execution_time_seconds=data["execution_time_seconds"],
        )

    @classmethod
    def from_result(cls, result: LifecycleResult) -> "LifecycleExecutionRecord":
        return cls(
            timestamp=time.time(),
            bucket_name=result.bucket_name,
            objects_deleted=result.objects_deleted,
            versions_deleted=result.versions_deleted,
            uploads_aborted=result.uploads_aborted,
            errors=result.errors.copy(),
            execution_time_seconds=result.execution_time_seconds,
        )


class LifecycleHistoryStore:
    MAX_HISTORY_PER_BUCKET = 50

    def __init__(self, storage_root: Path) -> None:
        self.storage_root = storage_root
        self._lock = threading.Lock()

    def _get_history_path(self, bucket_name: str) -> Path:
        return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "lifecycle_history.json"

    def load_history(self, bucket_name: str) -> List[LifecycleExecutionRecord]:
        path = self._get_history_path(bucket_name)
        if not path.exists():
            return []
        try:
            with open(path, "r") as f:
                data = json.load(f)
            return [LifecycleExecutionRecord.from_dict(d) for d in data.get("executions", [])]
        except (OSError, ValueError, KeyError) as e:
            logger.error(f"Failed to load lifecycle history for {bucket_name}: {e}")
            return []

    def save_history(self, bucket_name: str, records: List[LifecycleExecutionRecord]) -> None:
        path = self._get_history_path(bucket_name)
        path.parent.mkdir(parents=True, exist_ok=True)
        data = {"executions": [r.to_dict() for r in records[:self.MAX_HISTORY_PER_BUCKET]]}
        try:
            with open(path, "w") as f:
                json.dump(data, f, indent=2)
        except OSError as e:
            logger.error(f"Failed to save lifecycle history for {bucket_name}: {e}")

    def add_record(self, bucket_name: str, record: LifecycleExecutionRecord) -> None:
        with self._lock:
            records = self.load_history(bucket_name)
            records.insert(0, record)
            self.save_history(bucket_name, records)

    def get_history(self, bucket_name: str, limit: int = 50, offset: int = 0) -> List[LifecycleExecutionRecord]:
        records = self.load_history(bucket_name)
        return records[offset:offset + limit]


class LifecycleManager:
-    def __init__(self, storage: ObjectStorage, interval_seconds: int = 3600):
+    def __init__(self, storage: ObjectStorage, interval_seconds: int = 3600, storage_root: Optional[Path] = None):
        self.storage = storage
        self.interval_seconds = interval_seconds
        self.storage_root = storage_root
        self._timer: Optional[threading.Timer] = None
        self._shutdown = False
        self._lock = threading.Lock()
        self.history_store = LifecycleHistoryStore(storage_root) if storage_root else None

    def start(self) -> None:
        if self._timer is not None:
@@ -98,12 +190,15 @@ class LifecycleManager:
            logger.error(f"Lifecycle enforcement error for {bucket_name}: {e}")
        result.execution_time_seconds = time.time() - start_time
-        if result.objects_deleted > 0 or result.versions_deleted > 0 or result.uploads_aborted > 0:
+        if result.objects_deleted > 0 or result.versions_deleted > 0 or result.uploads_aborted > 0 or result.errors:
            logger.info(
                f"Lifecycle enforcement for {bucket_name}: "
                f"deleted={result.objects_deleted}, versions={result.versions_deleted}, "
                f"aborted={result.uploads_aborted}, time={result.execution_time_seconds:.2f}s"
            )
        if self.history_store:
            record = LifecycleExecutionRecord.from_result(result)
            self.history_store.add_record(bucket_name, record)
        return result

    def _enforce_expiration(
@@ -233,3 +328,8 @@ class LifecycleManager:
        if bucket_name:
            return {bucket_name: self.enforce_rules(bucket_name)}
        return self.enforce_all_buckets()

    def get_execution_history(self, bucket_name: str, limit: int = 50, offset: int = 0) -> List[LifecycleExecutionRecord]:
        if not self.history_store:
            return []
        return self.history_store.get_history(bucket_name, limit, offset)
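
For a sense of how the new history store behaves on its own, a minimal sketch along these lines should work; the app.lifecycle import path and the /tmp/myfsio scratch root are assumptions for illustration, while the class, field, and method names all come from the diff above.

# Sketch only: exercise LifecycleHistoryStore outside the manager.
from pathlib import Path
import time

from app.lifecycle import LifecycleExecutionRecord, LifecycleHistoryStore  # assumed module path

store = LifecycleHistoryStore(Path("/tmp/myfsio"))  # assumed scratch storage root
record = LifecycleExecutionRecord(
    timestamp=time.time(),
    bucket_name="demo-bucket",
    objects_deleted=3,
    versions_deleted=0,
    uploads_aborted=1,
    errors=[],
    execution_time_seconds=0.42,
)
store.add_record("demo-bucket", record)

# Newest record sits at index 0; save_history caps the file at MAX_HISTORY_PER_BUCKET entries.
print(store.get_history("demo-bucket", limit=10)[0].to_dict())

Because add_record takes the store lock and save_history truncates to the newest 50 entries, the per-bucket JSON file stays bounded even for buckets with frequent lifecycle runs.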

View File

@@ -8,7 +8,7 @@ import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
import boto3
from botocore.config import Config
@@ -87,6 +87,40 @@ class ReplicationStats:
        )


@dataclass
class ReplicationFailure:
    object_key: str
    error_message: str
    timestamp: float
    failure_count: int
    bucket_name: str
    action: str
    last_error_code: Optional[str] = None

    def to_dict(self) -> dict:
        return {
            "object_key": self.object_key,
            "error_message": self.error_message,
            "timestamp": self.timestamp,
            "failure_count": self.failure_count,
            "bucket_name": self.bucket_name,
            "action": self.action,
            "last_error_code": self.last_error_code,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ReplicationFailure":
        return cls(
            object_key=data["object_key"],
            error_message=data["error_message"],
            timestamp=data["timestamp"],
            failure_count=data["failure_count"],
            bucket_name=data["bucket_name"],
            action=data["action"],
            last_error_code=data.get("last_error_code"),
        )


@dataclass
class ReplicationRule:
    bucket_name: str
@@ -120,15 +154,86 @@ class ReplicationRule:
        return rule


class ReplicationFailureStore:
    MAX_FAILURES_PER_BUCKET = 50

    def __init__(self, storage_root: Path) -> None:
        self.storage_root = storage_root
        self._lock = threading.Lock()

    def _get_failures_path(self, bucket_name: str) -> Path:
        return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "replication_failures.json"

    def load_failures(self, bucket_name: str) -> List[ReplicationFailure]:
        path = self._get_failures_path(bucket_name)
        if not path.exists():
            return []
        try:
            with open(path, "r") as f:
                data = json.load(f)
            return [ReplicationFailure.from_dict(d) for d in data.get("failures", [])]
        except (OSError, ValueError, KeyError) as e:
            logger.error(f"Failed to load replication failures for {bucket_name}: {e}")
            return []

    def save_failures(self, bucket_name: str, failures: List[ReplicationFailure]) -> None:
        path = self._get_failures_path(bucket_name)
        path.parent.mkdir(parents=True, exist_ok=True)
        data = {"failures": [f.to_dict() for f in failures[:self.MAX_FAILURES_PER_BUCKET]]}
        try:
            with open(path, "w") as f:
                json.dump(data, f, indent=2)
        except OSError as e:
            logger.error(f"Failed to save replication failures for {bucket_name}: {e}")

    def add_failure(self, bucket_name: str, failure: ReplicationFailure) -> None:
        with self._lock:
            failures = self.load_failures(bucket_name)
            existing = next((f for f in failures if f.object_key == failure.object_key), None)
            if existing:
                existing.failure_count += 1
                existing.timestamp = failure.timestamp
                existing.error_message = failure.error_message
                existing.last_error_code = failure.last_error_code
            else:
                failures.insert(0, failure)
            self.save_failures(bucket_name, failures)

    def remove_failure(self, bucket_name: str, object_key: str) -> bool:
        with self._lock:
            failures = self.load_failures(bucket_name)
            original_len = len(failures)
            failures = [f for f in failures if f.object_key != object_key]
            if len(failures) < original_len:
                self.save_failures(bucket_name, failures)
                return True
            return False

    def clear_failures(self, bucket_name: str) -> None:
        with self._lock:
            path = self._get_failures_path(bucket_name)
            if path.exists():
                path.unlink()

    def get_failure(self, bucket_name: str, object_key: str) -> Optional[ReplicationFailure]:
        failures = self.load_failures(bucket_name)
        return next((f for f in failures if f.object_key == object_key), None)

    def get_failure_count(self, bucket_name: str) -> int:
        return len(self.load_failures(bucket_name))


class ReplicationManager:
-    def __init__(self, storage: ObjectStorage, connections: ConnectionStore, rules_path: Path) -> None:
+    def __init__(self, storage: ObjectStorage, connections: ConnectionStore, rules_path: Path, storage_root: Path) -> None:
        self.storage = storage
        self.connections = connections
        self.rules_path = rules_path
        self.storage_root = storage_root
        self._rules: Dict[str, ReplicationRule] = {}
        self._stats_lock = threading.Lock()
        self._executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="ReplicationWorker")
        self._shutdown = False
        self.failure_store = ReplicationFailureStore(storage_root)
        self.reload_rules()

    def shutdown(self, wait: bool = True) -> None:
@@ -331,8 +436,19 @@ class ReplicationManager:
                s3.delete_object(Bucket=rule.target_bucket, Key=object_key)
                logger.info(f"Replicated DELETE {bucket_name}/{object_key} to {conn.name} ({rule.target_bucket})")
                self._update_last_sync(bucket_name, object_key)
                self.failure_store.remove_failure(bucket_name, object_key)
            except ClientError as e:
                error_code = e.response.get('Error', {}).get('Code')
                logger.error(f"Replication DELETE failed for {bucket_name}/{object_key}: {e}")
                self.failure_store.add_failure(bucket_name, ReplicationFailure(
                    object_key=object_key,
                    error_message=str(e),
                    timestamp=time.time(),
                    failure_count=1,
                    bucket_name=bucket_name,
                    action="delete",
                    last_error_code=error_code,
                ))
            return

        try:
@@ -405,9 +521,89 @@ class ReplicationManager:
            logger.info(f"Replicated {bucket_name}/{object_key} to {conn.name} ({rule.target_bucket})")
            self._update_last_sync(bucket_name, object_key)
            self.failure_store.remove_failure(bucket_name, object_key)
        except (ClientError, OSError, ValueError) as e:
            error_code = None
            if isinstance(e, ClientError):
                error_code = e.response.get('Error', {}).get('Code')
            logger.error(f"Replication failed for {bucket_name}/{object_key}: {e}")
-        except Exception:
            self.failure_store.add_failure(bucket_name, ReplicationFailure(
                object_key=object_key,
                error_message=str(e),
                timestamp=time.time(),
                failure_count=1,
                bucket_name=bucket_name,
                action=action,
                last_error_code=error_code,
            ))
+        except Exception as e:
            logger.exception(f"Unexpected error during replication for {bucket_name}/{object_key}")
            self.failure_store.add_failure(bucket_name, ReplicationFailure(
                object_key=object_key,
                error_message=str(e),
                timestamp=time.time(),
                failure_count=1,
                bucket_name=bucket_name,
                action=action,
                last_error_code=None,
            ))

    def get_failed_items(self, bucket_name: str, limit: int = 50, offset: int = 0) -> List[ReplicationFailure]:
        failures = self.failure_store.load_failures(bucket_name)
        return failures[offset:offset + limit]

    def get_failure_count(self, bucket_name: str) -> int:
        return self.failure_store.get_failure_count(bucket_name)

    def retry_failed_item(self, bucket_name: str, object_key: str) -> bool:
        failure = self.failure_store.get_failure(bucket_name, object_key)
        if not failure:
            return False
        rule = self.get_rule(bucket_name)
        if not rule or not rule.enabled:
            return False
        connection = self.connections.get(rule.target_connection_id)
        if not connection:
            logger.warning(f"Cannot retry: Connection {rule.target_connection_id} not found")
            return False
        if not self.check_endpoint_health(connection):
            logger.warning(f"Cannot retry: Endpoint {connection.name} is not reachable")
            return False
        self._executor.submit(self._replicate_task, bucket_name, object_key, rule, connection, failure.action)
        return True

    def retry_all_failed(self, bucket_name: str) -> Dict[str, int]:
        failures = self.failure_store.load_failures(bucket_name)
        if not failures:
            return {"submitted": 0, "skipped": 0}
        rule = self.get_rule(bucket_name)
        if not rule or not rule.enabled:
            return {"submitted": 0, "skipped": len(failures)}
        connection = self.connections.get(rule.target_connection_id)
        if not connection:
            logger.warning(f"Cannot retry: Connection {rule.target_connection_id} not found")
            return {"submitted": 0, "skipped": len(failures)}
        if not self.check_endpoint_health(connection):
            logger.warning(f"Cannot retry: Endpoint {connection.name} is not reachable")
            return {"submitted": 0, "skipped": len(failures)}
        submitted = 0
        for failure in failures:
            self._executor.submit(self._replicate_task, bucket_name, failure.object_key, rule, connection, failure.action)
            submitted += 1
        return {"submitted": submitted, "skipped": 0}

    def dismiss_failure(self, bucket_name: str, object_key: str) -> bool:
        return self.failure_store.remove_failure(bucket_name, object_key)

    def clear_failures(self, bucket_name: str) -> None:
        self.failure_store.clear_failures(bucket_name)
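
The failure store keeps at most one entry per object key and bumps failure_count on repeat errors. A small sketch of that round trip follows; the app.replication import path and the /tmp/myfsio scratch root are assumptions, while the class, field, and method names come from the diff above.

# Sketch only: add, deduplicate, inspect, and dismiss a replication failure.
import time
from pathlib import Path

from app.replication import ReplicationFailure, ReplicationFailureStore  # assumed module path

store = ReplicationFailureStore(Path("/tmp/myfsio"))  # assumed scratch storage root
failure = ReplicationFailure(
    object_key="photos/cat.jpg",
    error_message="AccessDenied",
    timestamp=time.time(),
    failure_count=1,
    bucket_name="demo-bucket",
    action="delete",
    last_error_code="AccessDenied",
)
store.add_failure("demo-bucket", failure)
store.add_failure("demo-bucket", failure)  # same key: failure_count is bumped, no new row added

assert store.get_failure("demo-bucket", "photos/cat.jpg").failure_count == 2
assert store.remove_failure("demo-bucket", "photos/cat.jpg")  # True when an entry was deleted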

109 app/ui.py
View File

@@ -1590,6 +1590,84 @@ def get_replication_status(bucket_name: str):
    })


@ui_bp.get("/buckets/<bucket_name>/replication/failures")
def get_replication_failures(bucket_name: str):
    principal = _current_principal()
    try:
        _authorize_ui(principal, bucket_name, "replication")
    except IamError:
        return jsonify({"error": "Access denied"}), 403
    limit = request.args.get("limit", 50, type=int)
    offset = request.args.get("offset", 0, type=int)
    failures = _replication().get_failed_items(bucket_name, limit, offset)
    total = _replication().get_failure_count(bucket_name)
    return jsonify({
        "failures": [f.to_dict() for f in failures],
        "total": total,
        "limit": limit,
        "offset": offset,
    })


@ui_bp.post("/buckets/<bucket_name>/replication/failures/<path:object_key>/retry")
def retry_replication_failure(bucket_name: str, object_key: str):
    principal = _current_principal()
    try:
        _authorize_ui(principal, bucket_name, "replication")
    except IamError:
        return jsonify({"error": "Access denied"}), 403
    success = _replication().retry_failed_item(bucket_name, object_key)
    if success:
        return jsonify({"status": "submitted", "object_key": object_key})
    return jsonify({"error": "Failed to submit retry"}), 400


@ui_bp.post("/buckets/<bucket_name>/replication/failures/retry-all")
def retry_all_replication_failures(bucket_name: str):
    principal = _current_principal()
    try:
        _authorize_ui(principal, bucket_name, "replication")
    except IamError:
        return jsonify({"error": "Access denied"}), 403
    result = _replication().retry_all_failed(bucket_name)
    return jsonify({
        "status": "submitted",
        "submitted": result["submitted"],
        "skipped": result["skipped"],
    })


@ui_bp.delete("/buckets/<bucket_name>/replication/failures/<path:object_key>")
def dismiss_replication_failure(bucket_name: str, object_key: str):
    principal = _current_principal()
    try:
        _authorize_ui(principal, bucket_name, "replication")
    except IamError:
        return jsonify({"error": "Access denied"}), 403
    success = _replication().dismiss_failure(bucket_name, object_key)
    if success:
        return jsonify({"status": "dismissed", "object_key": object_key})
    return jsonify({"error": "Failure not found"}), 404


@ui_bp.delete("/buckets/<bucket_name>/replication/failures")
def clear_replication_failures(bucket_name: str):
    principal = _current_principal()
    try:
        _authorize_ui(principal, bucket_name, "replication")
    except IamError:
        return jsonify({"error": "Access denied"}), 403
    _replication().clear_failures(bucket_name)
    return jsonify({"status": "cleared"})


@ui_bp.get("/connections/<connection_id>/health")
def check_connection_health(connection_id: str):
    """Check if a connection endpoint is reachable."""
@@ -1742,6 +1820,37 @@ def bucket_lifecycle(bucket_name: str):
    return jsonify({"status": "ok", "message": "Lifecycle configuration saved", "rules": validated_rules})


@ui_bp.get("/buckets/<bucket_name>/lifecycle/history")
def get_lifecycle_history(bucket_name: str):
    principal = _current_principal()
    try:
        _authorize_ui(principal, bucket_name, "policy")
    except IamError:
        return jsonify({"error": "Access denied"}), 403
    limit = request.args.get("limit", 50, type=int)
    offset = request.args.get("offset", 0, type=int)
    lifecycle_manager = current_app.extensions.get("lifecycle")
    if not lifecycle_manager:
        return jsonify({
            "executions": [],
            "total": 0,
            "limit": limit,
            "offset": offset,
            "enabled": False,
        })
    records = lifecycle_manager.get_execution_history(bucket_name, limit, offset)
    return jsonify({
        "executions": [r.to_dict() for r in records],
        "total": len(lifecycle_manager.get_execution_history(bucket_name, 1000, 0)),
        "limit": limit,
        "offset": offset,
        "enabled": True,
    })


@ui_bp.route("/buckets/<bucket_name>/cors", methods=["GET", "POST", "DELETE"])
def bucket_cors(bucket_name: str):
    principal = _current_principal()
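
The lifecycle history endpoint mirrors the same pattern. Under the same assumptions as the replication sketch above (create_app() callable with defaults, ui_bp mounted at the root, session already authorised), reading it back looks roughly like this.

# Sketch only: page through persisted lifecycle executions for a bucket.
from app import create_app  # assumed factory import path

client = create_app().test_client()
data = client.get("/buckets/demo-bucket/lifecycle/history?limit=20").get_json()
print(data["enabled"], data["total"])
for execution in data["executions"]:
    print(execution["timestamp"], execution["objects_deleted"], execution["errors"])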