Add replication failure tracking and lifecycle execution history
This commit is contained in:
@@ -124,7 +124,7 @@ def create_app(
|
||||
)
|
||||
|
||||
connections = ConnectionStore(connections_path)
|
||||
replication = ReplicationManager(storage, connections, replication_rules_path)
|
||||
replication = ReplicationManager(storage, connections, replication_rules_path, storage_root)
|
||||
|
||||
encryption_config = {
|
||||
"encryption_enabled": app.config.get("ENCRYPTION_ENABLED", False),
|
||||
@@ -156,6 +156,7 @@ def create_app(
|
||||
lifecycle_manager = LifecycleManager(
|
||||
base_storage,
|
||||
interval_seconds=app.config.get("LIFECYCLE_INTERVAL_SECONDS", 3600),
|
||||
storage_root=storage_root,
|
||||
)
|
||||
lifecycle_manager.start()
|
||||
|
||||
|
||||
104
app/lifecycle.py
104
app/lifecycle.py
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
@@ -23,13 +24,104 @@ class LifecycleResult:
|
||||
execution_time_seconds: float = 0.0
|
||||
|
||||
|
||||
@dataclass
|
||||
class LifecycleExecutionRecord:
|
||||
timestamp: float
|
||||
bucket_name: str
|
||||
objects_deleted: int
|
||||
versions_deleted: int
|
||||
uploads_aborted: int
|
||||
errors: List[str]
|
||||
execution_time_seconds: float
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"timestamp": self.timestamp,
|
||||
"bucket_name": self.bucket_name,
|
||||
"objects_deleted": self.objects_deleted,
|
||||
"versions_deleted": self.versions_deleted,
|
||||
"uploads_aborted": self.uploads_aborted,
|
||||
"errors": self.errors,
|
||||
"execution_time_seconds": self.execution_time_seconds,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict) -> "LifecycleExecutionRecord":
|
||||
return cls(
|
||||
timestamp=data["timestamp"],
|
||||
bucket_name=data["bucket_name"],
|
||||
objects_deleted=data["objects_deleted"],
|
||||
versions_deleted=data["versions_deleted"],
|
||||
uploads_aborted=data["uploads_aborted"],
|
||||
errors=data.get("errors", []),
|
||||
execution_time_seconds=data["execution_time_seconds"],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_result(cls, result: LifecycleResult) -> "LifecycleExecutionRecord":
|
||||
return cls(
|
||||
timestamp=time.time(),
|
||||
bucket_name=result.bucket_name,
|
||||
objects_deleted=result.objects_deleted,
|
||||
versions_deleted=result.versions_deleted,
|
||||
uploads_aborted=result.uploads_aborted,
|
||||
errors=result.errors.copy(),
|
||||
execution_time_seconds=result.execution_time_seconds,
|
||||
)
|
||||
|
||||
|
||||
class LifecycleHistoryStore:
|
||||
MAX_HISTORY_PER_BUCKET = 50
|
||||
|
||||
def __init__(self, storage_root: Path) -> None:
|
||||
self.storage_root = storage_root
|
||||
self._lock = threading.Lock()
|
||||
|
||||
def _get_history_path(self, bucket_name: str) -> Path:
|
||||
return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "lifecycle_history.json"
|
||||
|
||||
def load_history(self, bucket_name: str) -> List[LifecycleExecutionRecord]:
|
||||
path = self._get_history_path(bucket_name)
|
||||
if not path.exists():
|
||||
return []
|
||||
try:
|
||||
with open(path, "r") as f:
|
||||
data = json.load(f)
|
||||
return [LifecycleExecutionRecord.from_dict(d) for d in data.get("executions", [])]
|
||||
except (OSError, ValueError, KeyError) as e:
|
||||
logger.error(f"Failed to load lifecycle history for {bucket_name}: {e}")
|
||||
return []
|
||||
|
||||
def save_history(self, bucket_name: str, records: List[LifecycleExecutionRecord]) -> None:
|
||||
path = self._get_history_path(bucket_name)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
data = {"executions": [r.to_dict() for r in records[:self.MAX_HISTORY_PER_BUCKET]]}
|
||||
try:
|
||||
with open(path, "w") as f:
|
||||
json.dump(data, f, indent=2)
|
||||
except OSError as e:
|
||||
logger.error(f"Failed to save lifecycle history for {bucket_name}: {e}")
|
||||
|
||||
def add_record(self, bucket_name: str, record: LifecycleExecutionRecord) -> None:
|
||||
with self._lock:
|
||||
records = self.load_history(bucket_name)
|
||||
records.insert(0, record)
|
||||
self.save_history(bucket_name, records)
|
||||
|
||||
def get_history(self, bucket_name: str, limit: int = 50, offset: int = 0) -> List[LifecycleExecutionRecord]:
|
||||
records = self.load_history(bucket_name)
|
||||
return records[offset:offset + limit]
|
||||
|
||||
|
||||
class LifecycleManager:
|
||||
def __init__(self, storage: ObjectStorage, interval_seconds: int = 3600):
|
||||
def __init__(self, storage: ObjectStorage, interval_seconds: int = 3600, storage_root: Optional[Path] = None):
|
||||
self.storage = storage
|
||||
self.interval_seconds = interval_seconds
|
||||
self.storage_root = storage_root
|
||||
self._timer: Optional[threading.Timer] = None
|
||||
self._shutdown = False
|
||||
self._lock = threading.Lock()
|
||||
self.history_store = LifecycleHistoryStore(storage_root) if storage_root else None
|
||||
|
||||
def start(self) -> None:
|
||||
if self._timer is not None:
|
||||
@@ -98,12 +190,15 @@ class LifecycleManager:
|
||||
logger.error(f"Lifecycle enforcement error for {bucket_name}: {e}")
|
||||
|
||||
result.execution_time_seconds = time.time() - start_time
|
||||
if result.objects_deleted > 0 or result.versions_deleted > 0 or result.uploads_aborted > 0:
|
||||
if result.objects_deleted > 0 or result.versions_deleted > 0 or result.uploads_aborted > 0 or result.errors:
|
||||
logger.info(
|
||||
f"Lifecycle enforcement for {bucket_name}: "
|
||||
f"deleted={result.objects_deleted}, versions={result.versions_deleted}, "
|
||||
f"aborted={result.uploads_aborted}, time={result.execution_time_seconds:.2f}s"
|
||||
)
|
||||
if self.history_store:
|
||||
record = LifecycleExecutionRecord.from_result(result)
|
||||
self.history_store.add_record(bucket_name, record)
|
||||
return result
|
||||
|
||||
def _enforce_expiration(
|
||||
@@ -233,3 +328,8 @@ class LifecycleManager:
|
||||
if bucket_name:
|
||||
return {bucket_name: self.enforce_rules(bucket_name)}
|
||||
return self.enforce_all_buckets()
|
||||
|
||||
def get_execution_history(self, bucket_name: str, limit: int = 50, offset: int = 0) -> List[LifecycleExecutionRecord]:
|
||||
if not self.history_store:
|
||||
return []
|
||||
return self.history_store.get_history(bucket_name, limit, offset)
|
||||
|
||||
@@ -8,7 +8,7 @@ import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import boto3
|
||||
from botocore.config import Config
|
||||
@@ -87,6 +87,40 @@ class ReplicationStats:
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ReplicationFailure:
|
||||
object_key: str
|
||||
error_message: str
|
||||
timestamp: float
|
||||
failure_count: int
|
||||
bucket_name: str
|
||||
action: str
|
||||
last_error_code: Optional[str] = None
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"object_key": self.object_key,
|
||||
"error_message": self.error_message,
|
||||
"timestamp": self.timestamp,
|
||||
"failure_count": self.failure_count,
|
||||
"bucket_name": self.bucket_name,
|
||||
"action": self.action,
|
||||
"last_error_code": self.last_error_code,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict) -> "ReplicationFailure":
|
||||
return cls(
|
||||
object_key=data["object_key"],
|
||||
error_message=data["error_message"],
|
||||
timestamp=data["timestamp"],
|
||||
failure_count=data["failure_count"],
|
||||
bucket_name=data["bucket_name"],
|
||||
action=data["action"],
|
||||
last_error_code=data.get("last_error_code"),
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ReplicationRule:
|
||||
bucket_name: str
|
||||
@@ -120,15 +154,86 @@ class ReplicationRule:
|
||||
return rule
|
||||
|
||||
|
||||
class ReplicationFailureStore:
|
||||
MAX_FAILURES_PER_BUCKET = 50
|
||||
|
||||
def __init__(self, storage_root: Path) -> None:
|
||||
self.storage_root = storage_root
|
||||
self._lock = threading.Lock()
|
||||
|
||||
def _get_failures_path(self, bucket_name: str) -> Path:
|
||||
return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "replication_failures.json"
|
||||
|
||||
def load_failures(self, bucket_name: str) -> List[ReplicationFailure]:
|
||||
path = self._get_failures_path(bucket_name)
|
||||
if not path.exists():
|
||||
return []
|
||||
try:
|
||||
with open(path, "r") as f:
|
||||
data = json.load(f)
|
||||
return [ReplicationFailure.from_dict(d) for d in data.get("failures", [])]
|
||||
except (OSError, ValueError, KeyError) as e:
|
||||
logger.error(f"Failed to load replication failures for {bucket_name}: {e}")
|
||||
return []
|
||||
|
||||
def save_failures(self, bucket_name: str, failures: List[ReplicationFailure]) -> None:
|
||||
path = self._get_failures_path(bucket_name)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
data = {"failures": [f.to_dict() for f in failures[:self.MAX_FAILURES_PER_BUCKET]]}
|
||||
try:
|
||||
with open(path, "w") as f:
|
||||
json.dump(data, f, indent=2)
|
||||
except OSError as e:
|
||||
logger.error(f"Failed to save replication failures for {bucket_name}: {e}")
|
||||
|
||||
def add_failure(self, bucket_name: str, failure: ReplicationFailure) -> None:
|
||||
with self._lock:
|
||||
failures = self.load_failures(bucket_name)
|
||||
existing = next((f for f in failures if f.object_key == failure.object_key), None)
|
||||
if existing:
|
||||
existing.failure_count += 1
|
||||
existing.timestamp = failure.timestamp
|
||||
existing.error_message = failure.error_message
|
||||
existing.last_error_code = failure.last_error_code
|
||||
else:
|
||||
failures.insert(0, failure)
|
||||
self.save_failures(bucket_name, failures)
|
||||
|
||||
def remove_failure(self, bucket_name: str, object_key: str) -> bool:
|
||||
with self._lock:
|
||||
failures = self.load_failures(bucket_name)
|
||||
original_len = len(failures)
|
||||
failures = [f for f in failures if f.object_key != object_key]
|
||||
if len(failures) < original_len:
|
||||
self.save_failures(bucket_name, failures)
|
||||
return True
|
||||
return False
|
||||
|
||||
def clear_failures(self, bucket_name: str) -> None:
|
||||
with self._lock:
|
||||
path = self._get_failures_path(bucket_name)
|
||||
if path.exists():
|
||||
path.unlink()
|
||||
|
||||
def get_failure(self, bucket_name: str, object_key: str) -> Optional[ReplicationFailure]:
|
||||
failures = self.load_failures(bucket_name)
|
||||
return next((f for f in failures if f.object_key == object_key), None)
|
||||
|
||||
def get_failure_count(self, bucket_name: str) -> int:
|
||||
return len(self.load_failures(bucket_name))
|
||||
|
||||
|
||||
class ReplicationManager:
|
||||
def __init__(self, storage: ObjectStorage, connections: ConnectionStore, rules_path: Path) -> None:
|
||||
def __init__(self, storage: ObjectStorage, connections: ConnectionStore, rules_path: Path, storage_root: Path) -> None:
|
||||
self.storage = storage
|
||||
self.connections = connections
|
||||
self.rules_path = rules_path
|
||||
self.storage_root = storage_root
|
||||
self._rules: Dict[str, ReplicationRule] = {}
|
||||
self._stats_lock = threading.Lock()
|
||||
self._executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="ReplicationWorker")
|
||||
self._shutdown = False
|
||||
self.failure_store = ReplicationFailureStore(storage_root)
|
||||
self.reload_rules()
|
||||
|
||||
def shutdown(self, wait: bool = True) -> None:
|
||||
@@ -331,8 +436,19 @@ class ReplicationManager:
|
||||
s3.delete_object(Bucket=rule.target_bucket, Key=object_key)
|
||||
logger.info(f"Replicated DELETE {bucket_name}/{object_key} to {conn.name} ({rule.target_bucket})")
|
||||
self._update_last_sync(bucket_name, object_key)
|
||||
self.failure_store.remove_failure(bucket_name, object_key)
|
||||
except ClientError as e:
|
||||
error_code = e.response.get('Error', {}).get('Code')
|
||||
logger.error(f"Replication DELETE failed for {bucket_name}/{object_key}: {e}")
|
||||
self.failure_store.add_failure(bucket_name, ReplicationFailure(
|
||||
object_key=object_key,
|
||||
error_message=str(e),
|
||||
timestamp=time.time(),
|
||||
failure_count=1,
|
||||
bucket_name=bucket_name,
|
||||
action="delete",
|
||||
last_error_code=error_code,
|
||||
))
|
||||
return
|
||||
|
||||
try:
|
||||
@@ -405,9 +521,89 @@ class ReplicationManager:
|
||||
|
||||
logger.info(f"Replicated {bucket_name}/{object_key} to {conn.name} ({rule.target_bucket})")
|
||||
self._update_last_sync(bucket_name, object_key)
|
||||
self.failure_store.remove_failure(bucket_name, object_key)
|
||||
|
||||
except (ClientError, OSError, ValueError) as e:
|
||||
error_code = None
|
||||
if isinstance(e, ClientError):
|
||||
error_code = e.response.get('Error', {}).get('Code')
|
||||
logger.error(f"Replication failed for {bucket_name}/{object_key}: {e}")
|
||||
except Exception:
|
||||
self.failure_store.add_failure(bucket_name, ReplicationFailure(
|
||||
object_key=object_key,
|
||||
error_message=str(e),
|
||||
timestamp=time.time(),
|
||||
failure_count=1,
|
||||
bucket_name=bucket_name,
|
||||
action=action,
|
||||
last_error_code=error_code,
|
||||
))
|
||||
except Exception as e:
|
||||
logger.exception(f"Unexpected error during replication for {bucket_name}/{object_key}")
|
||||
self.failure_store.add_failure(bucket_name, ReplicationFailure(
|
||||
object_key=object_key,
|
||||
error_message=str(e),
|
||||
timestamp=time.time(),
|
||||
failure_count=1,
|
||||
bucket_name=bucket_name,
|
||||
action=action,
|
||||
last_error_code=None,
|
||||
))
|
||||
|
||||
def get_failed_items(self, bucket_name: str, limit: int = 50, offset: int = 0) -> List[ReplicationFailure]:
|
||||
failures = self.failure_store.load_failures(bucket_name)
|
||||
return failures[offset:offset + limit]
|
||||
|
||||
def get_failure_count(self, bucket_name: str) -> int:
|
||||
return self.failure_store.get_failure_count(bucket_name)
|
||||
|
||||
def retry_failed_item(self, bucket_name: str, object_key: str) -> bool:
|
||||
failure = self.failure_store.get_failure(bucket_name, object_key)
|
||||
if not failure:
|
||||
return False
|
||||
|
||||
rule = self.get_rule(bucket_name)
|
||||
if not rule or not rule.enabled:
|
||||
return False
|
||||
|
||||
connection = self.connections.get(rule.target_connection_id)
|
||||
if not connection:
|
||||
logger.warning(f"Cannot retry: Connection {rule.target_connection_id} not found")
|
||||
return False
|
||||
|
||||
if not self.check_endpoint_health(connection):
|
||||
logger.warning(f"Cannot retry: Endpoint {connection.name} is not reachable")
|
||||
return False
|
||||
|
||||
self._executor.submit(self._replicate_task, bucket_name, object_key, rule, connection, failure.action)
|
||||
return True
|
||||
|
||||
def retry_all_failed(self, bucket_name: str) -> Dict[str, int]:
|
||||
failures = self.failure_store.load_failures(bucket_name)
|
||||
if not failures:
|
||||
return {"submitted": 0, "skipped": 0}
|
||||
|
||||
rule = self.get_rule(bucket_name)
|
||||
if not rule or not rule.enabled:
|
||||
return {"submitted": 0, "skipped": len(failures)}
|
||||
|
||||
connection = self.connections.get(rule.target_connection_id)
|
||||
if not connection:
|
||||
logger.warning(f"Cannot retry: Connection {rule.target_connection_id} not found")
|
||||
return {"submitted": 0, "skipped": len(failures)}
|
||||
|
||||
if not self.check_endpoint_health(connection):
|
||||
logger.warning(f"Cannot retry: Endpoint {connection.name} is not reachable")
|
||||
return {"submitted": 0, "skipped": len(failures)}
|
||||
|
||||
submitted = 0
|
||||
for failure in failures:
|
||||
self._executor.submit(self._replicate_task, bucket_name, failure.object_key, rule, connection, failure.action)
|
||||
submitted += 1
|
||||
|
||||
return {"submitted": submitted, "skipped": 0}
|
||||
|
||||
def dismiss_failure(self, bucket_name: str, object_key: str) -> bool:
|
||||
return self.failure_store.remove_failure(bucket_name, object_key)
|
||||
|
||||
def clear_failures(self, bucket_name: str) -> None:
|
||||
self.failure_store.clear_failures(bucket_name)
|
||||
|
||||
109
app/ui.py
109
app/ui.py
@@ -1590,6 +1590,84 @@ def get_replication_status(bucket_name: str):
|
||||
})
|
||||
|
||||
|
||||
@ui_bp.get("/buckets/<bucket_name>/replication/failures")
|
||||
def get_replication_failures(bucket_name: str):
|
||||
principal = _current_principal()
|
||||
try:
|
||||
_authorize_ui(principal, bucket_name, "replication")
|
||||
except IamError:
|
||||
return jsonify({"error": "Access denied"}), 403
|
||||
|
||||
limit = request.args.get("limit", 50, type=int)
|
||||
offset = request.args.get("offset", 0, type=int)
|
||||
|
||||
failures = _replication().get_failed_items(bucket_name, limit, offset)
|
||||
total = _replication().get_failure_count(bucket_name)
|
||||
|
||||
return jsonify({
|
||||
"failures": [f.to_dict() for f in failures],
|
||||
"total": total,
|
||||
"limit": limit,
|
||||
"offset": offset,
|
||||
})
|
||||
|
||||
|
||||
@ui_bp.post("/buckets/<bucket_name>/replication/failures/<path:object_key>/retry")
|
||||
def retry_replication_failure(bucket_name: str, object_key: str):
|
||||
principal = _current_principal()
|
||||
try:
|
||||
_authorize_ui(principal, bucket_name, "replication")
|
||||
except IamError:
|
||||
return jsonify({"error": "Access denied"}), 403
|
||||
|
||||
success = _replication().retry_failed_item(bucket_name, object_key)
|
||||
if success:
|
||||
return jsonify({"status": "submitted", "object_key": object_key})
|
||||
return jsonify({"error": "Failed to submit retry"}), 400
|
||||
|
||||
|
||||
@ui_bp.post("/buckets/<bucket_name>/replication/failures/retry-all")
|
||||
def retry_all_replication_failures(bucket_name: str):
|
||||
principal = _current_principal()
|
||||
try:
|
||||
_authorize_ui(principal, bucket_name, "replication")
|
||||
except IamError:
|
||||
return jsonify({"error": "Access denied"}), 403
|
||||
|
||||
result = _replication().retry_all_failed(bucket_name)
|
||||
return jsonify({
|
||||
"status": "submitted",
|
||||
"submitted": result["submitted"],
|
||||
"skipped": result["skipped"],
|
||||
})
|
||||
|
||||
|
||||
@ui_bp.delete("/buckets/<bucket_name>/replication/failures/<path:object_key>")
|
||||
def dismiss_replication_failure(bucket_name: str, object_key: str):
|
||||
principal = _current_principal()
|
||||
try:
|
||||
_authorize_ui(principal, bucket_name, "replication")
|
||||
except IamError:
|
||||
return jsonify({"error": "Access denied"}), 403
|
||||
|
||||
success = _replication().dismiss_failure(bucket_name, object_key)
|
||||
if success:
|
||||
return jsonify({"status": "dismissed", "object_key": object_key})
|
||||
return jsonify({"error": "Failure not found"}), 404
|
||||
|
||||
|
||||
@ui_bp.delete("/buckets/<bucket_name>/replication/failures")
|
||||
def clear_replication_failures(bucket_name: str):
|
||||
principal = _current_principal()
|
||||
try:
|
||||
_authorize_ui(principal, bucket_name, "replication")
|
||||
except IamError:
|
||||
return jsonify({"error": "Access denied"}), 403
|
||||
|
||||
_replication().clear_failures(bucket_name)
|
||||
return jsonify({"status": "cleared"})
|
||||
|
||||
|
||||
@ui_bp.get("/connections/<connection_id>/health")
|
||||
def check_connection_health(connection_id: str):
|
||||
"""Check if a connection endpoint is reachable."""
|
||||
@@ -1742,6 +1820,37 @@ def bucket_lifecycle(bucket_name: str):
|
||||
return jsonify({"status": "ok", "message": "Lifecycle configuration saved", "rules": validated_rules})
|
||||
|
||||
|
||||
@ui_bp.get("/buckets/<bucket_name>/lifecycle/history")
|
||||
def get_lifecycle_history(bucket_name: str):
|
||||
principal = _current_principal()
|
||||
try:
|
||||
_authorize_ui(principal, bucket_name, "policy")
|
||||
except IamError:
|
||||
return jsonify({"error": "Access denied"}), 403
|
||||
|
||||
limit = request.args.get("limit", 50, type=int)
|
||||
offset = request.args.get("offset", 0, type=int)
|
||||
|
||||
lifecycle_manager = current_app.extensions.get("lifecycle")
|
||||
if not lifecycle_manager:
|
||||
return jsonify({
|
||||
"executions": [],
|
||||
"total": 0,
|
||||
"limit": limit,
|
||||
"offset": offset,
|
||||
"enabled": False,
|
||||
})
|
||||
|
||||
records = lifecycle_manager.get_execution_history(bucket_name, limit, offset)
|
||||
return jsonify({
|
||||
"executions": [r.to_dict() for r in records],
|
||||
"total": len(lifecycle_manager.get_execution_history(bucket_name, 1000, 0)),
|
||||
"limit": limit,
|
||||
"offset": offset,
|
||||
"enabled": True,
|
||||
})
|
||||
|
||||
|
||||
@ui_bp.route("/buckets/<bucket_name>/cors", methods=["GET", "POST", "DELETE"])
|
||||
def bucket_cors(bucket_name: str):
|
||||
principal = _current_principal()
|
||||
|
||||
Reference in New Issue
Block a user