UI overhaul; Replication and S3 API improvements

2025-11-25 14:42:33 +08:00
parent cee28c9f81
commit b2f4d1b5db
16 changed files with 3497 additions and 673 deletions

View File

@@ -105,6 +105,18 @@ def create_app(
value /= 1024.0
return f"{value:.1f} PB"
@app.template_filter("timestamp_to_datetime")
def timestamp_to_datetime(value: float) -> str:
"""Format Unix timestamp as human-readable datetime."""
from datetime import datetime
if not value:
return "Never"
try:
dt = datetime.fromtimestamp(value)
return dt.strftime("%Y-%m-%d %H:%M:%S")
except (ValueError, OSError):
return "Unknown"
if include_api:
from .s3_api import s3_api_bp
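For reference, a minimal standalone sketch of the new filter in use; it mirrors the filter registered above, and the timestamp value is illustrative:

from flask import Flask, render_template_string

demo = Flask(__name__)

@demo.template_filter("timestamp_to_datetime")
def _demo_timestamp_to_datetime(value: float) -> str:
    # Same behaviour as the filter above: empty values render as "Never".
    from datetime import datetime
    if not value:
        return "Never"
    try:
        return datetime.fromtimestamp(value).strftime("%Y-%m-%d %H:%M:%S")
    except (ValueError, OSError):
        return "Unknown"

with demo.app_context():
    print(render_template_string("Last sync: {{ ts | timestamp_to_datetime }}", ts=1732516953.0))
    print(render_template_string("Last sync: {{ ts | timestamp_to_datetime }}", ts=None))  # "Last sync: Never"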

View File

@@ -65,6 +65,7 @@ class AppConfig:
secret_ttl_seconds: int
stream_chunk_size: int
multipart_min_part_size: int
bucket_stats_cache_ttl: int
@classmethod
def from_env(cls, overrides: Optional[Dict[str, Any]] = None) -> "AppConfig":
@@ -85,8 +86,6 @@ class AppConfig:
default_secret = "dev-secret-key"
secret_key = str(_get("SECRET_KEY", default_secret))
# If using default/missing secret, try to load/persist a generated one from disk
# This ensures consistency across Gunicorn workers
if not secret_key or secret_key == default_secret:
secret_file = storage_root / ".myfsio.sys" / "config" / ".secret"
if secret_file.exists():
@@ -100,7 +99,6 @@ class AppConfig:
secret_file.write_text(generated)
secret_key = generated
except OSError:
# Fallback if we can't write to disk (e.g. read-only fs)
secret_key = generated
iam_env_override = "IAM_CONFIG" in overrides or "IAM_CONFIG" in os.environ
@@ -156,6 +154,7 @@ class AppConfig:
"X-Amz-Signature",
])
session_lifetime_days = int(_get("SESSION_LIFETIME_DAYS", 30))
bucket_stats_cache_ttl = int(_get("BUCKET_STATS_CACHE_TTL", 60)) # Default 60 seconds
return cls(storage_root=storage_root,
max_upload_size=max_upload_size,
@@ -182,7 +181,8 @@ class AppConfig:
bulk_delete_max_keys=bulk_delete_max_keys,
secret_ttl_seconds=secret_ttl_seconds,
stream_chunk_size=stream_chunk_size,
multipart_min_part_size=multipart_min_part_size)
multipart_min_part_size=multipart_min_part_size,
bucket_stats_cache_ttl=bucket_stats_cache_ttl)
def to_flask_config(self) -> Dict[str, Any]:
return {
@@ -202,6 +202,7 @@ class AppConfig:
"SECRET_TTL_SECONDS": self.secret_ttl_seconds,
"STREAM_CHUNK_SIZE": self.stream_chunk_size,
"MULTIPART_MIN_PART_SIZE": self.multipart_min_part_size,
"BUCKET_STATS_CACHE_TTL": self.bucket_stats_cache_ttl,
"LOG_LEVEL": self.log_level,
"LOG_FILE": str(self.log_path),
"LOG_MAX_BYTES": self.log_max_bytes,

View File

@@ -1,11 +1,13 @@
"""Background replication worker."""
from __future__ import annotations
import json
import logging
import mimetypes
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Optional
@@ -21,6 +23,41 @@ logger = logging.getLogger(__name__)
REPLICATION_USER_AGENT = "S3ReplicationAgent/1.0"
REPLICATION_MODE_NEW_ONLY = "new_only"
REPLICATION_MODE_ALL = "all"
@dataclass
class ReplicationStats:
"""Statistics for replication operations - computed dynamically."""
objects_synced: int = 0 # Objects that exist in both source and destination
objects_pending: int = 0 # Objects in source but not in destination
objects_orphaned: int = 0 # Objects in destination but not in source (will be deleted)
bytes_synced: int = 0 # Total bytes synced to destination
last_sync_at: Optional[float] = None
last_sync_key: Optional[str] = None
def to_dict(self) -> dict:
return {
"objects_synced": self.objects_synced,
"objects_pending": self.objects_pending,
"objects_orphaned": self.objects_orphaned,
"bytes_synced": self.bytes_synced,
"last_sync_at": self.last_sync_at,
"last_sync_key": self.last_sync_key,
}
@classmethod
def from_dict(cls, data: dict) -> "ReplicationStats":
return cls(
objects_synced=data.get("objects_synced", 0),
objects_pending=data.get("objects_pending", 0),
objects_orphaned=data.get("objects_orphaned", 0),
bytes_synced=data.get("bytes_synced", 0),
last_sync_at=data.get("last_sync_at"),
last_sync_key=data.get("last_sync_key"),
)
@dataclass
class ReplicationRule:
@@ -28,6 +65,32 @@ class ReplicationRule:
target_connection_id: str
target_bucket: str
enabled: bool = True
mode: str = REPLICATION_MODE_NEW_ONLY
created_at: Optional[float] = None
stats: ReplicationStats = field(default_factory=ReplicationStats)
def to_dict(self) -> dict:
return {
"bucket_name": self.bucket_name,
"target_connection_id": self.target_connection_id,
"target_bucket": self.target_bucket,
"enabled": self.enabled,
"mode": self.mode,
"created_at": self.created_at,
"stats": self.stats.to_dict(),
}
@classmethod
def from_dict(cls, data: dict) -> "ReplicationRule":
stats_data = data.pop("stats", {})
# Handle old rules without mode/created_at
if "mode" not in data:
data["mode"] = REPLICATION_MODE_NEW_ONLY
if "created_at" not in data:
data["created_at"] = None
rule = cls(**data)
rule.stats = ReplicationStats.from_dict(stats_data) if stats_data else ReplicationStats()
return rule
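A round-trip sketch for the new dataclasses, assuming ReplicationRule and REPLICATION_MODE_ALL are imported from this module; the values are illustrative:

import time

rule = ReplicationRule(
    bucket_name="photos",
    target_connection_id="conn-1",
    target_bucket="photos-dr",
    mode=REPLICATION_MODE_ALL,
    created_at=time.time(),
)
payload = rule.to_dict()                           # JSON-safe dict with nested stats
restored = ReplicationRule.from_dict(dict(payload))
assert restored.mode == REPLICATION_MODE_ALL
assert restored.stats.objects_synced == 0          # default stats survive the round trip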
class ReplicationManager:
@@ -36,6 +99,7 @@ class ReplicationManager:
self.connections = connections
self.rules_path = rules_path
self._rules: Dict[str, ReplicationRule] = {}
self._stats_lock = threading.Lock()
self._executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="ReplicationWorker")
self.reload_rules()
@@ -44,17 +108,15 @@ class ReplicationManager:
self._rules = {}
return
try:
import json
with open(self.rules_path, "r") as f:
data = json.load(f)
for bucket, rule_data in data.items():
self._rules[bucket] = ReplicationRule(**rule_data)
self._rules[bucket] = ReplicationRule.from_dict(rule_data)
except (OSError, ValueError) as e:
logger.error(f"Failed to load replication rules: {e}")
def save_rules(self) -> None:
import json
data = {b: rule.__dict__ for b, rule in self._rules.items()}
data = {b: rule.to_dict() for b, rule in self._rules.items()}
self.rules_path.parent.mkdir(parents=True, exist_ok=True)
with open(self.rules_path, "w") as f:
json.dump(data, f, indent=2)
@@ -70,6 +132,99 @@ class ReplicationManager:
if bucket_name in self._rules:
del self._rules[bucket_name]
self.save_rules()
def _update_last_sync(self, bucket_name: str, object_key: str = "") -> None:
"""Update last sync timestamp after a successful operation."""
with self._stats_lock:
rule = self._rules.get(bucket_name)
if not rule:
return
rule.stats.last_sync_at = time.time()
rule.stats.last_sync_key = object_key
self.save_rules()
def get_sync_status(self, bucket_name: str) -> Optional[ReplicationStats]:
"""Dynamically compute replication status by comparing source and destination buckets."""
rule = self.get_rule(bucket_name)
if not rule:
return None
connection = self.connections.get(rule.target_connection_id)
if not connection:
return rule.stats # Return cached stats if connection unavailable
try:
# Get source objects
source_objects = self.storage.list_objects(bucket_name)
source_keys = {obj.key: obj.size for obj in source_objects}
# Get destination objects
s3 = boto3.client(
"s3",
endpoint_url=connection.endpoint_url,
aws_access_key_id=connection.access_key,
aws_secret_access_key=connection.secret_key,
region_name=connection.region,
)
dest_keys = set()
bytes_synced = 0
paginator = s3.get_paginator('list_objects_v2')
try:
for page in paginator.paginate(Bucket=rule.target_bucket):
for obj in page.get('Contents', []):
dest_keys.add(obj['Key'])
if obj['Key'] in source_keys:
bytes_synced += obj.get('Size', 0)
except ClientError as e:
if e.response['Error']['Code'] == 'NoSuchBucket':
# Destination bucket doesn't exist yet
dest_keys = set()
else:
raise
# Compute stats
synced = source_keys.keys() & dest_keys # Objects in both
orphaned = dest_keys - source_keys.keys() # In dest but not source
# For "new_only" mode, we can't determine pending since we don't know
# which objects existed before replication was enabled. Only "all" mode
# should show pending (objects that should be replicated but aren't yet).
if rule.mode == REPLICATION_MODE_ALL:
pending = source_keys.keys() - dest_keys # In source but not dest
else:
pending = set() # New-only mode: don't show pre-existing as pending
# Update cached stats with computed values
rule.stats.objects_synced = len(synced)
rule.stats.objects_pending = len(pending)
rule.stats.objects_orphaned = len(orphaned)
rule.stats.bytes_synced = bytes_synced
return rule.stats
except (ClientError, StorageError) as e:
logger.error(f"Failed to compute sync status for {bucket_name}: {e}")
return rule.stats # Return cached stats on error
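A worked example of the set arithmetic get_sync_status relies on; keys and sizes are illustrative:

source_keys = {"a.txt": 10, "b.txt": 20, "c.txt": 30}     # key -> size on the source bucket
dest_keys = {"b.txt", "c.txt", "stale.txt"}                # keys present on the destination

synced = source_keys.keys() & dest_keys                    # {'b.txt', 'c.txt'}
orphaned = dest_keys - source_keys.keys()                  # {'stale.txt'}: in dest only
pending = source_keys.keys() - dest_keys                   # {'a.txt'}: reported only in "all" mode
print(len(synced), len(pending), len(orphaned))            # 2 1 1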
def replicate_existing_objects(self, bucket_name: str) -> None:
"""Trigger replication for all existing objects in a bucket."""
rule = self.get_rule(bucket_name)
if not rule or not rule.enabled:
return
connection = self.connections.get(rule.target_connection_id)
if not connection:
logger.warning(f"Cannot replicate existing objects: Connection {rule.target_connection_id} not found")
return
try:
objects = self.storage.list_objects(bucket_name)
logger.info(f"Starting replication of {len(objects)} existing objects from {bucket_name}")
for obj in objects:
self._executor.submit(self._replicate_task, bucket_name, obj.key, rule, connection, "write")
except StorageError as e:
logger.error(f"Failed to list objects for replication: {e}")
def create_remote_bucket(self, connection_id: str, bucket_name: str) -> None:
"""Create a bucket on the remote connection."""
@@ -103,6 +258,7 @@ class ReplicationManager:
self._executor.submit(self._replicate_task, bucket_name, object_key, rule, connection, action)
def _replicate_task(self, bucket_name: str, object_key: str, rule: ReplicationRule, conn: RemoteConnection, action: str) -> None:
file_size = 0
try:
# Using boto3 to upload
config = Config(user_agent_extra=REPLICATION_USER_AGENT)
@@ -119,21 +275,15 @@ class ReplicationManager:
try:
s3.delete_object(Bucket=rule.target_bucket, Key=object_key)
logger.info(f"Replicated DELETE {bucket_name}/{object_key} to {conn.name} ({rule.target_bucket})")
self._update_last_sync(bucket_name, object_key)
except ClientError as e:
logger.error(f"Replication DELETE failed for {bucket_name}/{object_key}: {e}")
return
# 1. Get local file path
# Note: We are accessing internal storage structure here.
# Ideally storage.py should expose a 'get_file_path' or we read the stream.
# For efficiency, we'll try to read the file directly if we can, or use storage.get_object
# We need the file content.
# Since ObjectStorage is filesystem based, let's get the stream.
# We need to be careful about closing it.
try:
path = self.storage.get_object_path(bucket_name, object_key)
except StorageError:
logger.error(f"Source object not found: {bucket_name}/{object_key}")
return
metadata = self.storage.get_object_metadata(bucket_name, object_key)
@@ -159,7 +309,6 @@ class ReplicationManager:
Metadata=metadata or {}
)
except (ClientError, S3UploadFailedError) as e:
# Check if it's a NoSuchBucket error (either direct or wrapped)
is_no_bucket = False
if isinstance(e, ClientError):
if e.response['Error']['Code'] == 'NoSuchBucket':
@@ -189,6 +338,7 @@ class ReplicationManager:
raise e
logger.info(f"Replicated {bucket_name}/{object_key} to {conn.name} ({rule.target_bucket})")
self._update_last_sync(bucket_name, object_key)
except (ClientError, OSError, ValueError) as e:
logger.error(f"Replication failed for {bucket_name}/{object_key}: {e}")

View File

@@ -584,6 +584,73 @@ def _render_tagging_document(tags: list[dict[str, str]]) -> Element:
SubElement(tag_el, "Value").text = tag.get("Value", "")
return root
DANGEROUS_CONTENT_TYPES = frozenset([
"text/html",
"application/xhtml+xml",
"application/javascript",
"text/javascript",
"application/x-javascript",
"text/ecmascript",
"application/ecmascript",
"image/svg+xml",
])
SAFE_EXTENSION_MAP = {
".txt": ["text/plain"],
".json": ["application/json"],
".xml": ["application/xml", "text/xml"],
".csv": ["text/csv"],
".pdf": ["application/pdf"],
".png": ["image/png"],
".jpg": ["image/jpeg"],
".jpeg": ["image/jpeg"],
".gif": ["image/gif"],
".webp": ["image/webp"],
".mp4": ["video/mp4"],
".mp3": ["audio/mpeg"],
".zip": ["application/zip"],
".gz": ["application/gzip"],
".tar": ["application/x-tar"],
}
def _validate_content_type(object_key: str, content_type: str | None) -> str | None:
"""Validate Content-Type header for security.
Returns an error message if validation fails, None otherwise.
Rules:
1. Block dangerous MIME types that can execute scripts (unless explicitly allowed)
2. Warn if Content-Type doesn't match file extension (but don't block)
"""
if not content_type:
return None
base_type = content_type.split(";")[0].strip().lower()
if base_type in DANGEROUS_CONTENT_TYPES:
ext = "." + object_key.rsplit(".", 1)[-1].lower() if "." in object_key else ""
allowed_dangerous = {
".svg": "image/svg+xml",
".html": "text/html",
".htm": "text/html",
".xhtml": "application/xhtml+xml",
".js": "application/javascript",
".mjs": "application/javascript",
}
if ext in allowed_dangerous and base_type == allowed_dangerous[ext]:
return None
return (
f"Content-Type '{content_type}' is potentially dangerous and not allowed "
f"for object key '{object_key}'. Use a safe Content-Type or rename the file "
f"with an appropriate extension."
)
return None
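Called directly, the validator above behaves as follows; the object keys and types are illustrative and the function is assumed to be in scope:

print(_validate_content_type("notes.txt", "text/plain"))   # None: safe type
print(_validate_content_type("report.html", "text/html"))  # None: dangerous type, but the extension matches
print(_validate_content_type("avatar.png", "text/html"))   # error string: script-capable type on a .png key
print(_validate_content_type("blob.bin", None))             # None: a missing Content-Type is never rejected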
def _parse_cors_document(payload: bytes) -> list[dict[str, Any]]:
try:
@@ -731,6 +798,8 @@ def _maybe_handle_bucket_subresource(bucket_name: str) -> Response | None:
"tagging": _bucket_tagging_handler,
"cors": _bucket_cors_handler,
"encryption": _bucket_encryption_handler,
"location": _bucket_location_handler,
"acl": _bucket_acl_handler,
}
requested = [key for key in handlers if key in request.args]
if not requested:
@@ -746,8 +815,8 @@ def _maybe_handle_bucket_subresource(bucket_name: str) -> Response | None:
def _bucket_versioning_handler(bucket_name: str) -> Response:
if request.method != "GET":
return _method_not_allowed(["GET"])
if request.method not in {"GET", "PUT"}:
return _method_not_allowed(["GET", "PUT"])
principal, error = _require_principal()
if error:
return error
@@ -756,6 +825,31 @@ def _bucket_versioning_handler(bucket_name: str) -> Response:
except IamError as exc:
return _error_response("AccessDenied", str(exc), 403)
storage = _storage()
if request.method == "PUT":
payload = request.get_data(cache=False) or b""
if not payload.strip():
return _error_response("MalformedXML", "Request body is required", 400)
try:
root = fromstring(payload)
except ParseError:
return _error_response("MalformedXML", "Unable to parse XML document", 400)
if _strip_ns(root.tag) != "VersioningConfiguration":
return _error_response("MalformedXML", "Root element must be VersioningConfiguration", 400)
status_el = root.find("{*}Status")
if status_el is None:
status_el = root.find("Status")
status = (status_el.text or "").strip() if status_el is not None else ""
if status not in {"Enabled", "Suspended", ""}:
return _error_response("MalformedXML", "Status must be Enabled or Suspended", 400)
try:
storage.set_bucket_versioning(bucket_name, status == "Enabled")
except StorageError as exc:
return _error_response("NoSuchBucket", str(exc), 404)
current_app.logger.info("Bucket versioning updated", extra={"bucket": bucket_name, "status": status})
return Response(status=200)
# GET
try:
enabled = storage.is_versioning_enabled(bucket_name)
except StorageError as exc:
@@ -766,8 +860,8 @@ def _bucket_versioning_handler(bucket_name: str) -> Response:
def _bucket_tagging_handler(bucket_name: str) -> Response:
if request.method not in {"GET", "PUT"}:
return _method_not_allowed(["GET", "PUT"])
if request.method not in {"GET", "PUT", "DELETE"}:
return _method_not_allowed(["GET", "PUT", "DELETE"])
principal, error = _require_principal()
if error:
return error
@@ -784,6 +878,14 @@ def _bucket_tagging_handler(bucket_name: str) -> Response:
if not tags:
return _error_response("NoSuchTagSet", "No tags are configured for this bucket", 404)
return _xml_response(_render_tagging_document(tags))
if request.method == "DELETE":
try:
storage.set_bucket_tags(bucket_name, None)
except StorageError as exc:
return _error_response("NoSuchBucket", str(exc), 404)
current_app.logger.info("Bucket tags deleted", extra={"bucket": bucket_name})
return Response(status=204)
# PUT
payload = request.get_data(cache=False) or b""
try:
tags = _parse_tagging_document(payload)
@@ -799,6 +901,64 @@ def _bucket_tagging_handler(bucket_name: str) -> Response:
return Response(status=204)
def _object_tagging_handler(bucket_name: str, object_key: str) -> Response:
"""Handle object tagging operations (GET/PUT/DELETE /<bucket>/<key>?tagging)."""
if request.method not in {"GET", "PUT", "DELETE"}:
return _method_not_allowed(["GET", "PUT", "DELETE"])
principal, error = _require_principal()
if error:
return error
# For tagging, we use read permission for GET, write for PUT/DELETE
action = "read" if request.method == "GET" else "write"
try:
_authorize_action(principal, bucket_name, action, object_key=object_key)
except IamError as exc:
return _error_response("AccessDenied", str(exc), 403)
storage = _storage()
if request.method == "GET":
try:
tags = storage.get_object_tags(bucket_name, object_key)
except StorageError as exc:
message = str(exc)
if "Bucket" in message:
return _error_response("NoSuchBucket", message, 404)
return _error_response("NoSuchKey", message, 404)
return _xml_response(_render_tagging_document(tags))
if request.method == "DELETE":
try:
storage.delete_object_tags(bucket_name, object_key)
except StorageError as exc:
message = str(exc)
if "Bucket" in message:
return _error_response("NoSuchBucket", message, 404)
return _error_response("NoSuchKey", message, 404)
current_app.logger.info("Object tags deleted", extra={"bucket": bucket_name, "key": object_key})
return Response(status=204)
# PUT
payload = request.get_data(cache=False) or b""
try:
tags = _parse_tagging_document(payload)
except ValueError as exc:
return _error_response("MalformedXML", str(exc), 400)
if len(tags) > 10:
return _error_response("InvalidTag", "A maximum of 10 tags is supported for objects", 400)
try:
storage.set_object_tags(bucket_name, object_key, tags)
except StorageError as exc:
message = str(exc)
if "Bucket" in message:
return _error_response("NoSuchBucket", message, 404)
return _error_response("NoSuchKey", message, 404)
current_app.logger.info("Object tags updated", extra={"bucket": bucket_name, "key": object_key, "tags": len(tags)})
return Response(status=204)
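A client-side sketch exercising the newly supported versioning and tagging calls; the endpoint URL and credentials are placeholders:

import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:5000",       # placeholder endpoint
    aws_access_key_id="AKIA...",                 # placeholder credentials
    aws_secret_access_key="secret",
    region_name="us-east-1",
)

# PUT ?versioning (newly supported)
s3.put_bucket_versioning(Bucket="photos", VersioningConfiguration={"Status": "Enabled"})

# Object tagging: GET/PUT/DELETE ?tagging on an object (newly supported)
s3.put_object_tagging(
    Bucket="photos",
    Key="2024/cat.png",
    Tagging={"TagSet": [{"Key": "env", "Value": "prod"}]},
)
print(s3.get_object_tagging(Bucket="photos", Key="2024/cat.png")["TagSet"])
s3.delete_object_tagging(Bucket="photos", Key="2024/cat.png")

# DELETE ?tagging on a bucket (newly supported)
s3.delete_bucket_tagging(Bucket="photos")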
def _sanitize_cors_rules(rules: list[dict[str, Any]]) -> list[dict[str, Any]]:
sanitized: list[dict[str, Any]] = []
for rule in rules:
@@ -823,8 +983,8 @@ def _sanitize_cors_rules(rules: list[dict[str, Any]]) -> list[dict[str, Any]]:
def _bucket_cors_handler(bucket_name: str) -> Response:
if request.method not in {"GET", "PUT"}:
return _method_not_allowed(["GET", "PUT"])
if request.method not in {"GET", "PUT", "DELETE"}:
return _method_not_allowed(["GET", "PUT", "DELETE"])
principal, error = _require_principal()
if error:
return error
@@ -841,6 +1001,14 @@ def _bucket_cors_handler(bucket_name: str) -> Response:
if not rules:
return _error_response("NoSuchCORSConfiguration", "No CORS configuration found", 404)
return _xml_response(_render_cors_document(rules))
if request.method == "DELETE":
try:
storage.set_bucket_cors(bucket_name, None)
except StorageError as exc:
return _error_response("NoSuchBucket", str(exc), 404)
current_app.logger.info("Bucket CORS deleted", extra={"bucket": bucket_name})
return Response(status=204)
# PUT
payload = request.get_data(cache=False) or b""
if not payload.strip():
try:
@@ -907,6 +1075,66 @@ def _bucket_encryption_handler(bucket_name: str) -> Response:
return Response(status=204)
def _bucket_location_handler(bucket_name: str) -> Response:
if request.method != "GET":
return _method_not_allowed(["GET"])
principal, error = _require_principal()
if error:
return error
try:
_authorize_action(principal, bucket_name, "list")
except IamError as exc:
return _error_response("AccessDenied", str(exc), 403)
storage = _storage()
if not storage.bucket_exists(bucket_name):
return _error_response("NoSuchBucket", "Bucket does not exist", 404)
# Return the configured AWS_REGION
region = current_app.config.get("AWS_REGION", "us-east-1")
root = Element("LocationConstraint")
# Match AWS behaviour: empty LocationConstraint for us-east-1, explicit region otherwise
root.text = region if region != "us-east-1" else None
return _xml_response(root)
def _bucket_acl_handler(bucket_name: str) -> Response:
if request.method not in {"GET", "PUT"}:
return _method_not_allowed(["GET", "PUT"])
principal, error = _require_principal()
if error:
return error
try:
_authorize_action(principal, bucket_name, "policy")
except IamError as exc:
return _error_response("AccessDenied", str(exc), 403)
storage = _storage()
if not storage.bucket_exists(bucket_name):
return _error_response("NoSuchBucket", "Bucket does not exist", 404)
if request.method == "PUT":
# We don't fully implement ACLs, but we accept the request for compatibility
# Check for canned ACL header
canned_acl = request.headers.get("x-amz-acl", "private")
current_app.logger.info("Bucket ACL set (canned)", extra={"bucket": bucket_name, "acl": canned_acl})
return Response(status=200)
# GET - Return a basic ACL document showing full control for owner
root = Element("AccessControlPolicy")
owner = SubElement(root, "Owner")
SubElement(owner, "ID").text = principal.access_key if principal else "anonymous"
SubElement(owner, "DisplayName").text = principal.display_name if principal else "Anonymous"
acl = SubElement(root, "AccessControlList")
grant = SubElement(acl, "Grant")
grantee = SubElement(grant, "Grantee")
grantee.set("{http://www.w3.org/2001/XMLSchema-instance}type", "CanonicalUser")
SubElement(grantee, "ID").text = principal.access_key if principal else "anonymous"
SubElement(grantee, "DisplayName").text = principal.display_name if principal else "Anonymous"
SubElement(grant, "Permission").text = "FULL_CONTROL"
return _xml_response(root)
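A similar sketch for the new ?location and ?acl handlers; the endpoint URL and credentials are placeholders:

import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:5000",
    aws_access_key_id="AKIA...",
    aws_secret_access_key="secret",
    region_name="us-east-1",
)

print(s3.get_bucket_location(Bucket="photos"))       # LocationConstraint is None for us-east-1
s3.put_bucket_acl(Bucket="photos", ACL="private")     # accepted for compatibility; the canned ACL is only logged
acl = s3.get_bucket_acl(Bucket="photos")
print(acl["Grants"][0]["Permission"])                 # 'FULL_CONTROL'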
def _bulk_delete_handler(bucket_name: str) -> Response:
principal, error = _require_principal()
if error:
@@ -1067,7 +1295,7 @@ def bucket_handler(bucket_name: str) -> Response:
current_app.logger.info("Bucket deleted", extra={"bucket": bucket_name})
return Response(status=204)
# GET - list objects
# GET - list objects (supports both ListObjects and ListObjectsV2)
principal, error = _require_principal()
try:
_authorize_action(principal, bucket_name, "list")
@@ -1080,16 +1308,131 @@ def bucket_handler(bucket_name: str) -> Response:
except StorageError as exc:
return _error_response("NoSuchBucket", str(exc), 404)
root = Element("ListBucketResult")
SubElement(root, "Name").text = bucket_name
SubElement(root, "MaxKeys").text = str(current_app.config["UI_PAGE_SIZE"])
SubElement(root, "IsTruncated").text = "false"
for meta in objects:
obj_el = SubElement(root, "Contents")
SubElement(obj_el, "Key").text = meta.key
SubElement(obj_el, "LastModified").text = meta.last_modified.isoformat()
SubElement(obj_el, "ETag").text = f'"{meta.etag}"'
SubElement(obj_el, "Size").text = str(meta.size)
# Check if this is ListObjectsV2 (list-type=2)
list_type = request.args.get("list-type")
prefix = request.args.get("prefix", "")
delimiter = request.args.get("delimiter", "")
max_keys = min(int(request.args.get("max-keys", current_app.config["UI_PAGE_SIZE"])), 1000)
# Pagination markers
marker = request.args.get("marker", "") # ListObjects v1
continuation_token = request.args.get("continuation-token", "") # ListObjectsV2
start_after = request.args.get("start-after", "") # ListObjectsV2
# For ListObjectsV2, continuation-token takes precedence, then start-after
# For ListObjects v1, use marker
effective_start = ""
if list_type == "2":
if continuation_token:
import base64
try:
effective_start = base64.urlsafe_b64decode(continuation_token.encode()).decode("utf-8")
except Exception:
effective_start = continuation_token
elif start_after:
effective_start = start_after
else:
effective_start = marker
if prefix:
objects = [obj for obj in objects if obj.key.startswith(prefix)]
if effective_start:
objects = [obj for obj in objects if obj.key > effective_start]
common_prefixes: list[str] = []
filtered_objects: list = []
if delimiter:
seen_prefixes: set[str] = set()
for obj in objects:
key_after_prefix = obj.key[len(prefix):] if prefix else obj.key
if delimiter in key_after_prefix:
# This is a "folder" - extract the common prefix
common_prefix = prefix + key_after_prefix.split(delimiter)[0] + delimiter
if common_prefix not in seen_prefixes:
seen_prefixes.add(common_prefix)
common_prefixes.append(common_prefix)
else:
filtered_objects.append(obj)
objects = filtered_objects
common_prefixes = sorted(common_prefixes)
total_items = len(objects) + len(common_prefixes)
is_truncated = total_items > max_keys
if len(objects) >= max_keys:
objects = objects[:max_keys]
common_prefixes = []
else:
remaining = max_keys - len(objects)
common_prefixes = common_prefixes[:remaining]
next_marker = ""
next_continuation_token = ""
if is_truncated:
if objects:
next_marker = objects[-1].key
elif common_prefixes:
next_marker = common_prefixes[-1].rstrip(delimiter) if delimiter else common_prefixes[-1]
if list_type == "2" and next_marker:
import base64
next_continuation_token = base64.urlsafe_b64encode(next_marker.encode()).decode("utf-8")
if list_type == "2":
root = Element("ListBucketResult")
SubElement(root, "Name").text = bucket_name
SubElement(root, "Prefix").text = prefix
SubElement(root, "MaxKeys").text = str(max_keys)
SubElement(root, "KeyCount").text = str(len(objects) + len(common_prefixes))
SubElement(root, "IsTruncated").text = "true" if is_truncated else "false"
if delimiter:
SubElement(root, "Delimiter").text = delimiter
continuation_token = request.args.get("continuation-token", "")
start_after = request.args.get("start-after", "")
if continuation_token:
SubElement(root, "ContinuationToken").text = continuation_token
if start_after:
SubElement(root, "StartAfter").text = start_after
if is_truncated and next_continuation_token:
SubElement(root, "NextContinuationToken").text = next_continuation_token
for meta in objects:
obj_el = SubElement(root, "Contents")
SubElement(obj_el, "Key").text = meta.key
SubElement(obj_el, "LastModified").text = meta.last_modified.isoformat()
SubElement(obj_el, "ETag").text = f'"{meta.etag}"'
SubElement(obj_el, "Size").text = str(meta.size)
SubElement(obj_el, "StorageClass").text = "STANDARD"
for cp in common_prefixes:
cp_el = SubElement(root, "CommonPrefixes")
SubElement(cp_el, "Prefix").text = cp
else:
root = Element("ListBucketResult")
SubElement(root, "Name").text = bucket_name
SubElement(root, "Prefix").text = prefix
SubElement(root, "Marker").text = marker
SubElement(root, "MaxKeys").text = str(max_keys)
SubElement(root, "IsTruncated").text = "true" if is_truncated else "false"
if delimiter:
SubElement(root, "Delimiter").text = delimiter
if is_truncated and delimiter and next_marker:
SubElement(root, "NextMarker").text = next_marker
for meta in objects:
obj_el = SubElement(root, "Contents")
SubElement(obj_el, "Key").text = meta.key
SubElement(obj_el, "LastModified").text = meta.last_modified.isoformat()
SubElement(obj_el, "ETag").text = f'"{meta.etag}"'
SubElement(obj_el, "Size").text = str(meta.size)
for cp in common_prefixes:
cp_el = SubElement(root, "CommonPrefixes")
SubElement(cp_el, "Prefix").text = cp
return _xml_response(root)
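A client-side sketch of the new ListObjectsV2 support with prefix/delimiter grouping; the endpoint URL and credentials are placeholders:

import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:5000",
    aws_access_key_id="AKIA...",
    aws_secret_access_key="secret",
    region_name="us-east-1",
)

paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket="photos", Prefix="2024/", Delimiter="/"):
    for cp in page.get("CommonPrefixes", []):
        print("folder:", cp["Prefix"])
    for obj in page.get("Contents", []):
        print("object:", obj["Key"], obj["Size"])
# The paginator follows NextContinuationToken automatically, so truncated listings are handled transparently.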
@@ -1099,6 +1442,9 @@ def bucket_handler(bucket_name: str) -> Response:
def object_handler(bucket_name: str, object_key: str):
storage = _storage()
if "tagging" in request.args:
return _object_tagging_handler(bucket_name, object_key)
# Multipart Uploads
if request.method == "POST":
if "uploads" in request.args:
@@ -1111,6 +1457,10 @@ def object_handler(bucket_name: str, object_key: str):
if "partNumber" in request.args and "uploadId" in request.args:
return _upload_part(bucket_name, object_key)
copy_source = request.headers.get("x-amz-copy-source")
if copy_source:
return _copy_object(bucket_name, object_key, copy_source)
_, error = _object_principal("write", bucket_name, object_key)
if error:
return error
@@ -1121,6 +1471,12 @@ def object_handler(bucket_name: str, object_key: str):
stream = AwsChunkedDecoder(stream)
metadata = _extract_request_metadata()
content_type = request.headers.get("Content-Type")
validation_error = _validate_content_type(object_key, content_type)
if validation_error:
return _error_response("InvalidArgument", validation_error, 400)
try:
meta = storage.put_object(
bucket_name,
@@ -1357,6 +1713,88 @@ def head_object(bucket_name: str, object_key: str) -> Response:
return _error_response("AccessDenied", str(exc), 403)
def _copy_object(dest_bucket: str, dest_key: str, copy_source: str) -> Response:
"""Handle S3 CopyObject operation."""
from urllib.parse import unquote
copy_source = unquote(copy_source)
if copy_source.startswith("/"):
copy_source = copy_source[1:]
parts = copy_source.split("/", 1)
if len(parts) != 2:
return _error_response("InvalidArgument", "Invalid x-amz-copy-source format", 400)
source_bucket, source_key = parts
if not source_bucket or not source_key:
return _error_response("InvalidArgument", "Invalid x-amz-copy-source format", 400)
principal, error = _require_principal()
if error:
return error
try:
_authorize_action(principal, source_bucket, "read", object_key=source_key)
except IamError as exc:
return _error_response("AccessDenied", str(exc), 403)
try:
_authorize_action(principal, dest_bucket, "write", object_key=dest_key)
except IamError as exc:
return _error_response("AccessDenied", str(exc), 403)
storage = _storage()
try:
source_path = storage.get_object_path(source_bucket, source_key)
except StorageError:
return _error_response("NoSuchKey", "Source object not found", 404)
source_metadata = storage.get_object_metadata(source_bucket, source_key)
metadata_directive = request.headers.get("x-amz-metadata-directive", "COPY").upper()
if metadata_directive == "REPLACE":
metadata = _extract_request_metadata()
content_type = request.headers.get("Content-Type")
validation_error = _validate_content_type(dest_key, content_type)
if validation_error:
return _error_response("InvalidArgument", validation_error, 400)
else:
metadata = source_metadata
try:
with source_path.open("rb") as stream:
meta = storage.put_object(
dest_bucket,
dest_key,
stream,
metadata=metadata or None,
)
except StorageError as exc:
message = str(exc)
if "Bucket" in message:
return _error_response("NoSuchBucket", message, 404)
return _error_response("InvalidArgument", message, 400)
current_app.logger.info(
"Object copied",
extra={
"source_bucket": source_bucket,
"source_key": source_key,
"dest_bucket": dest_bucket,
"dest_key": dest_key,
"size": meta.size,
},
)
user_agent = request.headers.get("User-Agent", "")
if "S3ReplicationAgent" not in user_agent:
_replication_manager().trigger_replication(dest_bucket, dest_key, action="write")
root = Element("CopyObjectResult")
SubElement(root, "LastModified").text = meta.last_modified.isoformat()
SubElement(root, "ETag").text = f'"{meta.etag}"'
return _xml_response(root)
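A client-side sketch of the new CopyObject support; the endpoint URL, credentials, and bucket names are placeholders:

import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:5000",
    aws_access_key_id="AKIA...",
    aws_secret_access_key="secret",
    region_name="us-east-1",
)

# COPY keeps the source metadata; REPLACE lets the request supply new metadata,
# which goes through the same Content-Type validation as a normal PUT.
s3.copy_object(
    Bucket="photos-archive",
    Key="2024/cat.png",
    CopySource={"Bucket": "photos", "Key": "2024/cat.png"},
    MetadataDirective="REPLACE",
    Metadata={"reviewed": "true"},
    ContentType="image/png",
)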
class AwsChunkedDecoder:
"""Decodes aws-chunked encoded streams."""
def __init__(self, stream):
@@ -1389,12 +1827,11 @@ class AwsChunkedDecoder:
if crlf != b"\r\n":
raise IOError("Malformed chunk: missing CRLF")
else:
# Read chunk size line
line = b""
while True:
char = self.stream.read(1)
if not char:
if not line: # EOF at start of chunk size
if not line:
self.finished = True
return result
raise IOError("Unexpected EOF in chunk size")
@@ -1402,7 +1839,6 @@ class AwsChunkedDecoder:
if line.endswith(b"\r\n"):
break
# Parse chunk size (hex)
try:
line_str = line.decode("ascii").strip()
# Handle chunk-signature extension if present (e.g. "1000;chunk-signature=...")
@@ -1414,7 +1850,6 @@ class AwsChunkedDecoder:
if chunk_size == 0:
self.finished = True
# Read trailers if any (until empty line)
while True:
line = b""
while True:
@@ -1534,13 +1969,11 @@ def _complete_multipart_upload(bucket_name: str, object_key: str) -> Response:
return _error_response("NoSuchUpload", str(exc), 404)
return _error_response("InvalidPart", str(exc), 400)
# Trigger replication
user_agent = request.headers.get("User-Agent", "")
if "S3ReplicationAgent" not in user_agent:
_replication_manager().trigger_replication(bucket_name, object_key, action="write")
root = Element("CompleteMultipartUploadResult")
# Use request.host_url to construct full location
location = f"{request.host_url}{bucket_name}/{object_key}"
SubElement(root, "Location").text = location
SubElement(root, "Bucket").text = bucket_name

View File

@@ -10,10 +10,40 @@ import stat
import time
import unicodedata
import uuid
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, BinaryIO, Dict, List, Optional
from typing import Any, BinaryIO, Dict, Generator, List, Optional
# Platform-specific file locking
if os.name == "nt":
import msvcrt
@contextmanager
def _file_lock(file_handle) -> Generator[None, None, None]:
"""Acquire an exclusive lock on a file (Windows)."""
try:
msvcrt.locking(file_handle.fileno(), msvcrt.LK_NBLCK, 1)
yield
finally:
try:
file_handle.seek(0)
msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1)
except OSError:
pass
else:
import fcntl # type: ignore
@contextmanager
def _file_lock(file_handle) -> Generator[None, None, None]:
"""Acquire an exclusive lock on a file (Unix)."""
try:
fcntl.flock(file_handle.fileno(), fcntl.LOCK_EX)
yield
finally:
fcntl.flock(file_handle.fileno(), fcntl.LOCK_UN)
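A usage sketch for the lock helper; the paths are illustrative. Note that the Unix branch blocks until the lock is free, while the Windows branch uses a non-blocking lock and raises OSError if the lock is already held:

import tempfile
from pathlib import Path

lock_path = Path(tempfile.gettempdir()) / "example.lock"
with lock_path.open("w") as handle:
    with _file_lock(handle):
        # Critical section: only one holder of the lock proceeds at a time.
        (Path(tempfile.gettempdir()) / "example.dat").write_text("payload", encoding="utf-8")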
WINDOWS_RESERVED_NAMES = {
"CON",
@@ -119,8 +149,13 @@ class ObjectStorage:
bucket_path.mkdir(parents=True, exist_ok=False)
self._system_bucket_root(bucket_path.name).mkdir(parents=True, exist_ok=True)
def bucket_stats(self, bucket_name: str) -> dict[str, int]:
"""Return object count and total size for the bucket (cached)."""
def bucket_stats(self, bucket_name: str, cache_ttl: int = 60) -> dict[str, int]:
"""Return object count and total size for the bucket (cached).
Args:
bucket_name: Name of the bucket
cache_ttl: Cache time-to-live in seconds (default 60)
"""
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise StorageError("Bucket does not exist")
@@ -129,8 +164,8 @@ class ObjectStorage:
cache_path = self._system_bucket_root(bucket_name) / "stats.json"
if cache_path.exists():
try:
# Check if cache is fresh (e.g., < 60 seconds old)
if time.time() - cache_path.stat().st_mtime < 60:
# Check if cache is fresh
if time.time() - cache_path.stat().st_mtime < cache_ttl:
return json.loads(cache_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
pass
@@ -158,6 +193,14 @@ class ObjectStorage:
return stats
def _invalidate_bucket_stats_cache(self, bucket_id: str) -> None:
"""Invalidate the cached bucket statistics."""
cache_path = self._system_bucket_root(bucket_id) / "stats.json"
try:
cache_path.unlink(missing_ok=True)
except OSError:
pass
def delete_bucket(self, bucket_name: str) -> None:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
@@ -228,6 +271,10 @@ class ObjectStorage:
self._write_metadata(bucket_id, safe_key, metadata)
else:
self._delete_metadata(bucket_id, safe_key)
# Invalidate bucket stats cache
self._invalidate_bucket_stats_cache(bucket_id)
return ObjectMeta(
key=safe_key.as_posix(),
size=stat.st_size,
@@ -261,6 +308,10 @@ class ObjectStorage:
rel = path.relative_to(bucket_path)
self._safe_unlink(path)
self._delete_metadata(bucket_id, rel)
# Invalidate bucket stats cache
self._invalidate_bucket_stats_cache(bucket_id)
for parent in path.parents:
if parent == bucket_path:
break
@@ -284,6 +335,10 @@ class ObjectStorage:
legacy_version_dir = self._legacy_version_dir(bucket_id, rel)
if legacy_version_dir.exists():
shutil.rmtree(legacy_version_dir, ignore_errors=True)
# Invalidate bucket stats cache
self._invalidate_bucket_stats_cache(bucket_id)
for parent in target.parents:
if parent == bucket_path:
break
@@ -356,6 +411,74 @@ class ObjectStorage:
bucket_path = self._require_bucket_path(bucket_name)
self._set_bucket_config_entry(bucket_path.name, "encryption", config_payload or None)
# ---------------------- Object tagging helpers ----------------------
def get_object_tags(self, bucket_name: str, object_key: str) -> List[Dict[str, str]]:
"""Get tags for an object."""
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise StorageError("Bucket does not exist")
safe_key = self._sanitize_object_key(object_key)
object_path = bucket_path / safe_key
if not object_path.exists():
raise StorageError("Object does not exist")
# Tags are stored in the metadata file alongside user metadata
for meta_file in (self._metadata_file(bucket_path.name, safe_key), self._legacy_metadata_file(bucket_path.name, safe_key)):
if not meta_file.exists():
continue
try:
payload = json.loads(meta_file.read_text(encoding="utf-8"))
tags = payload.get("tags")
if isinstance(tags, list):
return tags
return []
except (OSError, json.JSONDecodeError):
return []
return []
def set_object_tags(self, bucket_name: str, object_key: str, tags: Optional[List[Dict[str, str]]]) -> None:
"""Set tags for an object."""
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
raise StorageError("Bucket does not exist")
safe_key = self._sanitize_object_key(object_key)
object_path = bucket_path / safe_key
if not object_path.exists():
raise StorageError("Object does not exist")
meta_file = self._metadata_file(bucket_path.name, safe_key)
# Read existing metadata
existing_payload: Dict[str, Any] = {}
if meta_file.exists():
try:
existing_payload = json.loads(meta_file.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
pass
# Update tags
if tags:
existing_payload["tags"] = tags
else:
existing_payload.pop("tags", None)
# Write back if there's anything to store, otherwise delete
if existing_payload.get("metadata") or existing_payload.get("tags"):
meta_file.parent.mkdir(parents=True, exist_ok=True)
meta_file.write_text(json.dumps(existing_payload), encoding="utf-8")
elif meta_file.exists():
meta_file.unlink()
# Clean up empty parent directories
parent = meta_file.parent
meta_root = self._bucket_meta_root(bucket_path.name)
while parent != meta_root and parent.exists() and not any(parent.iterdir()):
parent.rmdir()
parent = parent.parent
def delete_object_tags(self, bucket_name: str, object_key: str) -> None:
"""Delete all tags from an object."""
self.set_object_tags(bucket_name, object_key, None)
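A sketch of the new tagging helpers, assuming storage is an ObjectStorage instance and the bucket and object shown already exist:

tags = [{"Key": "env", "Value": "prod"}, {"Key": "team", "Value": "media"}]
storage.set_object_tags("photos", "2024/cat.png", tags)
print(storage.get_object_tags("photos", "2024/cat.png"))   # the same list, read back from the metadata file
storage.delete_object_tags("photos", "2024/cat.png")       # equivalent to set_object_tags(..., None)
print(storage.get_object_tags("photos", "2024/cat.png"))   # []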
def list_object_versions(self, bucket_name: str, object_key: str) -> List[Dict[str, Any]]:
bucket_path = self._bucket_path(bucket_name)
if not bucket_path.exists():
@@ -571,29 +694,49 @@ class ObjectStorage:
safe_key = self._sanitize_object_key(manifest["object_key"])
destination = bucket_path / safe_key
destination.parent.mkdir(parents=True, exist_ok=True)
if self._is_versioning_enabled(bucket_path) and destination.exists():
self._archive_current_version(bucket_id, safe_key, reason="overwrite")
checksum = hashlib.md5()
with destination.open("wb") as target:
for _, record in validated:
part_path = upload_root / record["filename"]
if not part_path.exists():
raise StorageError(f"Missing part file {record['filename']}")
with part_path.open("rb") as chunk:
while True:
data = chunk.read(1024 * 1024)
if not data:
break
checksum.update(data)
target.write(data)
# Use a lock file to prevent concurrent writes to the same destination
lock_file_path = self._system_bucket_root(bucket_id) / "locks" / f"{safe_key.as_posix().replace('/', '_')}.lock"
lock_file_path.parent.mkdir(parents=True, exist_ok=True)
try:
with lock_file_path.open("w") as lock_file:
with _file_lock(lock_file):
if self._is_versioning_enabled(bucket_path) and destination.exists():
self._archive_current_version(bucket_id, safe_key, reason="overwrite")
checksum = hashlib.md5()
with destination.open("wb") as target:
for _, record in validated:
part_path = upload_root / record["filename"]
if not part_path.exists():
raise StorageError(f"Missing part file {record['filename']}")
with part_path.open("rb") as chunk:
while True:
data = chunk.read(1024 * 1024)
if not data:
break
checksum.update(data)
target.write(data)
metadata = manifest.get("metadata")
if metadata:
self._write_metadata(bucket_id, safe_key, metadata)
else:
self._delete_metadata(bucket_id, safe_key)
metadata = manifest.get("metadata")
if metadata:
self._write_metadata(bucket_id, safe_key, metadata)
else:
self._delete_metadata(bucket_id, safe_key)
except BlockingIOError:
raise StorageError("Another upload to this key is in progress")
finally:
# Clean up lock file
try:
lock_file_path.unlink(missing_ok=True)
except OSError:
pass
shutil.rmtree(upload_root, ignore_errors=True)
# Invalidate bucket stats cache
self._invalidate_bucket_stats_cache(bucket_id)
stat = destination.stat()
return ObjectMeta(
key=safe_key.as_posix(),

View File

@@ -249,7 +249,8 @@ def buckets_overview():
if bucket.name not in allowed_names:
continue
policy = policy_store.get_policy(bucket.name)
stats = _storage().bucket_stats(bucket.name)
cache_ttl = current_app.config.get("BUCKET_STATS_CACHE_TTL", 60)
stats = _storage().bucket_stats(bucket.name, cache_ttl=cache_ttl)
access_label, access_badge = _bucket_access_descriptor(policy)
visible_buckets.append({
"meta": bucket,
@@ -335,7 +336,7 @@ def bucket_detail(bucket_name: str):
except IamError:
can_manage_versioning = False
# Replication info
# Replication info - don't compute sync status here (it's slow), let JS fetch it async
replication_rule = _replication().get_rule(bucket_name)
connections = _connections().list()
@@ -1178,8 +1179,12 @@ def update_bucket_replication(bucket_name: str):
_replication().delete_rule(bucket_name)
flash("Replication disabled", "info")
else:
from .replication import REPLICATION_MODE_NEW_ONLY, REPLICATION_MODE_ALL
import time
target_conn_id = request.form.get("target_connection_id")
target_bucket = request.form.get("target_bucket", "").strip()
replication_mode = request.form.get("replication_mode", REPLICATION_MODE_NEW_ONLY)
if not target_conn_id or not target_bucket:
flash("Target connection and bucket are required", "danger")
@@ -1188,14 +1193,50 @@ def update_bucket_replication(bucket_name: str):
bucket_name=bucket_name,
target_connection_id=target_conn_id,
target_bucket=target_bucket,
enabled=True
enabled=True,
mode=replication_mode,
created_at=time.time(),
)
_replication().set_rule(rule)
flash("Replication configured", "success")
# If mode is "all", trigger replication of existing objects
if replication_mode == REPLICATION_MODE_ALL:
_replication().replicate_existing_objects(bucket_name)
flash("Replication configured. Existing objects are being replicated in the background.", "success")
else:
flash("Replication configured. Only new uploads will be replicated.", "success")
return redirect(url_for("ui.bucket_detail", bucket_name=bucket_name, tab="replication"))
@ui_bp.get("/buckets/<bucket_name>/replication/status")
def get_replication_status(bucket_name: str):
"""Async endpoint to fetch replication sync status without blocking page load."""
principal = _current_principal()
try:
_authorize_ui(principal, bucket_name, "read")
except IamError:
return jsonify({"error": "Access denied"}), 403
rule = _replication().get_rule(bucket_name)
if not rule:
return jsonify({"error": "No replication rule"}), 404
# This is the slow operation - compute sync status by comparing buckets
stats = _replication().get_sync_status(bucket_name)
if not stats:
return jsonify({"error": "Failed to compute status"}), 500
return jsonify({
"objects_synced": stats.objects_synced,
"objects_pending": stats.objects_pending,
"objects_orphaned": stats.objects_orphaned,
"bytes_synced": stats.bytes_synced,
"last_sync_at": stats.last_sync_at,
"last_sync_key": stats.last_sync_key,
})
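A polling sketch for the new status endpoint; the host, blueprint prefix, and session cookie are assumptions:

import requests

resp = requests.get(
    "http://127.0.0.1:5000/buckets/photos/replication/status",  # adjust host and any blueprint prefix
    cookies={"session": "<ui-session-cookie>"},                  # an authenticated UI session is assumed
    timeout=10,
)
if resp.status_code == 200:
    stats = resp.json()
    print(stats["objects_synced"], stats["objects_pending"], stats["objects_orphaned"])
else:
    print("status unavailable:", resp.status_code)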
@ui_bp.get("/connections")
def connections_dashboard():
principal = _current_principal()
@@ -1227,8 +1268,9 @@ def metrics_dashboard():
total_bytes_used = 0
# Note: Uses cached stats from storage layer to improve performance
cache_ttl = current_app.config.get("BUCKET_STATS_CACHE_TTL", 60)
for bucket in buckets:
stats = storage.bucket_stats(bucket.name)
stats = storage.bucket_stats(bucket.name, cache_ttl=cache_ttl)
total_objects += stats["objects"]
total_bytes_used += stats["bytes"]