Fix list performance for large buckets: delimiter-aware shallow listing, cache TTL increase, UI delimiter streaming. Header badge now shows total bucket objects; fix status bar text concatenation.

2026-02-26 16:29:28 +08:00
parent 5bf7962c04
commit 1c328ee3af
9 changed files with 387 additions and 118 deletions
--- a/app/init.py
+++ b/app/init.py
@@ -115,7 +115,7 @@ def create_app(

    storage = ObjectStorage(
        Path(app.config["STORAGE_ROOT"]),
-        cache_ttl=app.config.get("OBJECT_CACHE_TTL", 5),
+        cache_ttl=app.config.get("OBJECT_CACHE_TTL", 60),
        object_cache_max_size=app.config.get("OBJECT_CACHE_MAX_SIZE", 100),
        bucket_config_cache_ttl=app.config.get("BUCKET_CONFIG_CACHE_TTL_SECONDS", 30.0),
        object_key_max_length_bytes=app.config.get("OBJECT_KEY_MAX_LENGTH_BYTES", 1024),
--- a/app/config.py
+++ b/app/config.py
@@ -241,7 +241,7 @@ class AppConfig:
        cors_expose_headers = _csv(str(_get("CORS_EXPOSE_HEADERS", "*")), ["*"])
        session_lifetime_days = int(_get("SESSION_LIFETIME_DAYS", 30))
        bucket_stats_cache_ttl = int(_get("BUCKET_STATS_CACHE_TTL", 60))
-        object_cache_ttl = int(_get("OBJECT_CACHE_TTL", 5))
+        object_cache_ttl = int(_get("OBJECT_CACHE_TTL", 60))

        encryption_enabled = str(_get("ENCRYPTION_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
        encryption_keys_dir = storage_root / ".myfsio.sys" / "keys"
--- a/app/encrypted_storage.py
+++ b/app/encrypted_storage.py
@@ -189,6 +189,9 @@ class EncryptedObjectStorage:
    
    def list_objects(self, bucket_name: str, **kwargs):
        return self.storage.list_objects(bucket_name, **kwargs)
+
+    def list_objects_shallow(self, bucket_name: str, **kwargs):
+        """Delegate delimiter-aware listing to the wrapped storage, forwarding all keyword arguments unchanged."""
+        return self.storage.list_objects_shallow(bucket_name, **kwargs)
    
    def list_objects_all(self, bucket_name: str):
        return self.storage.list_objects_all(bucket_name)
--- a/app/s3_api.py
+++ b/app/s3_api.py
@@ -2671,54 +2671,43 @@ def bucket_handler(bucket_name: str) -> Response:
    else:
        effective_start = marker
    
-    fetch_keys = max_keys * 10 if delimiter else max_keys
    try:
-        list_result = storage.list_objects(
-            bucket_name,
-            max_keys=fetch_keys,
-            continuation_token=effective_start or None,
-            prefix=prefix or None,
-        )
-        objects = list_result.objects
+        if delimiter:
+            shallow_result = storage.list_objects_shallow(
+                bucket_name,
+                prefix=prefix,
+                delimiter=delimiter,
+                max_keys=max_keys,
+                continuation_token=effective_start or None,
+            )
+            objects = shallow_result.objects
+            common_prefixes = shallow_result.common_prefixes
+            is_truncated = shallow_result.is_truncated
+
+            next_marker = shallow_result.next_continuation_token or ""
+            next_continuation_token = ""
+            if is_truncated and next_marker and list_type == "2":
+                next_continuation_token = base64.urlsafe_b64encode(next_marker.encode()).decode("utf-8")
+        else:
+            list_result = storage.list_objects(
+                bucket_name,
+                max_keys=max_keys,
+                continuation_token=effective_start or None,
+                prefix=prefix or None,
+            )
+            objects = list_result.objects
+            common_prefixes = []
+            is_truncated = list_result.is_truncated
+
+            next_marker = ""
+            next_continuation_token = ""
+            if is_truncated:
+                if objects:
+                    next_marker = objects[-1].key
+                if list_type == "2" and next_marker:
+                    next_continuation_token = base64.urlsafe_b64encode(next_marker.encode()).decode("utf-8")
    except StorageError as exc:
        return _error_response("NoSuchBucket", str(exc), 404)
-    
-    common_prefixes: list[str] = []
-    filtered_objects: list = []
-    if delimiter:
-        seen_prefixes: set[str] = set()
-        for obj in objects:
-            key_after_prefix = obj.key[len(prefix):] if prefix else obj.key
-            if delimiter in key_after_prefix:
-                common_prefix = prefix + key_after_prefix.split(delimiter)[0] + delimiter
-                if common_prefix not in seen_prefixes:
-                    seen_prefixes.add(common_prefix)
-                    common_prefixes.append(common_prefix)
-            else:
-                filtered_objects.append(obj)
-        objects = filtered_objects
-        common_prefixes = sorted(common_prefixes)
-    
-    total_items = len(objects) + len(common_prefixes)
-    is_truncated = total_items > max_keys or list_result.is_truncated
-    
-    if len(objects) >= max_keys:
-        objects = objects[:max_keys]
-        common_prefixes = []
-    else:
-        remaining = max_keys - len(objects)
-        common_prefixes = common_prefixes[:remaining]
-    
-    next_marker = ""
-    next_continuation_token = ""
-    if is_truncated:
-        if objects:
-            next_marker = objects[-1].key
-        elif common_prefixes:
-            next_marker = common_prefixes[-1].rstrip(delimiter) if delimiter else common_prefixes[-1]
-        
-        if list_type == "2" and next_marker:
-            next_continuation_token = base64.urlsafe_b64encode(next_marker.encode()).decode("utf-8")

    if list_type == "2":
        root = Element("ListBucketResult")
--- a/app/s3_client.py
+++ b/app/s3_client.py
@@ -245,6 +245,7 @@ def stream_objects_ndjson(
    url_templates: dict[str, str],
    display_tz: str = "UTC",
    versioning_enabled: bool = False,
+    delimiter: Optional[str] = None,
 ) -> Generator[str, None, None]:
    meta_line = json.dumps({
        "type": "meta",
@@ -258,11 +259,20 @@ def stream_objects_ndjson(
    kwargs: dict[str, Any] = {"Bucket": bucket_name, "MaxKeys": 1000}
    if prefix:
        kwargs["Prefix"] = prefix
+    if delimiter:
+        kwargs["Delimiter"] = delimiter

+    running_count = 0
    try:
        paginator = client.get_paginator("list_objects_v2")
        for page in paginator.paginate(**kwargs):
-            for obj in page.get("Contents", []):
+            for cp in page.get("CommonPrefixes", []):
+                yield json.dumps({
+                    "type": "folder",
+                    "prefix": cp["Prefix"],
+                }) + "\n"
+            page_contents = page.get("Contents", [])
+            for obj in page_contents:
                last_mod = obj["LastModified"]
                yield json.dumps({
                    "type": "object",
@@ -273,6 +283,8 @@ def stream_objects_ndjson(
                    "last_modified_iso": format_datetime_iso(last_mod, display_tz),
                    "etag": obj.get("ETag", "").strip('"'),
                }) + "\n"
+            running_count += len(page_contents)
+            yield json.dumps({"type": "count", "total_count": running_count}) + "\n"
    except ClientError as exc:
        error_msg = exc.response.get("Error", {}).get("Message", "S3 operation failed")
        yield json.dumps({"type": "error", "error": error_msg}) + "\n"
--- a/app/storage.py
+++ b/app/storage.py
@@ -154,6 +154,15 @@ class ListObjectsResult:
    total_count: Optional[int] = None


+@dataclass
+class ShallowListResult:
+    """Result for delimiter-aware directory-level listing.
+
+    Holds one page of a single listing level: the objects found at that
+    level and the common prefixes that stand in for deeper keys.
+    """
+    # Objects included in this page.
+    objects: List[ObjectMeta]
+    # Common prefixes included in this page; each one ends with the delimiter.
+    common_prefixes: List[str]
+    # True when more entries remain beyond this page.
+    is_truncated: bool
+    # Key or prefix to resume after on the next call; None when the listing
+    # is not truncated.
+    next_continuation_token: Optional[str]
+
+
 def _utcnow() -> datetime:
    return datetime.now(timezone.utc)

@@ -279,25 +288,41 @@ class ObjectStorage:
        version_count = 0
        version_bytes = 0

+        internal = self.INTERNAL_FOLDERS
+        bucket_str = str(bucket_path)
+
        try:
-            for path in bucket_path.rglob("*"):
-                if path.is_file():
-                    rel = path.relative_to(bucket_path)
-                    if not rel.parts:
-                        continue
-                    top_folder = rel.parts[0]
-                    if top_folder not in self.INTERNAL_FOLDERS:
-                        stat = path.stat()
-                        object_count += 1
-                        total_bytes += stat.st_size
+            stack = [bucket_str]
+            while stack:
+                current = stack.pop()
+                try:
+                    with os.scandir(current) as it:
+                        for entry in it:
+                            if current == bucket_str and entry.name in internal:
+                                continue
+                            if entry.is_dir(follow_symlinks=False):
+                                stack.append(entry.path)
+                            elif entry.is_file(follow_symlinks=False):
+                                object_count += 1
+                                total_bytes += entry.stat(follow_symlinks=False).st_size
+                except PermissionError:
+                    continue

            versions_root = self._bucket_versions_root(bucket_name)
            if versions_root.exists():
-                for path in versions_root.rglob("*.bin"):
-                    if path.is_file():
-                        stat = path.stat()
-                        version_count += 1
-                        version_bytes += stat.st_size
+                v_stack = [str(versions_root)]
+                while v_stack:
+                    v_current = v_stack.pop()
+                    try:
+                        with os.scandir(v_current) as it:
+                            for entry in it:
+                                if entry.is_dir(follow_symlinks=False):
+                                    v_stack.append(entry.path)
+                                elif entry.is_file(follow_symlinks=False) and entry.name.endswith(".bin"):
+                                    version_count += 1
+                                    version_bytes += entry.stat(follow_symlinks=False).st_size
+                    except PermissionError:
+                        continue
        except OSError:
            if cached_stats is not None:
                return cached_stats
@@ -471,6 +496,202 @@ class ObjectStorage:
        result = self.list_objects(bucket_name, max_keys=100000)
        return result.objects

+    def list_objects_shallow(
+        self,
+        bucket_name: str,
+        *,
+        prefix: str = "",
+        delimiter: str = "/",
+        max_keys: int = 1000,
+        continuation_token: Optional[str] = None,
+    ) -> ShallowListResult:
+        """List a single directory level of a bucket without walking the whole tree.
+
+        When ``delimiter`` is ``"/"`` and ``prefix`` is empty or ends with the
+        delimiter, the directory addressed by ``prefix`` is scanned once with
+        ``os.scandir``: files become objects and sub-directories become common
+        prefixes. Any other delimiter/prefix combination is delegated to
+        ``_shallow_via_full_scan``.
+
+        Args:
+            bucket_name: Bucket to list.
+            prefix: Key prefix selecting the directory to list.
+            delimiter: Grouping delimiter; only ``"/"`` uses the fast path.
+            max_keys: Maximum number of entries (objects plus prefixes) in the
+                returned page. Non-positive values yield an empty,
+                non-truncated page.
+            continuation_token: Key or prefix after which to resume.
+
+        Returns:
+            A ``ShallowListResult`` page. An empty, non-truncated result is
+            returned when the prefix escapes the bucket, does not name a
+            directory, or the directory cannot be read.
+
+        Raises:
+            BucketNotFoundError: If the bucket directory does not exist.
+        """
+        import bisect
+
+        bucket_path = self._bucket_path(bucket_name)
+        if not bucket_path.exists():
+            raise BucketNotFoundError("Bucket does not exist")
+        bucket_id = bucket_path.name
+
+        if delimiter != "/" or (prefix and not prefix.endswith(delimiter)):
+            return self._shallow_via_full_scan(
+                bucket_name, prefix=prefix, delimiter=delimiter,
+                max_keys=max_keys, continuation_token=continuation_token,
+            )
+
+        def _empty() -> ShallowListResult:
+            # A fresh instance per call so callers never share list objects.
+            return ShallowListResult(
+                objects=[], common_prefixes=[],
+                is_truncated=False, next_continuation_token=None,
+            )
+
+        target_dir = bucket_path
+        at_bucket_root = True
+        if prefix:
+            safe_prefix_path = Path(prefix.rstrip("/"))
+            if ".." in safe_prefix_path.parts:
+                return _empty()
+            target_dir = bucket_path / safe_prefix_path
+            try:
+                resolved = target_dir.resolve()
+                bucket_resolved = bucket_path.resolve()
+            except (OSError, ValueError):
+                return _empty()
+            at_bucket_root = resolved == bucket_resolved
+            # Reject prefixes (absolute paths, symlinks) that resolve outside the bucket.
+            if not at_bucket_root and not str(resolved).startswith(str(bucket_resolved) + os.sep):
+                return _empty()
+
+        if not target_dir.is_dir():
+            return _empty()
+
+        # The etag index is best-effort: a missing, unreadable, undecodable or
+        # malformed index simply means entries are listed without etags.
+        meta_cache: Dict[str, str] = {}
+        etag_index_path = self._system_bucket_root(bucket_id) / "etag_index.json"
+        if etag_index_path.exists():
+            try:
+                with open(etag_index_path, 'r', encoding='utf-8') as f:
+                    loaded = json.load(f)
+            except (OSError, ValueError):
+                loaded = None
+            if isinstance(loaded, dict):
+                meta_cache = loaded
+
+        files_map: Dict[str, tuple] = {}
+        entries_dirs: list[str] = []
+
+        try:
+            with os.scandir(str(target_dir)) as it:
+                for entry in it:
+                    name = entry.name
+                    # Internal folders are only skipped at the bucket root,
+                    # matching the bucket-stats walker; the same name deeper
+                    # in the tree is listed as a regular key.
+                    if at_bucket_root and name in self.INTERNAL_FOLDERS:
+                        continue
+                    if entry.is_dir(follow_symlinks=False):
+                        entries_dirs.append(prefix + name + delimiter)
+                    elif entry.is_file(follow_symlinks=False):
+                        key = prefix + name
+                        try:
+                            st = entry.stat()
+                        except OSError:
+                            # Entry vanished or is unreadable; leave it out of the page.
+                            continue
+                        files_map[key] = (key, st.st_size, st.st_mtime, meta_cache.get(key))
+        except OSError:
+            return _empty()
+
+        # File keys and directory prefixes can never collide (prefixes end with
+        # the delimiter), so one sort by key gives the merged ordering.
+        all_items: list[tuple[str, bool]] = [(key, False) for key in files_map]
+        all_items.extend((cp, True) for cp in entries_dirs)
+        all_items.sort(key=lambda item: item[0])
+
+        start_index = 0
+        if continuation_token:
+            all_keys = [item[0] for item in all_items]
+            start_index = bisect.bisect_right(all_keys, continuation_token)
+
+        # Clamp so a zero/negative max_keys yields an empty page instead of a
+        # negative slice bound or a truncated page with no resume token.
+        page_size = max(max_keys, 0)
+        selected = all_items[start_index:start_index + page_size]
+        is_truncated = page_size > 0 and (start_index + page_size) < len(all_items)
+
+        result_objects: list[ObjectMeta] = []
+        result_prefixes: list[str] = []
+        for item_key, is_dir in selected:
+            if is_dir:
+                result_prefixes.append(item_key)
+            else:
+                fdata = files_map[item_key]
+                result_objects.append(ObjectMeta(
+                    key=fdata[0],
+                    size=fdata[1],
+                    last_modified=datetime.fromtimestamp(fdata[2], timezone.utc),
+                    etag=fdata[3],
+                    metadata=None,
+                ))
+
+        next_token = selected[-1][0] if is_truncated and selected else None
+
+        return ShallowListResult(
+            objects=result_objects,
+            common_prefixes=result_prefixes,
+            is_truncated=is_truncated,
+            next_continuation_token=next_token,
+        )
+
+    def _shallow_via_full_scan(
+        self,
+        bucket_name: str,
+        *,
+        prefix: str = "",
+        delimiter: str = "/",
+        max_keys: int = 1000,
+        continuation_token: Optional[str] = None,
+    ) -> ShallowListResult:
+        """Delimiter grouping on top of a flat ``list_objects`` call.
+
+        Fetches up to ``max_keys * 10`` keys via ``list_objects`` and splits
+        them into objects (no delimiter after ``prefix``) and de-duplicated
+        common prefixes (text up to and including the first delimiter after
+        ``prefix``). An empty ``delimiter`` disables grouping, so every key is
+        returned as an object.
+
+        Args:
+            bucket_name: Bucket to list.
+            prefix: Key prefix filter passed to ``list_objects``.
+            delimiter: Grouping delimiter; may be longer than one character.
+            max_keys: Maximum number of entries in the returned page.
+            continuation_token: Key after which ``list_objects`` resumes.
+
+        Returns:
+            A ``ShallowListResult`` page. Objects fill the page first; common
+            prefixes take whatever room is left.
+        """
+        list_result = self.list_objects(
+            bucket_name,
+            max_keys=max_keys * 10,
+            continuation_token=continuation_token,
+            prefix=prefix or None,
+        )
+
+        common_prefixes: list[str] = []
+        filtered_objects: list[ObjectMeta] = []
+        seen_prefixes: set[str] = set()
+
+        for obj in list_result.objects:
+            key_after_prefix = obj.key[len(prefix):] if prefix else obj.key
+            # partition() stops at the first delimiter; an empty delimiter
+            # would raise ValueError, so it is treated as "no grouping".
+            if delimiter:
+                head, sep, _ = key_after_prefix.partition(delimiter)
+            else:
+                head, sep = key_after_prefix, ""
+            if sep:
+                cp = prefix + head + delimiter
+                if cp not in seen_prefixes:
+                    seen_prefixes.add(cp)
+                    common_prefixes.append(cp)
+            else:
+                filtered_objects.append(obj)
+
+        common_prefixes.sort()
+        total_items = len(filtered_objects) + len(common_prefixes)
+        is_truncated = total_items > max_keys or list_result.is_truncated
+
+        if len(filtered_objects) >= max_keys:
+            filtered_objects = filtered_objects[:max_keys]
+            common_prefixes = []
+        else:
+            remaining = max_keys - len(filtered_objects)
+            common_prefixes = common_prefixes[:remaining]
+
+        next_token = None
+        if is_truncated:
+            if filtered_objects:
+                next_token = filtered_objects[-1].key
+            elif common_prefixes:
+                # Remove exactly one trailing delimiter. str.rstrip() would
+                # strip a *set of characters* and over-trim multi-character
+                # delimiters. Common prefixes only exist when the delimiter is
+                # non-empty, so the slice is well-defined.
+                # NOTE(review): resuming after the trimmed prefix may re-list
+                # keys under that prefix - confirm against list_objects'
+                # continuation semantics.
+                next_token = common_prefixes[-1][:-len(delimiter)]
+
+        return ShallowListResult(
+            objects=filtered_objects,
+            common_prefixes=common_prefixes,
+            is_truncated=is_truncated,
+            next_continuation_token=next_token,
+        )
+
    def put_object(
        self,
        bucket_name: str,
--- a/app/ui.py
+++ b/app/ui.py
@@ -616,6 +616,7 @@ def stream_bucket_objects(bucket_name: str):
        return jsonify({"error": str(exc)}), 403

    prefix = request.args.get("prefix") or None
+    delimiter = request.args.get("delimiter") or None

    try:
        client = get_session_s3_client()
@@ -629,6 +630,7 @@ def stream_bucket_objects(bucket_name: str):
    return Response(
        stream_objects_ndjson(
            client, bucket_name, prefix, url_templates, display_tz, versioning_enabled,
+            delimiter=delimiter,
        ),
        mimetype='application/x-ndjson',
        headers={