Fix list performance for large buckets: delimiter-aware shallow listing, cache TTL increase, UI delimiter streaming. header badge shows total bucket objects, fix status bar text concatenation

2026-02-26 16:29:28 +08:00
parent 5bf7962c04
commit 1c328ee3af
9 changed files with 387 additions and 118 deletions
--- a/app/storage.py
+++ b/app/storage.py
@@ -154,6 +154,15 @@ class ListObjectsResult:
    total_count: Optional[int] = None


+@dataclass
+class ShallowListResult:
+    """Result for delimiter-aware directory-level listing."""
+    objects: List[ObjectMeta]
+    common_prefixes: List[str]
+    is_truncated: bool
+    next_continuation_token: Optional[str]
+
+
 def _utcnow() -> datetime:
    return datetime.now(timezone.utc)

@@ -279,25 +288,41 @@ class ObjectStorage:
        version_count = 0
        version_bytes = 0

+        internal = self.INTERNAL_FOLDERS
+        bucket_str = str(bucket_path)
+
        try:
-            for path in bucket_path.rglob("*"):
-                if path.is_file():
-                    rel = path.relative_to(bucket_path)
-                    if not rel.parts:
-                        continue
-                    top_folder = rel.parts[0]
-                    if top_folder not in self.INTERNAL_FOLDERS:
-                        stat = path.stat()
-                        object_count += 1
-                        total_bytes += stat.st_size
+            stack = [bucket_str]
+            while stack:
+                current = stack.pop()
+                try:
+                    with os.scandir(current) as it:
+                        for entry in it:
+                            if current == bucket_str and entry.name in internal:
+                                continue
+                            if entry.is_dir(follow_symlinks=False):
+                                stack.append(entry.path)
+                            elif entry.is_file(follow_symlinks=False):
+                                object_count += 1
+                                total_bytes += entry.stat(follow_symlinks=False).st_size
+                except PermissionError:
+                    continue

            versions_root = self._bucket_versions_root(bucket_name)
            if versions_root.exists():
-                for path in versions_root.rglob("*.bin"):
-                    if path.is_file():
-                        stat = path.stat()
-                        version_count += 1
-                        version_bytes += stat.st_size
+                v_stack = [str(versions_root)]
+                while v_stack:
+                    v_current = v_stack.pop()
+                    try:
+                        with os.scandir(v_current) as it:
+                            for entry in it:
+                                if entry.is_dir(follow_symlinks=False):
+                                    v_stack.append(entry.path)
+                                elif entry.is_file(follow_symlinks=False) and entry.name.endswith(".bin"):
+                                    version_count += 1
+                                    version_bytes += entry.stat(follow_symlinks=False).st_size
+                    except PermissionError:
+                        continue
        except OSError:
            if cached_stats is not None:
                return cached_stats
@@ -471,6 +496,202 @@ class ObjectStorage:
        result = self.list_objects(bucket_name, max_keys=100000)
        return result.objects

+    def list_objects_shallow(
+        self,
+        bucket_name: str,
+        *,
+        prefix: str = "",
+        delimiter: str = "/",
+        max_keys: int = 1000,
+        continuation_token: Optional[str] = None,
+    ) -> ShallowListResult:
+        import bisect
+
+        bucket_path = self._bucket_path(bucket_name)
+        if not bucket_path.exists():
+            raise BucketNotFoundError("Bucket does not exist")
+        bucket_id = bucket_path.name
+
+        if delimiter != "/" or (prefix and not prefix.endswith(delimiter)):
+            return self._shallow_via_full_scan(
+                bucket_name, prefix=prefix, delimiter=delimiter,
+                max_keys=max_keys, continuation_token=continuation_token,
+            )
+
+        target_dir = bucket_path
+        if prefix:
+            safe_prefix_path = Path(prefix.rstrip("/"))
+            if ".." in safe_prefix_path.parts:
+                return ShallowListResult(
+                    objects=[], common_prefixes=[],
+                    is_truncated=False, next_continuation_token=None,
+                )
+            target_dir = bucket_path / safe_prefix_path
+            try:
+                resolved = target_dir.resolve()
+                bucket_resolved = bucket_path.resolve()
+                if not str(resolved).startswith(str(bucket_resolved) + os.sep) and resolved != bucket_resolved:
+                    return ShallowListResult(
+                        objects=[], common_prefixes=[],
+                        is_truncated=False, next_continuation_token=None,
+                    )
+            except (OSError, ValueError):
+                return ShallowListResult(
+                    objects=[], common_prefixes=[],
+                    is_truncated=False, next_continuation_token=None,
+                )
+
+        if not target_dir.exists() or not target_dir.is_dir():
+            return ShallowListResult(
+                objects=[], common_prefixes=[],
+                is_truncated=False, next_continuation_token=None,
+            )
+
+        etag_index_path = self._system_bucket_root(bucket_id) / "etag_index.json"
+        meta_cache: Dict[str, str] = {}
+        if etag_index_path.exists():
+            try:
+                with open(etag_index_path, 'r', encoding='utf-8') as f:
+                    meta_cache = json.load(f)
+            except (OSError, json.JSONDecodeError):
+                pass
+
+        entries_files: list[tuple[str, int, float, Optional[str]]] = []
+        entries_dirs: list[str] = []
+
+        try:
+            with os.scandir(str(target_dir)) as it:
+                for entry in it:
+                    name = entry.name
+                    if name in self.INTERNAL_FOLDERS:
+                        continue
+                    if entry.is_dir(follow_symlinks=False):
+                        cp = prefix + name + delimiter
+                        entries_dirs.append(cp)
+                    elif entry.is_file(follow_symlinks=False):
+                        key = prefix + name
+                        try:
+                            st = entry.stat()
+                            etag = meta_cache.get(key)
+                            entries_files.append((key, st.st_size, st.st_mtime, etag))
+                        except OSError:
+                            pass
+        except OSError:
+            return ShallowListResult(
+                objects=[], common_prefixes=[],
+                is_truncated=False, next_continuation_token=None,
+            )
+
+        entries_dirs.sort()
+        entries_files.sort(key=lambda x: x[0])
+
+        all_items: list[tuple[str, bool]] = []
+        fi, di = 0, 0
+        while fi < len(entries_files) and di < len(entries_dirs):
+            if entries_files[fi][0] <= entries_dirs[di]:
+                all_items.append((entries_files[fi][0], False))
+                fi += 1
+            else:
+                all_items.append((entries_dirs[di], True))
+                di += 1
+        while fi < len(entries_files):
+            all_items.append((entries_files[fi][0], False))
+            fi += 1
+        while di < len(entries_dirs):
+            all_items.append((entries_dirs[di], True))
+            di += 1
+
+        files_map = {e[0]: e for e in entries_files}
+
+        start_index = 0
+        if continuation_token:
+            all_keys = [item[0] for item in all_items]
+            start_index = bisect.bisect_right(all_keys, continuation_token)
+
+        selected = all_items[start_index:start_index + max_keys]
+        is_truncated = (start_index + max_keys) < len(all_items)
+
+        result_objects: list[ObjectMeta] = []
+        result_prefixes: list[str] = []
+        for item_key, is_dir in selected:
+            if is_dir:
+                result_prefixes.append(item_key)
+            else:
+                fdata = files_map[item_key]
+                result_objects.append(ObjectMeta(
+                    key=fdata[0],
+                    size=fdata[1],
+                    last_modified=datetime.fromtimestamp(fdata[2], timezone.utc),
+                    etag=fdata[3],
+                    metadata=None,
+                ))
+
+        next_token = None
+        if is_truncated and selected:
+            next_token = selected[-1][0]
+
+        return ShallowListResult(
+            objects=result_objects,
+            common_prefixes=result_prefixes,
+            is_truncated=is_truncated,
+            next_continuation_token=next_token,
+        )
+
+    def _shallow_via_full_scan(
+        self,
+        bucket_name: str,
+        *,
+        prefix: str = "",
+        delimiter: str = "/",
+        max_keys: int = 1000,
+        continuation_token: Optional[str] = None,
+    ) -> ShallowListResult:
+        list_result = self.list_objects(
+            bucket_name,
+            max_keys=max_keys * 10,
+            continuation_token=continuation_token,
+            prefix=prefix or None,
+        )
+
+        common_prefixes: list[str] = []
+        filtered_objects: list[ObjectMeta] = []
+        seen_prefixes: set[str] = set()
+
+        for obj in list_result.objects:
+            key_after_prefix = obj.key[len(prefix):] if prefix else obj.key
+            if delimiter in key_after_prefix:
+                cp = prefix + key_after_prefix.split(delimiter)[0] + delimiter
+                if cp not in seen_prefixes:
+                    seen_prefixes.add(cp)
+                    common_prefixes.append(cp)
+            else:
+                filtered_objects.append(obj)
+
+        common_prefixes.sort()
+        total_items = len(filtered_objects) + len(common_prefixes)
+        is_truncated = total_items > max_keys or list_result.is_truncated
+
+        if len(filtered_objects) >= max_keys:
+            filtered_objects = filtered_objects[:max_keys]
+            common_prefixes = []
+        else:
+            remaining = max_keys - len(filtered_objects)
+            common_prefixes = common_prefixes[:remaining]
+
+        next_token = None
+        if is_truncated:
+            if filtered_objects:
+                next_token = filtered_objects[-1].key
+            elif common_prefixes:
+                next_token = common_prefixes[-1].rstrip(delimiter) if delimiter else common_prefixes[-1]
+
+        return ShallowListResult(
+            objects=filtered_objects,
+            common_prefixes=common_prefixes,
+            is_truncated=is_truncated,
+            next_continuation_token=next_token,
+        )
+
    def put_object(
        self,
        bucket_name: str,