Bypass boto3 proxy for object streaming, read directly from storage layer; Add streaming object iterator to eliminate O(n²) directory rescanning on large buckets; Add iter_objects_shallow delegation to EncryptedObjectStorage
This commit is contained in:
@@ -193,6 +193,9 @@ class EncryptedObjectStorage:
|
|||||||
def list_objects_shallow(self, bucket_name: str, **kwargs):
|
def list_objects_shallow(self, bucket_name: str, **kwargs):
|
||||||
return self.storage.list_objects_shallow(bucket_name, **kwargs)
|
return self.storage.list_objects_shallow(bucket_name, **kwargs)
|
||||||
|
|
||||||
|
def iter_objects_shallow(self, bucket_name: str, **kwargs):
|
||||||
|
return self.storage.iter_objects_shallow(bucket_name, **kwargs)
|
||||||
|
|
||||||
def search_objects(self, bucket_name: str, query: str, **kwargs):
|
def search_objects(self, bucket_name: str, query: str, **kwargs):
|
||||||
return self.storage.search_objects(bucket_name, query, **kwargs)
|
return self.storage.search_objects(bucket_name, query, **kwargs)
|
||||||
|
|
||||||
|
|||||||
@@ -714,6 +714,73 @@ class ObjectStorage:
|
|||||||
next_continuation_token=next_token,
|
next_continuation_token=next_token,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def iter_objects_shallow(
|
||||||
|
self,
|
||||||
|
bucket_name: str,
|
||||||
|
*,
|
||||||
|
prefix: str = "",
|
||||||
|
delimiter: str = "/",
|
||||||
|
) -> Generator[tuple[str, ObjectMeta | str], None, None]:
|
||||||
|
bucket_path = self._bucket_path(bucket_name)
|
||||||
|
if not bucket_path.exists():
|
||||||
|
raise BucketNotFoundError("Bucket does not exist")
|
||||||
|
bucket_id = bucket_path.name
|
||||||
|
|
||||||
|
target_dir = bucket_path
|
||||||
|
if prefix:
|
||||||
|
safe_prefix_path = Path(prefix.rstrip("/"))
|
||||||
|
if ".." in safe_prefix_path.parts:
|
||||||
|
return
|
||||||
|
target_dir = bucket_path / safe_prefix_path
|
||||||
|
try:
|
||||||
|
resolved = target_dir.resolve()
|
||||||
|
bucket_resolved = bucket_path.resolve()
|
||||||
|
if not str(resolved).startswith(str(bucket_resolved) + os.sep) and resolved != bucket_resolved:
|
||||||
|
return
|
||||||
|
except (OSError, ValueError):
|
||||||
|
return
|
||||||
|
|
||||||
|
if not target_dir.exists() or not target_dir.is_dir():
|
||||||
|
return
|
||||||
|
|
||||||
|
etag_index_path = self._system_bucket_root(bucket_id) / "etag_index.json"
|
||||||
|
meta_cache: Dict[str, str] = {}
|
||||||
|
if etag_index_path.exists():
|
||||||
|
try:
|
||||||
|
with open(etag_index_path, 'r', encoding='utf-8') as f:
|
||||||
|
meta_cache = json.load(f)
|
||||||
|
except (OSError, json.JSONDecodeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
with os.scandir(str(target_dir)) as it:
|
||||||
|
for entry in it:
|
||||||
|
name = entry.name
|
||||||
|
if name in self.INTERNAL_FOLDERS:
|
||||||
|
continue
|
||||||
|
if entry.is_dir(follow_symlinks=False):
|
||||||
|
yield ("folder", prefix + name + delimiter)
|
||||||
|
elif entry.is_file(follow_symlinks=False):
|
||||||
|
key = prefix + name
|
||||||
|
try:
|
||||||
|
st = entry.stat()
|
||||||
|
etag = meta_cache.get(key)
|
||||||
|
if etag is None:
|
||||||
|
safe_key = PurePosixPath(key)
|
||||||
|
meta = self._read_metadata(bucket_id, Path(safe_key))
|
||||||
|
etag = meta.get("__etag__") if meta else None
|
||||||
|
yield ("object", ObjectMeta(
|
||||||
|
key=key,
|
||||||
|
size=st.st_size,
|
||||||
|
last_modified=datetime.fromtimestamp(st.st_mtime, timezone.utc),
|
||||||
|
etag=etag,
|
||||||
|
metadata=None,
|
||||||
|
))
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
except OSError:
|
||||||
|
return
|
||||||
|
|
||||||
def _shallow_via_full_scan(
|
def _shallow_via_full_scan(
|
||||||
self,
|
self,
|
||||||
bucket_name: str,
|
bucket_name: str,
|
||||||
|
|||||||
75
app/ui.py
75
app/ui.py
@@ -618,20 +618,77 @@ def stream_bucket_objects(bucket_name: str):
|
|||||||
prefix = request.args.get("prefix") or None
|
prefix = request.args.get("prefix") or None
|
||||||
delimiter = request.args.get("delimiter") or None
|
delimiter = request.args.get("delimiter") or None
|
||||||
|
|
||||||
|
storage = _storage()
|
||||||
try:
|
try:
|
||||||
client = get_session_s3_client()
|
versioning_enabled = storage.is_versioning_enabled(bucket_name)
|
||||||
except (PermissionError, RuntimeError) as exc:
|
except StorageError:
|
||||||
return jsonify({"error": str(exc)}), 403
|
versioning_enabled = False
|
||||||
|
|
||||||
versioning_enabled = get_versioning_via_s3(client, bucket_name)
|
|
||||||
url_templates = build_url_templates(bucket_name)
|
url_templates = build_url_templates(bucket_name)
|
||||||
display_tz = current_app.config.get("DISPLAY_TIMEZONE", "UTC")
|
display_tz = current_app.config.get("DISPLAY_TIMEZONE", "UTC")
|
||||||
|
|
||||||
|
def generate():
|
||||||
|
yield json.dumps({
|
||||||
|
"type": "meta",
|
||||||
|
"versioning_enabled": versioning_enabled,
|
||||||
|
"url_templates": url_templates,
|
||||||
|
}) + "\n"
|
||||||
|
yield json.dumps({"type": "count", "total_count": 0}) + "\n"
|
||||||
|
|
||||||
|
running_count = 0
|
||||||
|
try:
|
||||||
|
if delimiter:
|
||||||
|
for item_type, item in storage.iter_objects_shallow(
|
||||||
|
bucket_name, prefix=prefix or "", delimiter=delimiter,
|
||||||
|
):
|
||||||
|
if item_type == "folder":
|
||||||
|
yield json.dumps({"type": "folder", "prefix": item}) + "\n"
|
||||||
|
else:
|
||||||
|
last_mod = item.last_modified
|
||||||
|
yield json.dumps({
|
||||||
|
"type": "object",
|
||||||
|
"key": item.key,
|
||||||
|
"size": item.size,
|
||||||
|
"last_modified": last_mod.isoformat(),
|
||||||
|
"last_modified_display": _format_datetime_display(last_mod, display_tz),
|
||||||
|
"last_modified_iso": _format_datetime_iso(last_mod, display_tz),
|
||||||
|
"etag": item.etag or "",
|
||||||
|
}) + "\n"
|
||||||
|
running_count += 1
|
||||||
|
if running_count % 1000 == 0:
|
||||||
|
yield json.dumps({"type": "count", "total_count": running_count}) + "\n"
|
||||||
|
else:
|
||||||
|
continuation_token = None
|
||||||
|
while True:
|
||||||
|
result = storage.list_objects(
|
||||||
|
bucket_name,
|
||||||
|
max_keys=1000,
|
||||||
|
continuation_token=continuation_token,
|
||||||
|
prefix=prefix,
|
||||||
|
)
|
||||||
|
for obj in result.objects:
|
||||||
|
last_mod = obj.last_modified
|
||||||
|
yield json.dumps({
|
||||||
|
"type": "object",
|
||||||
|
"key": obj.key,
|
||||||
|
"size": obj.size,
|
||||||
|
"last_modified": last_mod.isoformat(),
|
||||||
|
"last_modified_display": _format_datetime_display(last_mod, display_tz),
|
||||||
|
"last_modified_iso": _format_datetime_iso(last_mod, display_tz),
|
||||||
|
"etag": obj.etag or "",
|
||||||
|
}) + "\n"
|
||||||
|
running_count += len(result.objects)
|
||||||
|
yield json.dumps({"type": "count", "total_count": running_count}) + "\n"
|
||||||
|
if not result.is_truncated:
|
||||||
|
break
|
||||||
|
continuation_token = result.next_continuation_token
|
||||||
|
except StorageError as exc:
|
||||||
|
yield json.dumps({"type": "error", "error": str(exc)}) + "\n"
|
||||||
|
return
|
||||||
|
yield json.dumps({"type": "count", "total_count": running_count}) + "\n"
|
||||||
|
yield json.dumps({"type": "done"}) + "\n"
|
||||||
|
|
||||||
return Response(
|
return Response(
|
||||||
stream_objects_ndjson(
|
generate(),
|
||||||
client, bucket_name, prefix, url_templates, display_tz, versioning_enabled,
|
|
||||||
delimiter=delimiter,
|
|
||||||
),
|
|
||||||
mimetype='application/x-ndjson',
|
mimetype='application/x-ndjson',
|
||||||
headers={
|
headers={
|
||||||
'Cache-Control': 'no-cache',
|
'Cache-Control': 'no-cache',
|
||||||
|
|||||||
Reference in New Issue
Block a user