238 Commits

Author SHA1 Message Date
ad7b2a02cb Add missing endpoints for Rust S3 API 2026-04-05 15:22:24 +08:00
72ddd9822c Add docker support for rust integration 2026-04-03 12:31:11 +08:00
4c30efd802 Update myfsio rust engines - added more implementations 2026-04-02 21:57:16 +08:00
926a7e6366 Add Rust storage engine foundation 2026-04-02 17:00:58 +08:00
1eadc7b75c Fix more-actions dropdown positioning: use Popper fixed strategy instead of raw CSS position:fixed 2026-04-01 16:24:42 +08:00
4a224a127b Fix more-actions dropdown triggering row selection on object list 2026-04-01 16:17:29 +08:00
c498fe7aee Add self-heal missing ETags and harden ETag index persistence 2026-03-31 21:10:47 +08:00
3838aed954 Fix presigned URL security vulnerabilities: enforce key/user status in SigV4 paths, remove duplicate verification, remove X-Forwarded-Host trust 2026-03-31 20:27:18 +08:00
6a193dbb1c Add --version option for run.py 2026-03-31 17:21:33 +08:00
e94b341a5b Add robust myfsio_core staleness detection with Python fallback; document Rust extension build in README 2026-03-31 17:13:05 +08:00
2ad3736852 Add intra-bucket cursor tracking to integrity scanner for progressive full coverage; Optimize integrity scanner: early batch exit, lazy sorted walk, cursor-aware index reads 2026-03-31 17:04:28 +08:00
f05b2668c0 Reduce per-request overhead: pre-compile SigV4 regex, in-memory etag index cache, 1MB GET chunks, configurable meta cache, skip fsync for rebuildable caches 2026-03-25 13:44:34 +08:00
f7c1c1f809 Update requirements.txt 2026-03-25 13:26:42 +08:00
0e392e18b4 Hide ghost details in object panel when preview fails to load 2026-03-24 15:15:03 +08:00
8996f1ce06 Fix folder selection not showing delete button in bucket browser 2026-03-24 12:10:38 +08:00
f60dbaf9c9 Respect DISPLAY_TIMEZONE in GC and integrity scanner history tables 2026-03-23 18:36:13 +08:00
1a5a7aa9e1 Auto-refresh Recent Scans/Executions tables after GC and integrity scan completion 2026-03-23 18:31:13 +08:00
326367ae4c Fix integrity scanner batch limit and add cursor-based rotation 2026-03-23 17:46:27 +08:00
a7f9b0a22f Convert GC to async with polling to prevent proxy timeouts 2026-03-23 17:14:04 +08:00
0e525713b1 Fix missing CSRF token on presigned URL request 2026-03-23 16:48:25 +08:00
f43fad02fb Replace fetch with XHR for multipart upload progress and add retry logic 2026-03-23 16:27:28 +08:00
eff3e378f3 Fix mobile infinite scroll on object list and ghost preview on fast object swap 2026-03-23 11:55:46 +08:00
5e32cef792 Add I/O throttling to GC and integrity scanner to prevent HDD starvation 2026-03-23 11:36:38 +08:00
9898167f8d Make integrity scan async with progress indicator in UI 2026-03-22 14:17:43 +08:00
4a553555d3 Clean up debug code 2026-03-22 11:38:29 +08:00
7a3202c996 Possible fix for the issue 2026-03-22 11:27:52 +08:00
bd20ca86ab Further debugging on s3 api issues on Granian 2026-03-22 11:22:24 +08:00
532cf95d59 Debug s3 api issues on Granian 2026-03-22 11:14:32 +08:00
366f8ce60d the middleware now also triggers when Content-Length is '0' but X-Amz-Decoded-Content-Length or aws-chunked headers indicate a body should be present 2026-03-22 00:24:04 +08:00
7612cb054a further fixes 2026-03-22 00:16:30 +08:00
966d524dca Fix 0-byte uploads caused by Granian stripping Expect header and missing CONTENT_LENGTH for chunked transfers 2026-03-22 00:04:55 +08:00
e84f1f1851 Fix SigV4 SignatureDoesNotMatch when Expect header is stripped by WSGI server 2026-03-21 23:48:19 +08:00
a059f0502d Fix 0-byte uploads caused by Granian default buffer size; Add SERVER_MAX_BUFFER_SIZE config 2026-03-21 22:57:48 +08:00
afd7173ba0 Fix buttons all showing Running state when only one action is triggered 2026-03-21 14:51:43 +08:00
c807bb2388 Update install/uninstall scripts for encrypted IAM config 2026-03-20 17:51:00 +08:00
aa4f9f5566 Bypass boto3 proxy for object streaming, read directly from storage layer; Add streaming object iterator to eliminate O(n²) directory rescanning on large buckets; Add iter_objects_shallow delegation to EncryptedObjectStorage 2026-03-20 17:35:10 +08:00
14786151e5 Fix selected object losing highlight on scroll in virtual list 2026-03-20 12:10:26 +08:00
a496862902 Fix stale object count on dashboard after deleting all objects in bucket 2026-03-17 23:25:30 +08:00
df4f27ca2e Fix IAM policy editor injecting prefix on existing policies without one 2026-03-15 16:04:35 +08:00
d72e0a347e Overhaul IAM: granular actions, multi-key users, prefix-scoped policies 2026-03-14 23:50:44 +08:00
6ed4b7d8ea Add System page: server info, feature flags, GC and integrity scanner UI 2026-03-14 20:27:57 +08:00
31ebbea680 Fix Docker healthcheck failure: Granian cannot run inside daemon process 2026-03-14 18:31:12 +08:00
d878134ebf Switch from Waitress to Granian (Rust/hyper WSGI server) for improved concurrency 2026-03-14 18:17:39 +08:00
55568d6892 Fix video seekbar in static website hosting by adding HTTP Range request support 2026-03-10 22:21:55 +08:00
a4ae81c77c Add integrity scanner: background detection and healing of corrupted objects, orphaned files, phantom metadata, stale versions, etag cache inconsistencies, and legacy metadata drift 2026-03-10 22:14:39 +08:00
9da7104887 Redesign tags UI: split pills, grid editor with column headers, ghost delete buttons 2026-03-10 17:48:17 +08:00
de5377e5ac Add garbage collection: background cleanup of orphaned temp files, multipart uploads, lock files, metadata, versions, and empty directories 2026-03-09 17:34:21 +08:00
80b77b64eb Fix bucket dashboard missing created date and incorrect object count badge in folder view 2026-03-09 15:27:08 +08:00
6c912a3d71 Add conditional GET/HEAD headers: If-Match, If-None-Match, If-Modified-Since, If-Unmodified-Since 2026-03-09 15:09:15 +08:00
c6e368324a Update docs.md and docs.html for credential expiry, IAM encryption, admin key env vars, and --reset-cred 2026-03-08 13:38:44 +08:00
7b6c096bb7 Remove the check out the documentation paragraph at login page 2026-03-08 13:18:03 +08:00
03353a0aec Add credential expiry support: per-user expires_at with UI management, presets, and badge indicators; Fix IAM card dropdown clipped by overflow: remove gradient bar, allow overflow visible 2026-03-08 13:08:57 +08:00
72f5d9d70c Restore data integrity guarantees: Content-MD5 validation, fsync durability, atomic metadata writes, concurrent write protection 2026-03-07 17:54:00 +08:00
be63e27c15 Reduce per-request CPU overhead: eliminate double stat(), cache content type and policy context, gate logging, configurable stat intervals 2026-03-07 14:08:23 +08:00
81ef0fe4c7 Fix stale object count in bucket header and metrics dashboard after deletes 2026-03-03 19:42:37 +08:00
5f24bd920d Reduce P99 tail latency: defer etag index writes, eliminate double cache rebuild, skip redundant stat() in bucket config 2026-03-02 22:39:37 +08:00
8552f193de Reduce CPU/lock contention under concurrent uploads: split cache lock, in-memory stats, dict copy, lightweight request IDs, defaultdict metrics 2026-03-02 22:05:54 +08:00
5536330aeb Move performance-critical Python functions to Rust: streaming I/O, multipart assembly, and AES-256-GCM encryption 2026-02-27 22:55:20 +08:00
d4657c389d Fix misleading default credentials in README to match actual random generation behavior 2026-02-27 21:58:10 +08:00
3827235232 Reduce CPU usage on heavy uploads: skip SHA256 body hashing in SigV4, use Rust md5_file post-write instead of per-chunk _HashingReader 2026-02-27 21:57:13 +08:00
dfc0058d0d Extend myfsio_core Rust extension with 7 storage hot paths (directory scanning, metadata I/O, object listing, search, bucket stats, cache building) 2026-02-27 12:22:39 +08:00
27aef84311 Fix rclone CopyObject SignatureDoesNotMatch caused by internal metadata leaking as X-Amz-Meta headers 2026-02-26 21:39:43 +08:00
5003514a3d Fix null ETags in shallow listing by updating etag index on store/delete 2026-02-26 18:09:08 +08:00
20a314e030 Fix incorrect Upgrading & Updates section in Docs 2026-02-26 17:49:59 +08:00
d8232340c3 Update docs 2026-02-26 17:38:44 +08:00
a356bb0c4e perf: shallow listing, os.scandir stats, server-side search for large buckets 2026-02-26 17:11:07 +08:00
1c328ee3af Fix list performance for large buckets: delimiter-aware shallow listing, cache TTL increase, UI delimiter streaming. header badge shows total bucket objects, fix status bar text concatenation 2026-02-26 16:29:28 +08:00
5bf7962c04 Fix UI: versioning modals and object browser panel showing 'null' 2026-02-24 20:41:39 +08:00
e06f653606 Fix version panel showing 'null' instead of timestamp, exclude current version from list, auto-refresh versions after upload 2026-02-24 17:19:12 +08:00
9c2809c195 Backwards compatibility for Proxy trust config 2026-02-22 18:03:38 +08:00
fb32ca0a7d Harden security: fail-closed policies, presigned URL time/expiry validation, SSRF DNS pinning, lockout cap, proxy trust config 2026-02-22 17:55:40 +08:00
6ab702a818 Use cached etag in HEAD instead of re-hashing entire file 2026-02-22 16:01:46 +08:00
550e7d435c Move SigV4 canonical request construction to Rust unified verify function 2026-02-22 14:03:12 +08:00
776967e80d Add Rust index reader, metadata read cache, and 256KB stream chunks 2026-02-19 23:01:40 +08:00
082a7fbcd1 Move index JSON read to Rust for GIL-released parsing (serde_json) 2026-02-19 22:43:28 +08:00
ff287cf67b Improve Sites page UI/UX: dropdown actions, collapsible forms, AJAX submissions, Check All Health, safer selectors 2026-02-16 22:04:46 +08:00
bddf36d52d Fix domain mapping cross-process staleness, filter bucket dropdown to website-enabled only 2026-02-16 17:48:21 +08:00
cf6cec9cab Add 5 missing S3 API operations: DeleteBucketEncryption, GetObjectAcl, PutObjectAcl, GetObjectAttributes, GetBucketPolicyStatus 2026-02-16 16:41:27 +08:00
d425839e57 Remove Rust build artifacts from tracking, update .gitignore 2026-02-16 16:06:42 +08:00
4c661477d5 Add Rust extension module (myfsio_core) for SigV4, hashing, and validation hot paths 2026-02-16 16:04:15 +08:00
f3f52f14a5 Fix domain mapping bugs and improve UI/UX: normalize domains, fix delete, add validation and search 2026-02-16 00:51:19 +08:00
d19ba3e305 UI/UX enhancements to IAM page: role badges, search, copy keys, improved policy display 2026-02-16 00:40:04 +08:00
c627f41f53 UI/UX enhancements to Metrics page 2026-02-15 23:56:18 +08:00
bcad0cd3da Improve web UI: sort/search/context menu, fix security and UX bugs 2026-02-15 23:30:26 +08:00
67f057ca1c Add static website hosting 2026-02-15 20:57:02 +08:00
01e79e6993 Fix object browser UI issues 2026-02-10 11:41:02 +08:00
1e3c4b545f Migrate UI backend from direct storage calls to S3 API proxy via boto3 2026-02-09 22:33:47 +08:00
4ecd32a554 Fix empty UI on large bucket first load: keep loading row during streaming, add progress indicator, throttle renders 2026-02-09 19:29:50 +08:00
aa6d7c4d28 Optimize replication failure caching, batch UI auth checks, add bulk download size limit, background parent cleanup 2026-02-09 18:23:45 +08:00
6e6d6d32bf Optimize KMS: cache AESGCM instance, remove duplicate get_provider 2026-02-09 17:01:19 +08:00
54705ab9c4 Fix Content-Length mismatch on range requests (206 Partial Content) 2026-02-06 16:14:35 +08:00
77a46d0725 Binary run fix 2026-02-05 23:49:36 +08:00
0f750b9d89 Optimize object browser for large listings on slow networks 2026-02-05 22:56:00 +08:00
e0dee9db36 Fix UI object browser not showing objects uploaded via S3 API 2026-02-05 22:22:59 +08:00
126657c99f Further debugging of object browser object count delay 2026-02-05 21:45:02 +08:00
07fb1ac773 Fix cross-process cache invalidation on Windows using version counter instead of mtime 2026-02-05 21:32:40 +08:00
147962e1dd Further debugging of object browser object count delay 2026-02-05 21:18:35 +08:00
2643a79121 Debug object browser object count delay 2026-02-05 21:08:18 +08:00
e9a035827b Add _touch_cache_marker for UI object delay count issue 2026-02-05 20:56:42 +08:00
033b8a82be Fix error handlers for API mode; distinguish files from directories in object lookup; Fix UI not showing newly uploaded objects by adding Cache-Control headers 2026-02-05 20:44:11 +08:00
e76c311231 Update install/uninstall scripts with new config options and credential capture 2026-02-05 19:21:18 +08:00
cbdf1a27c8 Pin dockerfile python version to 3.14.3 2026-02-05 19:11:42 +08:00
4a60cb269a Update python version in Dockerfile 2026-02-05 19:11:00 +08:00
ebe7f6222d Fix hardcoded secret key ttl session 2026-02-05 19:08:18 +08:00
70b61fd8e6 Further optimize CPU usage; Improve security and performance; 4 bug fixes. 2026-02-05 17:45:34 +08:00
a779b002d7 Optimize CPU usage via caching and reducing ThreadPoolExecutor workers to prevent CPU saturation 2026-02-02 13:30:06 +08:00
45d21cce21 Add ALLOW_INTERNAL_ENDPOINTS config for self-hosted internal network deployments 2026-02-01 18:26:14 +08:00
9629507acd Fix auth bypass, user enumeration, xml DoS, multipart race, path traversal unicode, silent permissions failures, data key without AAD, KMS streaming 2026-02-01 18:12:03 +08:00
5d6cb4efa1 Update documentation 2026-02-01 15:18:20 +08:00
56ad83bbaf Fix bidirectional sync UI issues 2026-02-01 14:56:20 +08:00
847933b7c0 Add UI endpoint for bidirectional-status to fix 403 auth error 2026-02-01 14:30:55 +08:00
be55d08c0a Fix bidirectional-status 404 when UI runs separately from API 2026-02-01 14:23:35 +08:00
8c4bf67974 Fix 15 security vulnerabilities across auth, storage, and API modules 2026-01-31 00:55:27 +08:00
9385d1fe1c Add 4 new S3 APIs: UploadPartCopy, Bucket Replication, PostObject, SelectObjectContent 2026-01-29 12:51:00 +08:00
0ea54457e8 Fix 17 security vulnerabilities across encryption, auth, and API modules 2026-01-29 12:05:35 +08:00
ae26d22388 Add bidirectional replication setup verification and improved UX warnings 2026-01-26 23:29:20 +08:00
6b715851b9 Add replication setup wizard and site-level sync dashboard for site registry 2026-01-26 21:39:47 +08:00
62c36f7a6c Add site registry UI and update documentation for geo-distribution 2026-01-26 19:49:23 +08:00
b32f1f94f7 Add configurable env variables for hardcoded timeouts and limits 2026-01-25 23:32:36 +08:00
6e3d280a75 Add SlowDown error code tracking for 429 rate limit responses 2026-01-25 21:29:58 +08:00
704f79dc44 Add configurable rate limits for S3 API endpoints 2026-01-25 20:15:38 +08:00
87c7f1bc7d Add bidirectional mode option to replication panel UI 2026-01-25 12:35:14 +08:00
23ea164215 Add bi-directional site replication with LWW conflict resolution 2026-01-24 19:38:17 +08:00
7a8acfb933 Add missing lifecycle and cors actions to Full control template 2026-01-22 11:12:23 +08:00
71327bcbf1 Add dynamic updates to System Health section on metrics page 2026-01-22 11:06:53 +08:00
c0603c592b Add configurable server threads and connections 2026-01-22 10:58:44 +08:00
912a7dc74f Add background collection for system metrics 2026-01-20 00:00:31 +08:00
4de936cea9 Update docs 2026-01-19 12:33:47 +08:00
adb9017580 Add operation metrics with logging integration in metrics UI 2026-01-18 23:50:47 +08:00
4adfcc4131 Improve pytest tests 2026-01-18 21:53:39 +08:00
ebc315c1cc Fix routing conflicts: move admin endpoints to reserved paths 2026-01-18 21:35:39 +08:00
5ab62a00ff Fix security vulnerabilities: XXE, timing attacks, info leaks 2026-01-18 17:18:12 +08:00
9c3518de63 Add new filetype previews; Remove metadata from bucket streaming 2026-01-17 15:40:58 +08:00
a52657e684 enhance date formats with timezone 2026-01-16 20:19:52 +08:00
53297abe1e Add metrics history with charts, fix percentage formatting to 2 d.p. 2026-01-16 19:57:23 +08:00
a3b9db544c Add file type icons, enhance bucket date format, fix metadata display bug 2026-01-16 13:18:06 +08:00
f5d2e1c488 Fix last_modified field still returning wrong timezone 2026-01-14 23:07:47 +08:00
f04c6a9cdc Fix reflect time zone in object browser 2026-01-14 22:51:41 +08:00
7a494abb96 Reflect timezone in Object Details; Fix latest IAM bucket policy bugs 2026-01-14 22:47:29 +08:00
956d17a649 Add new bucket policies; update docs 2026-01-14 22:05:31 +08:00
5522f9ac04 Fix missing column for 'Abort Incomplete MPU' in the lifecycle panel 2026-01-14 21:48:06 +08:00
3742f0228e Fix timezone UI not reflecting correctly 2026-01-14 21:41:41 +08:00
ba694cb717 Fix bucket policy Condition evaluation by passing request context 2026-01-12 23:33:35 +08:00
433d291b4b Fix missing lifecycle configs; Remove Load more button; Remove batch dropdown; Add bucket policy Condition support 2026-01-12 22:36:26 +08:00
e3509e997f Add AGPLv3 license 2026-01-12 15:52:34 +08:00
1c30200db0 UI: Add upload cancellation toggle 2026-01-12 15:35:20 +08:00
7ff422d4dc Fix multipart upload listing API and remove duplicate upload notification 2026-01-12 15:20:45 +08:00
546d51af9a Optimize object listing for 100K+ objects with streaming and compression 2026-01-12 14:25:07 +08:00
0d1fe05fd0 Implement dynamic UI loading 2026-01-11 22:36:04 +08:00
c5d4b2f1cd Fix UI/UX issues: lifecycle warnings, CORS tooltip, IAM overflow, config validation, JS maintenance 2026-01-09 12:47:37 +08:00
a5d19e2982 Replace confirm() with modal for clear failures; add loading states to retry buttons 2026-01-05 23:40:37 +08:00
692e7e3a6e Further fix on text overflow in failed replications table 2026-01-05 23:27:42 +08:00
78dba93ee0 Fix text overflow in failed replications table 2026-01-05 23:19:51 +08:00
93a5aa6618 Add replication failure tracking and lifecycle execution history 2026-01-05 00:18:08 +08:00
9ab750650c Update docs; Improve new panel icons 2026-01-04 21:28:37 +08:00
609e9db2f7 Add redirect /ui/buckets URL to /ui/ 2026-01-04 14:34:59 +08:00
94a55cf2b7 Revamp navbar - implement new sidepanel navigation 2026-01-04 14:00:03 +08:00
b9cfc45aa2 Add new tests; Fix typo and validations 2026-01-03 23:29:07 +08:00
2d60e36fbf Fix multipart upload failure; Improve upload UX; 2026-01-03 17:27:46 +08:00
c78f7fa6b0 Fix IAM message box modal delete user icon 2026-01-01 22:55:21 +08:00
b3dce8d13e Fix Remove fallback ETag, make etag optional, fix multipart ETag storage, fix request entity too large error due to mishandled multipart uploads 2026-01-01 21:51:01 +08:00
e792b86485 (UI): Add lifecycle, CORS, ACL, move/copy objects functionalities 2026-01-01 16:48:44 +08:00
cdb86aeea7 Implement Object Lock, Event Notifications, SSE-C, and Access Logging 2025-12-31 23:40:46 +08:00
cdbc156b5b Implement 9 S3 compatibility features: ACLs, range requests, lifecycle enforcement, replication ALL mode, bulk delete with VersionId, KMS integration, copy conditionals, response header overrides, and SigV4 session tokens 2025-12-31 19:12:54 +08:00
1df8ff9d25 Clean up code comments 2025-12-31 18:00:03 +08:00
05f1b00473 Update Dockerfile Python runtime 2025-12-31 14:13:30 +08:00
5ebc97300e Update README 2025-12-31 14:12:37 +08:00
d2f9c3bded Update README 2025-12-31 14:10:55 +08:00
9f347f2caa Fix brand typos 2025-12-31 14:02:50 +08:00
4ab58e59c2 Optimize S3 performance: add caching, per-bucket locks, streaming encryption 2025-12-29 18:12:28 +08:00
32232211a1 Revamp UI/UX: bucket icons, dynamic metrics, mobile docs navigation, rework IAM UI, add JSON auto-indent to policy editors 2025-12-29 17:37:56 +08:00
1cacb80dd6 Fix replication pause, multipart cache, and select all with virtual scroll 2025-12-29 14:46:06 +08:00
e89bbb62dc Fix pausing replication and resuming replication does not continue the replication for the remaining pending objects; Improve Documentation 2025-12-29 14:05:17 +08:00
c8eb3de629 Fix issues -- Bug fixes:
- Fix duplicate _legacy_version_dir check in storage.py
      - Fix max_size_bytes -> max_bytes param in quota handler
      - Move base64 import to module level in s3_api.py
      - Add retry logic and atomic file ops to multipart upload
      - Add shutdown() method to ReplicationManager

      Performance:
      - Add LRU eviction with OrderedDict to object cache
      - Add cache version tracking for stale read detection
      - Add streaming uploads for large files (>10 MiB) in replication
      - Create _find_element() XML parsing helpers

      Security:
      - Gate SigV4 debug logging behind DEBUG_SIGV4 config
2025-12-29 12:46:23 +08:00
9165e365e6 Comment cleanup 2025-12-23 13:57:13 +08:00
01e26754e8 Add option to display custom timezone; Fix timezone inconsistencies 2025-12-23 13:48:02 +08:00
b592fa9fdb Fixed replication issue - clean up debug 2025-12-23 13:37:51 +08:00
cd9734b398 Debug replication corruption issue - Fix attempt 2025-12-23 13:24:05 +08:00
90893cac27 Debug replication corruption issue - check if it's boto3 issue 2025-12-23 12:02:26 +08:00
6e659902bd Add header debugging for replication issue 2025-12-23 11:55:47 +08:00
39a707ecbc Add additional debugging for replication issue 2025-12-23 11:49:51 +08:00
4199f8e6c7 Add debugging for replication issue 2025-12-23 11:43:29 +08:00
adc6770273 Improve object browser search filter; Test: Fix replication GIF issue 2025-12-23 11:31:32 +08:00
f5451c162b Improve object storage performance via caching 2025-12-22 17:03:33 +08:00
aab9ef696a Fix race condition in replication 2025-12-22 14:14:04 +08:00
be48f59452 Improve UI bucket replication and policy 2025-12-22 13:34:24 +08:00
86c04f85f6 Fix bucket object browser nested object action button; Improve UX 2025-12-22 13:17:27 +08:00
992d9eccd9 Update docs 2025-12-22 11:09:29 +08:00
40f3192c5c Add fallback button for object loading 2025-12-22 10:46:32 +08:00
2498b950f6 Update requirements.txt 2025-12-22 10:40:05 +08:00
97435f15e5 Revamp object bucket browser logic; Add new tests 2025-12-22 10:04:36 +08:00
97860669ec Fix presigned URL not generating for nested objects 2025-12-21 14:22:00 +08:00
4a5dd76286 Update installation and uninstallation scripts 2025-12-21 14:00:31 +08:00
d2dc293722 Fix inconsistency in config files 2025-12-21 13:17:33 +08:00
563bb8fa6a Fix incorrect STORAGE_ROOT setup; Add installation scripts 2025-12-13 22:26:43 +08:00
5ccf53b688 Add app uptime and version status in Metrics dashboard 2025-12-13 16:18:38 +08:00
4d4256830a Update docs; Remove unnecessary hardcoded metrics details 2025-12-13 15:57:13 +08:00
137e3b7b68 Configure CORS default settings 2025-12-13 15:33:40 +08:00
114e684cb8 Add logging to file missing 2025-12-03 12:11:42 +08:00
5d161c1d92 Fix presigned URL encoding issue 2025-12-03 12:08:02 +08:00
f160827b41 Update requirements.txt to the latest versions 2025-12-03 11:53:25 +08:00
9368715b16 Add bucket quota; Versioned objects now count towards the object storage and size count usage 2025-12-03 11:48:08 +08:00
453ac6ea30 Fix SSE, KMS not encrypting files 2025-12-03 10:03:29 +08:00
804f46d11e Update docs on new SSE, KMS encryptions 2025-12-01 17:49:35 +08:00
766dbb18be Add new SSE, KMS encryptions 2025-12-01 00:46:12 +08:00
590a39ca80 Update IAM controls and add new S3 actions 2025-11-30 23:58:21 +08:00
53326f4e41 Improve bucket details UI layout 2025-11-30 21:19:35 +08:00
6a31a9082e Fix IAM caching issue 2025-11-26 12:52:53 +08:00
aaa230b19b Test fix multipart failing upload 2025-11-25 23:56:38 +08:00
86138636db Improve and standardized error handling 2025-11-25 23:33:01 +08:00
b2f4d1b5db UI overhaul; Replication and S3 API improvements 2025-11-25 14:42:33 +08:00
cee28c9f81 Set CPU call to 0.1 interval to account for other environments for CPU usage calculation 2025-11-23 23:45:09 +08:00
85ee5b9388 Add new metrics function 2025-11-23 23:33:53 +08:00
e6ee341b93 Undo:
Fix hardcoded localhost fallback
2025-11-22 23:28:00 +08:00
92cf8825cf Update docs 2025-11-22 23:18:16 +08:00
ef781ae0b1 Fix hardcoded localhost fallback 2025-11-22 23:03:50 +08:00
37d372c617 Add missing CreateMultipartUpload in API 2025-11-22 22:00:24 +08:00
a095616569 Prepare for binary release 2025-11-22 20:32:57 +08:00
dddab6dbbc Change logging method 2025-11-22 17:47:01 +08:00
015c9cb52d Cleanup setup 2025-11-22 17:41:23 +08:00
c8b1c33118 Switch gunicorn to waitress 2025-11-22 17:20:52 +08:00
ebef3dfa57 Second test for Server Header change 2025-11-22 16:16:10 +08:00
1116353d0f Update docker-entrypoint.sh 2025-11-22 16:10:15 +08:00
e4b92a32a1 Fix and test custom server header 2025-11-22 16:09:24 +08:00
57c40dcdcc Test server header 2025-11-22 15:51:43 +08:00
7d1735a59f Fix server headers 2025-11-22 15:20:42 +08:00
9064f9d60e Fix CSRF token issue on login 2025-11-22 15:13:33 +08:00
36c08b0ac1 Update dockerfile with gunicorn for prod 2025-11-22 15:06:17 +08:00
ec5d52f208 Improve and add two-way replication functionality; Update docs 2025-11-22 15:02:29 +08:00
96de6164d1 Replication fixes 2025-11-22 14:45:21 +08:00
8c00d7bd4b Enhance replication functionality 2025-11-22 14:32:28 +08:00
a32d9dbd77 Fix replication corruption issue 2025-11-22 14:13:41 +08:00
fe3eacd2be Debug replication corruption issue 2025-11-22 12:56:33 +08:00
471cf5a305 Debug replication corruption issue 2025-11-22 12:11:41 +08:00
840fd176d3 Add missing CSRF tokens 2025-11-21 23:16:45 +08:00
5350d04ba5 Add missing CSRF token in connections.html 2025-11-21 23:04:56 +08:00
f2daa8a8a3 Fix IAM credentials reset causing presigned URL to fail 2025-11-21 22:32:42 +08:00
e287b59645 Fix Dockerfile permission issues 2025-11-21 22:11:38 +08:00
155 changed files with 68830 additions and 3283 deletions

15
.dockerignore Normal file
View File

@@ -0,0 +1,15 @@
.git
.gitignore
.venv
__pycache__
*.pyc
*.pyo
*.pyd
.pytest_cache
.coverage
htmlcov
logs
data
tmp
myfsio_core/target
myfsio-engine/target

7
.gitignore vendored
View File

@@ -26,6 +26,13 @@ dist/
*.egg-info/
.eggs/
# Rust / maturin build artifacts
myfsio_core/target/
myfsio_core/Cargo.lock
# Rust engine build artifacts
myfsio-engine/target/
# Local runtime artifacts
logs/
*.log

View File

@@ -1,32 +1,50 @@
# syntax=docker/dockerfile:1.7
FROM python:3.11-slim
FROM python:3.14.3-slim
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1
WORKDIR /app
# Install build deps for any wheels that need compilation, then clean up
RUN apt-get update \
&& apt-get install -y --no-install-recommends build-essential \
&& apt-get install -y --no-install-recommends build-essential curl \
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal \
&& rm -rf /var/lib/apt/lists/*
ENV PATH="/root/.cargo/bin:${PATH}"
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# Drop privileges
RUN useradd -m -u 1000 myfsio \
RUN pip install --no-cache-dir maturin \
&& cd myfsio_core \
&& maturin build --release \
&& pip install target/wheels/*.whl \
&& cd ../myfsio-engine \
&& cargo build --release \
&& cp target/release/myfsio-server /usr/local/bin/myfsio-server \
&& cd .. \
&& rm -rf myfsio_core/target myfsio-engine/target \
&& pip uninstall -y maturin \
&& rustup self uninstall -y
RUN chmod +x docker-entrypoint.sh
RUN mkdir -p /app/data \
&& useradd -m -u 1000 myfsio \
&& chown -R myfsio:myfsio /app
USER myfsio
EXPOSE 5000 5100
ENV APP_HOST=0.0.0.0 \
FLASK_ENV=production \
FLASK_DEBUG=0
FLASK_DEBUG=0 \
ENGINE=rust
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD python -c "import requests; requests.get('http://localhost:5000/healthz', timeout=2)"
CMD python -c "import requests; requests.get('http://localhost:5000/myfsio/health', timeout=2)"
CMD ["python", "run.py", "--mode", "both"]
CMD ["./docker-entrypoint.sh"]

661
LICENSE Normal file
View File

@@ -0,0 +1,661 @@
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.
A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate. Many developers of free software are heartened and
encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.
The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.
An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals. This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU Affero General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Remote Network Interaction; Use with the GNU General Public License.
Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software. This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time. Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source. For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code. There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>.

302
README.md
View File

@@ -1,117 +1,255 @@
# MyFSIO (Flask S3 + IAM)
# MyFSIO
MyFSIO is a batteries-included, Flask-based recreation of Amazon S3 and IAM workflows built for local development. The design mirrors the [AWS S3 documentation](https://docs.aws.amazon.com/s3/) wherever practical: bucket naming, Signature Version 4 presigning, Version 2012-10-17 bucket policies, IAM-style users, and familiar REST endpoints.
A lightweight, S3-compatible object storage system built with Flask. MyFSIO implements core AWS S3 REST API operations with filesystem-backed storage, making it ideal for local development, testing, and self-hosted storage scenarios.
## Why MyFSIO?
## Features
- **Dual servers:** Run both the API (port 5000) and UI (port 5100) with a single command: `python run.py`.
- **IAM + access keys:** Users, access keys, key rotation, and bucket-scoped actions (`list/read/write/delete/policy`) now live in `data/.myfsio.sys/config/iam.json` and are editable from the IAM dashboard.
- **Bucket policies + hot reload:** `data/.myfsio.sys/config/bucket_policies.json` uses AWS' policy grammar (Version `2012-10-17`) with a built-in watcher, so editing the JSON file applies immediately. The UI also ships Public/Private/Custom presets for faster edits.
- **Presigned URLs everywhere:** Signature Version 4 presigned URLs respect IAM + bucket policies and replace the now-removed "share link" feature for public access scenarios.
- **Modern UI:** Responsive tables, quick filters, preview sidebar, object-level delete buttons, a presign modal, and an inline JSON policy editor that respects dark mode keep bucket management friendly.
- **Tests & health:** `/healthz` for smoke checks and `pytest` coverage for IAM, CRUD, presign, and policy flows.
**Core Storage**
- S3-compatible REST API with AWS Signature Version 4 authentication
- Bucket and object CRUD operations
- Object versioning with version history
- Multipart uploads for large files
- Presigned URLs (1 second to 7 days validity)
## Architecture at a Glance
**Security & Access Control**
- IAM users with access key management and rotation
- Bucket policies (AWS Policy Version 2012-10-17)
- Server-side encryption (SSE-S3 and SSE-KMS)
- Built-in Key Management Service (KMS)
- Rate limiting per endpoint
**Advanced Features**
- Cross-bucket replication to remote S3-compatible endpoints
- Hot-reload for bucket policies (no restart required)
- CORS configuration per bucket
**Management UI**
- Web console for bucket and object management
- IAM dashboard for user administration
- Inline JSON policy editor with presets
- Object browser with folder navigation and bulk operations
- Dark mode support
## Architecture
```
+-----------------+ +----------------+
| API Server |<----->| Object storage |
| (port 5000) | | (filesystem) |
| - S3 routes | +----------------+
| - Presigned URLs |
| - Bucket policy |
+-----------------+
^
+------------------+ +------------------+
| API Server | | UI Server |
| (port 5000) | | (port 5100) |
| | | |
| - S3 REST API |<------->| - Web Console |
| - SigV4 Auth | | - IAM Dashboard |
| - Presign URLs | | - Bucket Editor |
+--------+---------+ +------------------+
|
+-----------------+
| UI Server |
| (port 5100) |
| - Auth console |
| - IAM dashboard|
| - Bucket editor|
+-----------------+
v
+------------------+ +------------------+
| Object Storage | | System Metadata |
| (filesystem) | | (.myfsio.sys/) |
| | | |
| data/<bucket>/ | | - IAM config |
| <objects> | | - Bucket policies|
| | | - Encryption keys|
+------------------+ +------------------+
```
Both apps load the same configuration via `AppConfig` so IAM data and bucket policies stay consistent no matter which process you run.
Bucket policies are automatically reloaded whenever `bucket_policies.json` changes—no restarts required.
## Getting Started
## Quick Start
```bash
# Clone and setup
git clone https://gitea.jzwsite.com/kqjy/MyFSIO
cd MyFSIO
python -m venv .venv
. .venv/Scripts/activate # PowerShell: .\.venv\Scripts\Activate.ps1
# Activate virtual environment
# Windows PowerShell:
.\.venv\Scripts\Activate.ps1
# Windows CMD:
.venv\Scripts\activate.bat
# Linux/macOS:
source .venv/bin/activate
# Install dependencies
pip install -r requirements.txt
# Run both API and UI (default)
# (Optional) Build Rust native extension for better performance
# Requires Rust toolchain: https://rustup.rs
pip install maturin
cd myfsio_core && maturin develop --release && cd ..
# Start both servers
python run.py
# Or run individually:
# python run.py --mode api
# python run.py --mode ui
# Or start individually
python run.py --mode api # API only (port 5000)
python run.py --mode ui # UI only (port 5100)
```
Visit `http://127.0.0.1:5100/ui` for the console and `http://127.0.0.1:5000/` for the raw API. Override ports/hosts with the environment variables listed below.
**Credentials:** Generated automatically on first run and printed to the console. If missed, check the IAM config file at `<STORAGE_ROOT>/.myfsio.sys/config/iam.json`.
## IAM, Access Keys, and Bucket Policies
- First run creates `data/.myfsio.sys/config/iam.json` with `localadmin / localadmin` (full control). Sign in via the UI, then use the **IAM** tab to create users, rotate secrets, or edit inline policies without touching JSON by hand.
- Bucket policies live in `data/.myfsio.sys/config/bucket_policies.json` and follow the AWS `arn:aws:s3:::bucket/key` resource syntax with Version `2012-10-17`. Attach/replace/remove policies from the bucket detail page or edit the JSON by hand—changes hot reload automatically.
- IAM actions include extended verbs (`iam:list_users`, `iam:create_user`, `iam:update_policy`, etc.) so you can control who is allowed to manage other users and policies.
### Bucket Policy Presets & Hot Reload
- **Presets:** Every bucket detail view includes Public (read-only), Private (detach policy), and Custom presets. Public auto-populates a policy that grants anonymous `s3:ListBucket` + `s3:GetObject` access to the entire bucket.
- **Custom drafts:** Switching back to Custom restores your last manual edit so you can toggle between presets without losing work.
- **Hot reload:** The server watches `bucket_policies.json` and reloads statements on-the-fly—ideal for editing policies in your favorite editor while testing via curl or the UI.
## Presigned URLs
Presigned URLs follow the AWS CLI playbook:
- Call `POST /presign/<bucket>/<key>` (or use the "Presign" button in the UI) to request a Signature Version 4 URL valid for 1 second to 7 days.
- The generated URL honors IAM permissions and bucket-policy decisions at generation-time and again when somebody fetches it.
- Because presigned URLs cover both authenticated and public sharing scenarios, the legacy "share link" feature has been removed.
- **Web Console:** http://127.0.0.1:5100/ui
- **API Endpoint:** http://127.0.0.1:5000
## Configuration
| Variable | Default | Description |
| --- | --- | --- |
| `STORAGE_ROOT` | `<project>/data` | Filesystem root for bucket directories |
| `MAX_UPLOAD_SIZE` | `1073741824` | Maximum upload size (bytes) |
| `UI_PAGE_SIZE` | `100` | `MaxKeys` hint for listings |
| `SECRET_KEY` | `dev-secret-key` | Flask session secret for the UI |
| `IAM_CONFIG` | `<project>/data/.myfsio.sys/config/iam.json` | IAM user + policy store |
| `BUCKET_POLICY_PATH` | `<project>/data/.myfsio.sys/config/bucket_policies.json` | Bucket policy store |
| `API_BASE_URL` | `http://127.0.0.1:5000` | Used by the UI when calling API endpoints (presign, bucket policy) |
| `AWS_REGION` | `us-east-1` | Region used in Signature V4 scope |
| `AWS_SERVICE` | `s3` | Service used in Signature V4 scope |
|----------|---------|-------------|
| `STORAGE_ROOT` | `./data` | Filesystem root for bucket storage |
| `IAM_CONFIG` | `.myfsio.sys/config/iam.json` | IAM user and policy store |
| `BUCKET_POLICY_PATH` | `.myfsio.sys/config/bucket_policies.json` | Bucket policy store |
| `API_BASE_URL` | `http://127.0.0.1:5000` | API endpoint for UI calls |
| `MAX_UPLOAD_SIZE` | `1073741824` | Maximum upload size in bytes (1 GB) |
| `MULTIPART_MIN_PART_SIZE` | `5242880` | Minimum multipart part size (5 MB) |
| `UI_PAGE_SIZE` | `100` | Default page size for listings |
| `SECRET_KEY` | `dev-secret-key` | Flask session secret |
| `AWS_REGION` | `us-east-1` | Region for SigV4 signing |
| `AWS_SERVICE` | `s3` | Service name for SigV4 signing |
| `ENCRYPTION_ENABLED` | `false` | Enable server-side encryption |
| `KMS_ENABLED` | `false` | Enable Key Management Service |
| `LOG_LEVEL` | `INFO` | Logging verbosity |
| `SIGV4_TIMESTAMP_TOLERANCE_SECONDS` | `900` | Max time skew for SigV4 requests |
| `PRESIGNED_URL_MAX_EXPIRY_SECONDS` | `604800` | Max presigned URL expiry (7 days) |
| `REPLICATION_CONNECT_TIMEOUT_SECONDS` | `5` | Replication connection timeout |
| `SITE_SYNC_ENABLED` | `false` | Enable bi-directional site sync |
| `OBJECT_TAG_LIMIT` | `50` | Maximum tags per object |
> Buckets now live directly under `data/` while system metadata (versions, IAM, bucket policies, multipart uploads, etc.) lives in `data/.myfsio.sys`. Existing installs can keep their environment variables, but the defaults now match MinIO's `data/.system` pattern for easier bind-mounting.
## API Cheatsheet (IAM headers required)
## Data Layout
```
GET / -> List buckets (XML)
PUT /<bucket> -> Create bucket
DELETE /<bucket> -> Delete bucket (must be empty)
GET /<bucket> -> List objects (XML)
PUT /<bucket>/<key> -> Upload object (binary stream)
GET /<bucket>/<key> -> Download object
DELETE /<bucket>/<key> -> Delete object
POST /presign/<bucket>/<key> -> Generate AWS SigV4 presigned URL (JSON)
GET /bucket-policy/<bucket> -> Fetch bucket policy (JSON)
PUT /bucket-policy/<bucket> -> Attach/replace bucket policy (JSON)
DELETE /bucket-policy/<bucket> -> Remove bucket policy
data/
├── <bucket>/ # User buckets with objects
└── .myfsio.sys/ # System metadata
├── config/
│ ├── iam.json # IAM users and policies
│ ├── bucket_policies.json # Bucket policies
│ ├── replication_rules.json
│ └── connections.json # Remote S3 connections
├── buckets/<bucket>/
│ ├── meta/ # Object metadata (.meta.json)
│ ├── versions/ # Archived object versions
│ └── .bucket.json # Bucket config (versioning, CORS)
├── multipart/ # Active multipart uploads
└── keys/ # Encryption keys (SSE-S3/KMS)
```
## API Reference
All endpoints require AWS Signature Version 4 authentication unless using presigned URLs or public bucket policies.
### Bucket Operations
| Method | Endpoint | Description |
|--------|----------|-------------|
| `GET` | `/` | List all buckets |
| `PUT` | `/<bucket>` | Create bucket |
| `DELETE` | `/<bucket>` | Delete bucket (must be empty) |
| `HEAD` | `/<bucket>` | Check bucket exists |
### Object Operations
| Method | Endpoint | Description |
|--------|----------|-------------|
| `GET` | `/<bucket>` | List objects (supports `list-type=2`) |
| `PUT` | `/<bucket>/<key>` | Upload object |
| `GET` | `/<bucket>/<key>` | Download object |
| `DELETE` | `/<bucket>/<key>` | Delete object |
| `HEAD` | `/<bucket>/<key>` | Get object metadata |
| `POST` | `/<bucket>/<key>?uploads` | Initiate multipart upload |
| `PUT` | `/<bucket>/<key>?partNumber=N&uploadId=X` | Upload part |
| `POST` | `/<bucket>/<key>?uploadId=X` | Complete multipart upload |
| `DELETE` | `/<bucket>/<key>?uploadId=X` | Abort multipart upload |
### Bucket Policies (S3-compatible)
| Method | Endpoint | Description |
|--------|----------|-------------|
| `GET` | `/<bucket>?policy` | Get bucket policy |
| `PUT` | `/<bucket>?policy` | Set bucket policy |
| `DELETE` | `/<bucket>?policy` | Delete bucket policy |
### Versioning
| Method | Endpoint | Description |
|--------|----------|-------------|
| `GET` | `/<bucket>/<key>?versionId=X` | Get specific version |
| `DELETE` | `/<bucket>/<key>?versionId=X` | Delete specific version |
| `GET` | `/<bucket>?versions` | List object versions |
### Health Check
| Method | Endpoint | Description |
|--------|----------|-------------|
| `GET` | `/myfsio/health` | Health check endpoint |
## IAM & Access Control
### Users and Access Keys
On first run, MyFSIO creates a default admin user (`localadmin`/`localadmin`). Use the IAM dashboard to:
- Create and delete users
- Generate and rotate access keys
- Attach inline policies to users
- Control IAM management permissions
### Bucket Policies
Bucket policies follow AWS policy grammar (Version `2012-10-17`) with support for:
- Principal-based access (`*` for anonymous, specific users)
- Action-based permissions (`s3:GetObject`, `s3:PutObject`, etc.)
- Resource patterns (`arn:aws:s3:::bucket/*`)
- Condition keys
**Policy Presets:**
- **Public:** Grants anonymous read access (`s3:GetObject`, `s3:ListBucket`)
- **Private:** Removes bucket policy (IAM-only access)
- **Custom:** Manual policy editing with draft preservation
Policies hot-reload when the JSON file changes.
## Server-Side Encryption
MyFSIO supports two encryption modes:
- **SSE-S3:** Server-managed keys with automatic key rotation
- **SSE-KMS:** Customer-managed keys via built-in KMS
Enable encryption with:
```bash
ENCRYPTION_ENABLED=true python run.py
```
## Cross-Bucket Replication
Replicate objects to remote S3-compatible endpoints:
1. Configure remote connections in the UI
2. Create replication rules specifying source/destination
3. Objects are automatically replicated on upload
## Docker
```bash
docker build -t myfsio .
docker run -p 5000:5000 -p 5100:5100 -v ./data:/app/data myfsio
```
## Testing
```bash
pytest -q
# Run all tests
pytest tests/ -v
# Run specific test file
pytest tests/test_api.py -v
# Run with coverage
pytest tests/ --cov=app --cov-report=html
```
## References
- [Amazon Simple Storage Service Documentation](https://docs.aws.amazon.com/s3/)
- [Signature Version 4 Signing Process](https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html)
- [Amazon S3 Bucket Policy Examples](https://docs.aws.amazon.com/AmazonS3/latest/userguide/example-bucket-policies.html)
- [Amazon S3 Documentation](https://docs.aws.amazon.com/s3/)
- [AWS Signature Version 4](https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html)
- [S3 Bucket Policy Examples](https://docs.aws.amazon.com/AmazonS3/latest/userguide/example-bucket-policies.html)

View File

@@ -1,27 +1,133 @@
"""Application factory for the mini S3-compatible object store."""
from __future__ import annotations
import html as html_module
import itertools
import logging
import mimetypes
import os
import shutil
import sys
import time
import uuid
from logging.handlers import RotatingFileHandler
from pathlib import Path
from datetime import timedelta
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional
from flask import Flask, g, has_request_context, redirect, render_template, request, url_for
from flask import Flask, Response, g, has_request_context, redirect, render_template, request, url_for
from flask_cors import CORS
from flask_wtf.csrf import CSRFError
from werkzeug.middleware.proxy_fix import ProxyFix
import io
from .access_logging import AccessLoggingService
from .operation_metrics import OperationMetricsCollector, classify_endpoint
from .compression import GzipMiddleware
from .acl import AclService
from .bucket_policies import BucketPolicyStore
from .config import AppConfig
from .connections import ConnectionStore
from .encryption import EncryptionManager
from .extensions import limiter, csrf
from .iam import IamService
from .kms import KMSManager
from .gc import GarbageCollector
from .integrity import IntegrityChecker
from .lifecycle import LifecycleManager
from .notifications import NotificationService
from .object_lock import ObjectLockService
from .replication import ReplicationManager
from .secret_store import EphemeralSecretStore
from .storage import ObjectStorage
from .site_registry import SiteRegistry, SiteInfo
from .storage import ObjectStorage, StorageError
from .version import get_version
from .website_domains import WebsiteDomainStore
_request_counter = itertools.count(1)
class _ChunkedTransferMiddleware:
def __init__(self, app):
self.app = app
def __call__(self, environ, start_response):
if environ.get("REQUEST_METHOD") not in ("PUT", "POST"):
return self.app(environ, start_response)
transfer_encoding = environ.get("HTTP_TRANSFER_ENCODING", "")
content_length = environ.get("CONTENT_LENGTH")
if "chunked" in transfer_encoding.lower():
if content_length:
del environ["HTTP_TRANSFER_ENCODING"]
else:
raw = environ.get("wsgi.input")
if raw:
try:
if hasattr(raw, "seek"):
raw.seek(0)
body = raw.read()
except Exception:
body = b""
if body:
environ["wsgi.input"] = io.BytesIO(body)
environ["CONTENT_LENGTH"] = str(len(body))
del environ["HTTP_TRANSFER_ENCODING"]
content_length = environ.get("CONTENT_LENGTH")
if not content_length or content_length == "0":
sha256 = environ.get("HTTP_X_AMZ_CONTENT_SHA256", "")
decoded_len = environ.get("HTTP_X_AMZ_DECODED_CONTENT_LENGTH", "")
content_encoding = environ.get("HTTP_CONTENT_ENCODING", "")
if ("STREAMING" in sha256.upper() or decoded_len
or "aws-chunked" in content_encoding.lower()):
raw = environ.get("wsgi.input")
if raw:
try:
if hasattr(raw, "seek"):
raw.seek(0)
body = raw.read()
except Exception:
body = b""
if body:
environ["wsgi.input"] = io.BytesIO(body)
environ["CONTENT_LENGTH"] = str(len(body))
raw = environ.get("wsgi.input")
if raw and hasattr(raw, "seek"):
try:
raw.seek(0)
except Exception:
pass
return self.app(environ, start_response)
def _migrate_config_file(active_path: Path, legacy_paths: List[Path]) -> Path:
"""Migrate config file from legacy locations to the active path.
Checks each legacy path in order and moves the first one found to the active path.
This ensures backward compatibility for users upgrading from older versions.
"""
active_path.parent.mkdir(parents=True, exist_ok=True)
if active_path.exists():
return active_path
for legacy_path in legacy_paths:
if legacy_path.exists():
try:
shutil.move(str(legacy_path), str(active_path))
except OSError:
shutil.copy2(legacy_path, active_path)
try:
legacy_path.unlink(missing_ok=True)
except OSError:
pass
break
return active_path
def create_app(
@@ -33,7 +139,11 @@ def create_app(
"""Create and configure the Flask application."""
config = AppConfig.from_env(test_config)
if getattr(sys, "frozen", False):
project_root = Path(sys._MEIPASS)
else:
project_root = Path(__file__).resolve().parent.parent
app = Flask(
__name__,
static_folder=str(project_root / "static"),
@@ -47,27 +157,158 @@ def create_app(
if app.config.get("TESTING"):
app.config.setdefault("WTF_CSRF_ENABLED", False)
# Trust X-Forwarded-* headers from proxies
num_proxies = app.config.get("NUM_TRUSTED_PROXIES", 1)
if num_proxies:
if "NUM_TRUSTED_PROXIES" not in os.environ:
logging.getLogger(__name__).warning(
"NUM_TRUSTED_PROXIES not set, defaulting to 1. "
"Set NUM_TRUSTED_PROXIES=0 if not behind a reverse proxy."
)
app.wsgi_app = ProxyFix(app.wsgi_app, x_for=num_proxies, x_proto=num_proxies, x_host=num_proxies, x_prefix=num_proxies)
if app.config.get("ENABLE_GZIP", True):
app.wsgi_app = GzipMiddleware(app.wsgi_app, compression_level=6)
app.wsgi_app = _ChunkedTransferMiddleware(app.wsgi_app)
_configure_cors(app)
_configure_logging(app)
limiter.init_app(app)
csrf.init_app(app)
storage = ObjectStorage(Path(app.config["STORAGE_ROOT"]))
storage = ObjectStorage(
Path(app.config["STORAGE_ROOT"]),
cache_ttl=app.config.get("OBJECT_CACHE_TTL", 60),
object_cache_max_size=app.config.get("OBJECT_CACHE_MAX_SIZE", 100),
bucket_config_cache_ttl=app.config.get("BUCKET_CONFIG_CACHE_TTL_SECONDS", 30.0),
object_key_max_length_bytes=app.config.get("OBJECT_KEY_MAX_LENGTH_BYTES", 1024),
meta_read_cache_max=app.config.get("META_READ_CACHE_MAX", 2048),
)
if app.config.get("WARM_CACHE_ON_STARTUP", True) and not app.config.get("TESTING"):
storage.warm_cache_async()
iam = IamService(
Path(app.config["IAM_CONFIG"]),
auth_max_attempts=app.config.get("AUTH_MAX_ATTEMPTS", 5),
auth_lockout_minutes=app.config.get("AUTH_LOCKOUT_MINUTES", 15),
encryption_key=app.config.get("SECRET_KEY"),
)
bucket_policies = BucketPolicyStore(Path(app.config["BUCKET_POLICY_PATH"]))
secret_store = EphemeralSecretStore(default_ttl=app.config.get("SECRET_TTL_SECONDS", 300))
# Initialize Replication components
connections_path = Path(app.config["STORAGE_ROOT"]) / ".connections.json"
replication_rules_path = Path(app.config["STORAGE_ROOT"]) / ".replication_rules.json"
storage_root = Path(app.config["STORAGE_ROOT"])
config_dir = storage_root / ".myfsio.sys" / "config"
config_dir.mkdir(parents=True, exist_ok=True)
connections_path = _migrate_config_file(
active_path=config_dir / "connections.json",
legacy_paths=[
storage_root / ".myfsio.sys" / "connections.json",
storage_root / ".connections.json",
],
)
replication_rules_path = _migrate_config_file(
active_path=config_dir / "replication_rules.json",
legacy_paths=[
storage_root / ".myfsio.sys" / "replication_rules.json",
storage_root / ".replication_rules.json",
],
)
connections = ConnectionStore(connections_path)
replication = ReplicationManager(storage, connections, replication_rules_path)
replication = ReplicationManager(
storage,
connections,
replication_rules_path,
storage_root,
connect_timeout=app.config.get("REPLICATION_CONNECT_TIMEOUT_SECONDS", 5),
read_timeout=app.config.get("REPLICATION_READ_TIMEOUT_SECONDS", 30),
max_retries=app.config.get("REPLICATION_MAX_RETRIES", 2),
streaming_threshold_bytes=app.config.get("REPLICATION_STREAMING_THRESHOLD_BYTES", 10 * 1024 * 1024),
max_failures_per_bucket=app.config.get("REPLICATION_MAX_FAILURES_PER_BUCKET", 50),
)
site_registry_path = config_dir / "site_registry.json"
site_registry = SiteRegistry(site_registry_path)
if app.config.get("SITE_ID") and not site_registry.get_local_site():
site_registry.set_local_site(SiteInfo(
site_id=app.config["SITE_ID"],
endpoint=app.config.get("SITE_ENDPOINT") or "",
region=app.config.get("SITE_REGION", "us-east-1"),
priority=app.config.get("SITE_PRIORITY", 100),
))
encryption_config = {
"encryption_enabled": app.config.get("ENCRYPTION_ENABLED", False),
"encryption_master_key_path": app.config.get("ENCRYPTION_MASTER_KEY_PATH"),
"default_encryption_algorithm": app.config.get("DEFAULT_ENCRYPTION_ALGORITHM", "AES256"),
"encryption_chunk_size_bytes": app.config.get("ENCRYPTION_CHUNK_SIZE_BYTES", 64 * 1024),
}
encryption_manager = EncryptionManager(encryption_config)
kms_manager = None
if app.config.get("KMS_ENABLED", False):
kms_keys_path = Path(app.config.get("KMS_KEYS_PATH", ""))
kms_master_key_path = Path(app.config.get("ENCRYPTION_MASTER_KEY_PATH", ""))
kms_manager = KMSManager(
kms_keys_path,
kms_master_key_path,
generate_data_key_min_bytes=app.config.get("KMS_GENERATE_DATA_KEY_MIN_BYTES", 1),
generate_data_key_max_bytes=app.config.get("KMS_GENERATE_DATA_KEY_MAX_BYTES", 1024),
)
encryption_manager.set_kms_provider(kms_manager)
if app.config.get("ENCRYPTION_ENABLED", False):
from .encrypted_storage import EncryptedObjectStorage
storage = EncryptedObjectStorage(storage, encryption_manager)
acl_service = AclService(storage_root)
object_lock_service = ObjectLockService(storage_root)
notification_service = NotificationService(
storage_root,
allow_internal_endpoints=app.config.get("ALLOW_INTERNAL_ENDPOINTS", False),
)
access_logging_service = AccessLoggingService(storage_root)
access_logging_service.set_storage(storage)
lifecycle_manager = None
if app.config.get("LIFECYCLE_ENABLED", False):
base_storage = storage.storage if hasattr(storage, 'storage') else storage
lifecycle_manager = LifecycleManager(
base_storage,
interval_seconds=app.config.get("LIFECYCLE_INTERVAL_SECONDS", 3600),
storage_root=storage_root,
max_history_per_bucket=app.config.get("LIFECYCLE_MAX_HISTORY_PER_BUCKET", 50),
)
lifecycle_manager.start()
gc_collector = None
if app.config.get("GC_ENABLED", False):
gc_collector = GarbageCollector(
storage_root=storage_root,
interval_hours=app.config.get("GC_INTERVAL_HOURS", 6.0),
temp_file_max_age_hours=app.config.get("GC_TEMP_FILE_MAX_AGE_HOURS", 24.0),
multipart_max_age_days=app.config.get("GC_MULTIPART_MAX_AGE_DAYS", 7),
lock_file_max_age_hours=app.config.get("GC_LOCK_FILE_MAX_AGE_HOURS", 1.0),
dry_run=app.config.get("GC_DRY_RUN", False),
io_throttle_ms=app.config.get("GC_IO_THROTTLE_MS", 10),
)
gc_collector.start()
integrity_checker = None
if app.config.get("INTEGRITY_ENABLED", False):
integrity_checker = IntegrityChecker(
storage_root=storage_root,
interval_hours=app.config.get("INTEGRITY_INTERVAL_HOURS", 24.0),
batch_size=app.config.get("INTEGRITY_BATCH_SIZE", 1000),
auto_heal=app.config.get("INTEGRITY_AUTO_HEAL", False),
dry_run=app.config.get("INTEGRITY_DRY_RUN", False),
io_throttle_ms=app.config.get("INTEGRITY_IO_THROTTLE_MS", 10),
)
integrity_checker.start()
app.extensions["object_storage"] = storage
app.extensions["iam"] = iam
@@ -76,14 +317,101 @@ def create_app(
app.extensions["limiter"] = limiter
app.extensions["connections"] = connections
app.extensions["replication"] = replication
app.extensions["encryption"] = encryption_manager
app.extensions["kms"] = kms_manager
app.extensions["acl"] = acl_service
app.extensions["lifecycle"] = lifecycle_manager
app.extensions["gc"] = gc_collector
app.extensions["integrity"] = integrity_checker
app.extensions["object_lock"] = object_lock_service
app.extensions["notifications"] = notification_service
app.extensions["access_logging"] = access_logging_service
app.extensions["site_registry"] = site_registry
website_domains_store = None
if app.config.get("WEBSITE_HOSTING_ENABLED", False):
website_domains_path = config_dir / "website_domains.json"
website_domains_store = WebsiteDomainStore(website_domains_path)
app.extensions["website_domains"] = website_domains_store
from .s3_client import S3ProxyClient
api_base = app.config.get("API_BASE_URL") or "http://127.0.0.1:5000"
app.extensions["s3_proxy"] = S3ProxyClient(
api_base_url=api_base,
region=app.config.get("AWS_REGION", "us-east-1"),
)
operation_metrics_collector = None
if app.config.get("OPERATION_METRICS_ENABLED", False):
operation_metrics_collector = OperationMetricsCollector(
storage_root,
interval_minutes=app.config.get("OPERATION_METRICS_INTERVAL_MINUTES", 5),
retention_hours=app.config.get("OPERATION_METRICS_RETENTION_HOURS", 24),
)
app.extensions["operation_metrics"] = operation_metrics_collector
system_metrics_collector = None
if app.config.get("METRICS_HISTORY_ENABLED", False):
from .system_metrics import SystemMetricsCollector
system_metrics_collector = SystemMetricsCollector(
storage_root,
interval_minutes=app.config.get("METRICS_HISTORY_INTERVAL_MINUTES", 5),
retention_hours=app.config.get("METRICS_HISTORY_RETENTION_HOURS", 24),
)
system_metrics_collector.set_storage(storage)
app.extensions["system_metrics"] = system_metrics_collector
site_sync_worker = None
if app.config.get("SITE_SYNC_ENABLED", False):
from .site_sync import SiteSyncWorker
site_sync_worker = SiteSyncWorker(
storage=storage,
connections=connections,
replication_manager=replication,
storage_root=storage_root,
interval_seconds=app.config.get("SITE_SYNC_INTERVAL_SECONDS", 60),
batch_size=app.config.get("SITE_SYNC_BATCH_SIZE", 100),
connect_timeout=app.config.get("SITE_SYNC_CONNECT_TIMEOUT_SECONDS", 10),
read_timeout=app.config.get("SITE_SYNC_READ_TIMEOUT_SECONDS", 120),
max_retries=app.config.get("SITE_SYNC_MAX_RETRIES", 2),
clock_skew_tolerance_seconds=app.config.get("SITE_SYNC_CLOCK_SKEW_TOLERANCE_SECONDS", 1.0),
)
site_sync_worker.start()
app.extensions["site_sync"] = site_sync_worker
@app.errorhandler(500)
def internal_error(error):
    """Render a 500 response as an HTML page for the UI or as an XML error body.

    The HTML template is used only when the UI is enabled, the client accepts
    HTML, and the path is ``/`` or starts with ``/ui``. Every other request
    gets an ``<Error>`` XML document with code ``InternalError``.
    """
    wants_html = request.accept_mimetypes.accept_html
    path = request.path or ""
    if include_ui and wants_html and (path.startswith("/ui") or path == "/"):
        return render_template('500.html'), 500
    # request.path is client-controlled; escape it so characters such as
    # "<" or "&" cannot break or inject markup into the XML document.
    safe_path = html_module.escape(path)
    error_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        '<Error>'
        '<Code>InternalError</Code>'
        '<Message>An internal server error occurred</Message>'
        f'<Resource>{safe_path}</Resource>'
        f'<RequestId>{getattr(g, "request_id", "-")}</RequestId>'
        '</Error>'
    )
    return error_xml, 500, {'Content-Type': 'application/xml'}
@app.errorhandler(CSRFError)
def handle_csrf_error(e):
    """Render a CSRF failure as an HTML page for the UI or as an XML error body.

    The HTML template is used only when the UI is enabled, the client accepts
    HTML, and the path is ``/`` or starts with ``/ui``. Every other request
    gets an ``<Error>`` XML document with code ``CSRFError`` and status 400.
    """
    wants_html = request.accept_mimetypes.accept_html
    path = request.path or ""
    if include_ui and wants_html and (path.startswith("/ui") or path == "/"):
        return render_template('csrf_error.html', reason=e.description), 400
    # Both values are interpolated into XML text nodes; escape them so that
    # "<", ">" or "&" cannot break or inject markup into the document.
    safe_message = html_module.escape(str(e.description))
    safe_path = html_module.escape(path)
    error_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        '<Error>'
        '<Code>CSRFError</Code>'
        f'<Message>{safe_message}</Message>'
        f'<Resource>{safe_path}</Resource>'
        f'<RequestId>{getattr(g, "request_id", "-")}</RequestId>'
        '</Error>'
    )
    return error_xml, 400, {'Content-Type': 'application/xml'}
@app.template_filter("filesizeformat")
def filesizeformat(value: int) -> str:
@@ -96,11 +424,62 @@ def create_app(
value /= 1024.0
return f"{value:.1f} PB"
@app.template_filter("timestamp_to_datetime")
def timestamp_to_datetime(value: float) -> str:
    """Format a Unix timestamp as a datetime string in the configured timezone.

    Returns ``"Never"`` for a falsy value and ``"Unknown"`` when the timestamp
    cannot be converted. When ``DISPLAY_TIMEZONE`` is set to something other
    than ``"UTC"`` and resolves to a known zone, the local time is returned
    without a suffix; otherwise the UTC time is returned with a `` UTC`` suffix.
    """
    from datetime import datetime, timezone as dt_timezone
    from zoneinfo import ZoneInfo
    if not value:
        return "Never"
    try:
        dt_utc = datetime.fromtimestamp(value, dt_timezone.utc)
        display_tz = app.config.get("DISPLAY_TIMEZONE", "UTC")
        if display_tz and display_tz != "UTC":
            try:
                tz = ZoneInfo(display_tz)
                dt_local = dt_utc.astimezone(tz)
                return dt_local.strftime("%Y-%m-%d %H:%M:%S")
            except (KeyError, ValueError):
                # Unknown or malformed zone name: fall back to UTC below.
                pass
        return dt_utc.strftime("%Y-%m-%d %H:%M:%S UTC")
    except (ValueError, OverflowError, OSError):
        # fromtimestamp raises OverflowError for out-of-range timestamps;
        # treat it like the other conversion failures instead of crashing
        # the template render.
        return "Unknown"
@app.template_filter("format_datetime")
def format_datetime_filter(dt, include_tz: bool = True) -> str:
    """Render a datetime as ``Mon DD, YYYY HH:MM`` in the configured timezone.

    A falsy ``dt`` yields an empty string. When ``DISPLAY_TIMEZONE`` names a
    zone other than ``"UTC"``, a naive ``dt`` is first tagged as UTC and the
    value is then converted to that zone; an unknown zone leaves ``dt`` as
    it was. With ``include_tz`` the zone abbreviation (``UTC`` when none is
    available) is appended in parentheses. If formatting fails, ``str(dt)``
    is returned.
    """
    from datetime import datetime, timezone as dt_timezone
    from zoneinfo import ZoneInfo
    if not dt:
        return ""
    try:
        zone_name = app.config.get("DISPLAY_TIMEZONE", "UTC")
        needs_conversion = bool(zone_name) and zone_name != "UTC"
        if needs_conversion:
            try:
                target_zone = ZoneInfo(zone_name)
                if dt.tzinfo is None:
                    dt = dt.replace(tzinfo=dt_timezone.utc)
                dt = dt.astimezone(target_zone)
            except (KeyError, ValueError):
                pass
        tz_abbr = dt.strftime("%Z") or "UTC"
        if not include_tz:
            return dt.strftime("%b %d, %Y %H:%M")
        return f"{dt.strftime('%b %d, %Y %H:%M')} ({tz_abbr})"
    except (ValueError, AttributeError):
        return str(dt)
if include_api:
from .s3_api import s3_api_bp
from .kms_api import kms_api_bp
from .admin_api import admin_api_bp
app.register_blueprint(s3_api_bp)
app.register_blueprint(kms_api_bp)
app.register_blueprint(admin_api_bp)
csrf.exempt(s3_api_bp)
csrf.exempt(kms_api_bp)
csrf.exempt(admin_api_bp)
if include_ui:
from .ui import ui_bp
@@ -120,9 +499,9 @@ def create_app(
return render_template("404.html"), 404
return error
@app.get("/healthz")
@app.get("/myfsio/health")
def healthcheck() -> Dict[str, str]:
return {"status": "ok", "version": app.config.get("APP_VERSION", "unknown")}
return {"status": "ok"}
return app
@@ -137,14 +516,12 @@ def create_ui_app(test_config: Optional[Dict[str, Any]] = None) -> Flask:
def _configure_cors(app: Flask) -> None:
origins = app.config.get("CORS_ORIGINS", ["*"])
methods = app.config.get("CORS_METHODS", ["GET", "PUT", "POST", "DELETE", "OPTIONS"])
allow_headers = app.config.get(
"CORS_ALLOW_HEADERS",
["Content-Type", "X-Access-Key", "X-Secret-Key", "X-Amz-Date", "X-Amz-SignedHeaders"],
)
methods = app.config.get("CORS_METHODS", ["GET", "PUT", "POST", "DELETE", "OPTIONS", "HEAD"])
allow_headers = app.config.get("CORS_ALLOW_HEADERS", ["*"])
expose_headers = app.config.get("CORS_EXPOSE_HEADERS", ["*"])
CORS(
app,
resources={r"/*": {"origins": origins, "methods": methods, "allow_headers": allow_headers}},
resources={r"/*": {"origins": origins, "methods": methods, "allow_headers": allow_headers, "expose_headers": expose_headers}},
supports_credentials=True,
)
@@ -152,7 +529,7 @@ def _configure_cors(app: Flask) -> None:
class _RequestContextFilter(logging.Filter):
"""Inject request-specific attributes into log records."""
def filter(self, record: logging.LogRecord) -> bool: # pragma: no cover - simple boilerplate
def filter(self, record: logging.LogRecord) -> bool:
if has_request_context():
record.request_id = getattr(g, "request_id", "-")
record.path = request.path
@@ -167,41 +544,195 @@ class _RequestContextFilter(logging.Filter):
def _configure_logging(app: Flask) -> None:
formatter = logging.Formatter(
"%(asctime)s | %(levelname)s | %(request_id)s | %(method)s %(path)s | %(message)s"
)
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setFormatter(formatter)
stream_handler.addFilter(_RequestContextFilter())
logger = app.logger
for handler in logger.handlers[:]:
handler.close()
logger.handlers.clear()
logger.addHandler(stream_handler)
if app.config.get("LOG_TO_FILE"):
log_file = Path(app.config["LOG_FILE"])
log_file.parent.mkdir(parents=True, exist_ok=True)
handler = RotatingFileHandler(
file_handler = RotatingFileHandler(
log_file,
maxBytes=int(app.config.get("LOG_MAX_BYTES", 5 * 1024 * 1024)),
backupCount=int(app.config.get("LOG_BACKUP_COUNT", 3)),
encoding="utf-8",
)
formatter = logging.Formatter(
"%(asctime)s | %(levelname)s | %(request_id)s | %(method)s %(path)s | %(message)s"
)
handler.setFormatter(formatter)
handler.addFilter(_RequestContextFilter())
file_handler.setFormatter(formatter)
file_handler.addFilter(_RequestContextFilter())
logger.addHandler(file_handler)
logger = app.logger
logger.handlers.clear()
logger.addHandler(handler)
logger.setLevel(getattr(logging, app.config.get("LOG_LEVEL", "INFO"), logging.INFO))
@app.before_request
def _log_request_start() -> None:
g.request_id = uuid.uuid4().hex
g.request_id = f"{os.getpid():x}{next(_request_counter):012x}"
g.request_started_at = time.perf_counter()
app.logger.info(
"Request started",
extra={"path": request.path, "method": request.method, "remote_addr": request.remote_addr},
)
g.request_bytes_in = request.content_length or 0
@app.before_request
def _maybe_serve_website():
if not app.config.get("WEBSITE_HOSTING_ENABLED"):
return None
if request.method not in {"GET", "HEAD"}:
return None
host = request.host
if ":" in host:
host = host.rsplit(":", 1)[0]
host = host.lower()
store = app.extensions.get("website_domains")
if not store:
return None
bucket = store.get_bucket(host)
if not bucket:
return None
storage = app.extensions["object_storage"]
if not storage.bucket_exists(bucket):
return _website_error_response(404, "Not Found")
website_config = storage.get_bucket_website(bucket)
if not website_config:
return _website_error_response(404, "Not Found")
index_doc = website_config.get("index_document", "index.html")
error_doc = website_config.get("error_document")
req_path = request.path.lstrip("/")
if not req_path or req_path.endswith("/"):
object_key = req_path + index_doc
else:
object_key = req_path
try:
obj_path = storage.get_object_path(bucket, object_key)
except (StorageError, OSError):
if object_key == req_path:
try:
obj_path = storage.get_object_path(bucket, req_path + "/" + index_doc)
object_key = req_path + "/" + index_doc
except (StorageError, OSError):
return _serve_website_error(storage, bucket, error_doc, 404)
else:
return _serve_website_error(storage, bucket, error_doc, 404)
content_type = mimetypes.guess_type(object_key)[0] or "application/octet-stream"
is_encrypted = False
try:
metadata = storage.get_object_metadata(bucket, object_key)
is_encrypted = "x-amz-server-side-encryption" in metadata
except (StorageError, OSError):
pass
if is_encrypted and hasattr(storage, "get_object_data"):
try:
data, _ = storage.get_object_data(bucket, object_key)
file_size = len(data)
except (StorageError, OSError):
return _website_error_response(500, "Internal Server Error")
else:
data = None
try:
stat = obj_path.stat()
file_size = stat.st_size
except OSError:
return _website_error_response(500, "Internal Server Error")
if request.method == "HEAD":
response = Response(status=200)
response.headers["Content-Length"] = file_size
response.headers["Content-Type"] = content_type
response.headers["Accept-Ranges"] = "bytes"
return response
from .s3_api import _parse_range_header
range_header = request.headers.get("Range")
if range_header:
ranges = _parse_range_header(range_header, file_size)
if ranges is None:
return Response(status=416, headers={"Content-Range": f"bytes */{file_size}"})
start, end = ranges[0]
length = end - start + 1
if data is not None:
partial_data = data[start:end + 1]
response = Response(partial_data, status=206, mimetype=content_type)
else:
def _stream_range(file_path, start_pos, length_to_read):
with file_path.open("rb") as f:
f.seek(start_pos)
remaining = length_to_read
while remaining > 0:
chunk = f.read(min(262144, remaining))
if not chunk:
break
remaining -= len(chunk)
yield chunk
response = Response(_stream_range(obj_path, start, length), status=206, mimetype=content_type, direct_passthrough=True)
response.headers["Content-Range"] = f"bytes {start}-{end}/{file_size}"
response.headers["Content-Length"] = length
response.headers["Accept-Ranges"] = "bytes"
return response
if data is not None:
response = Response(data, mimetype=content_type)
response.headers["Content-Length"] = file_size
response.headers["Accept-Ranges"] = "bytes"
return response
def _stream(file_path):
with file_path.open("rb") as f:
while True:
chunk = f.read(65536)
if not chunk:
break
yield chunk
response = Response(_stream(obj_path), mimetype=content_type, direct_passthrough=True)
response.headers["Content-Length"] = file_size
response.headers["Accept-Ranges"] = "bytes"
return response
def _serve_website_error(storage, bucket, error_doc_key, status_code):
if not error_doc_key:
return _website_error_response(status_code, "Not Found" if status_code == 404 else "Error")
try:
obj_path = storage.get_object_path(bucket, error_doc_key)
except (StorageError, OSError):
return _website_error_response(status_code, "Not Found")
content_type = mimetypes.guess_type(error_doc_key)[0] or "text/html"
is_encrypted = False
try:
metadata = storage.get_object_metadata(bucket, error_doc_key)
is_encrypted = "x-amz-server-side-encryption" in metadata
except (StorageError, OSError):
pass
if is_encrypted and hasattr(storage, "get_object_data"):
try:
data, _ = storage.get_object_data(bucket, error_doc_key)
response = Response(data, status=status_code, mimetype=content_type)
response.headers["Content-Length"] = len(data)
return response
except (StorageError, OSError):
return _website_error_response(status_code, "Not Found")
try:
data = obj_path.read_bytes()
response = Response(data, status=status_code, mimetype=content_type)
response.headers["Content-Length"] = len(data)
return response
except OSError:
return _website_error_response(status_code, "Not Found")
def _website_error_response(status_code, message):
safe_msg = html_module.escape(str(message))
safe_code = html_module.escape(str(status_code))
body = f"<html><head><title>{safe_code} {safe_msg}</title></head><body><h1>{safe_code} {safe_msg}</h1></body></html>"
return Response(body, status=status_code, mimetype="text/html")
@app.after_request
def _log_request_end(response):
duration_ms = 0.0
if hasattr(g, "request_started_at"):
duration_ms = (time.perf_counter() - g.request_started_at) * 1000
request_id = getattr(g, "request_id", uuid.uuid4().hex)
request_id = getattr(g, "request_id", f"{os.getpid():x}{next(_request_counter):012x}")
response.headers.setdefault("X-Request-ID", request_id)
if app.logger.isEnabledFor(logging.INFO):
app.logger.info(
"Request completed",
extra={
@@ -211,5 +742,22 @@ def _configure_logging(app: Flask) -> None:
},
)
response.headers["X-Request-Duration-ms"] = f"{duration_ms:.2f}"
response.headers["Server"] = "MyFISO"
response.headers["Server"] = "MyFSIO"
operation_metrics = app.extensions.get("operation_metrics")
if operation_metrics:
bytes_in = getattr(g, "request_bytes_in", 0)
bytes_out = response.content_length or 0
error_code = getattr(g, "s3_error_code", None)
endpoint_type = classify_endpoint(request.path)
operation_metrics.record_request(
method=request.method,
endpoint_type=endpoint_type,
status_code=response.status_code,
latency_ms=duration_ms,
bytes_in=bytes_in,
bytes_out=bytes_out,
error_code=error_code,
)
return response

265
app/access_logging.py Normal file
View File

@@ -0,0 +1,265 @@
from __future__ import annotations
import io
import json
import logging
import queue
import threading
import time
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
def _sanitize_log_field(value: Any, quoted: bool = False) -> str:
    """Render ``value`` so it cannot break out of its slot in a log line.

    Control characters (CR and LF included) are replaced by ``\\xNN`` escapes,
    which stops a client-supplied value from starting a forged log record.
    When ``quoted`` is true the value will be wrapped in double quotes by the
    caller, so embedded double quotes are backslash-escaped as well.
    """
    pieces = []
    for ch in str(value):
        if ch < " " or ch == "\x7f":
            pieces.append(f"\\x{ord(ch):02x}")
        else:
            pieces.append(ch)
    text = "".join(pieces)
    if quoted:
        text = text.replace('"', '\\"')
    return text


@dataclass
class AccessLogEntry:
    """A single access-log record describing one request against a bucket."""

    bucket_owner: str = "-"
    bucket: str = "-"
    # Defaults to the moment the entry object is created, in UTC.
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    remote_ip: str = "-"
    requester: str = "-"
    # Defaults to 16 upper-case hex characters taken from a random UUID.
    request_id: str = field(default_factory=lambda: uuid.uuid4().hex[:16].upper())
    operation: str = "-"
    key: str = "-"
    request_uri: str = "-"
    http_status: int = 200
    error_code: str = "-"
    bytes_sent: int = 0
    object_size: int = 0
    total_time_ms: int = 0
    turn_around_time_ms: int = 0
    referrer: str = "-"
    user_agent: str = "-"
    version_id: str = "-"
    host_id: str = "-"
    signature_version: str = "SigV4"
    cipher_suite: str = "-"
    authentication_type: str = "AuthHeader"
    host_header: str = "-"
    tls_version: str = "-"

    def to_log_line(self) -> str:
        """Format the entry as one space-separated text line.

        Falsy numeric fields and an empty ``error_code`` are written as ``-``.
        String fields are passed through ``_sanitize_log_field`` because several
        of them (URI, key, referrer, user agent) originate from the client and
        could otherwise inject newlines or unbalanced quotes into the log.
        """
        clean = _sanitize_log_field
        time_str = self.timestamp.strftime("[%d/%b/%Y:%H:%M:%S %z]")
        return (
            f'{clean(self.bucket_owner)} {clean(self.bucket)} {time_str} {clean(self.remote_ip)} '
            f'{clean(self.requester)} {clean(self.request_id)} {clean(self.operation)} {clean(self.key)} '
            f'"{clean(self.request_uri, quoted=True)}" {self.http_status} {clean(self.error_code or "-")} '
            f'{self.bytes_sent or "-"} {self.object_size or "-"} {self.total_time_ms or "-"} '
            f'{self.turn_around_time_ms or "-"} "{clean(self.referrer, quoted=True)}" '
            f'"{clean(self.user_agent, quoted=True)}" {clean(self.version_id)}'
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly dict of the main fields (timestamp as ISO-8601)."""
        return {
            "bucket_owner": self.bucket_owner,
            "bucket": self.bucket,
            "timestamp": self.timestamp.isoformat(),
            "remote_ip": self.remote_ip,
            "requester": self.requester,
            "request_id": self.request_id,
            "operation": self.operation,
            "key": self.key,
            "request_uri": self.request_uri,
            "http_status": self.http_status,
            "error_code": self.error_code,
            "bytes_sent": self.bytes_sent,
            "object_size": self.object_size,
            "total_time_ms": self.total_time_ms,
            "referrer": self.referrer,
            "user_agent": self.user_agent,
            "version_id": self.version_id,
        }
@dataclass
class LoggingConfiguration:
    """Where a bucket's access logs are delivered."""

    target_bucket: str
    target_prefix: str = ""
    enabled: bool = True

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to the ``{"LoggingEnabled": {...}}`` shape written to disk."""
        return {
            "LoggingEnabled": {
                "TargetBucket": self.target_bucket,
                "TargetPrefix": self.target_prefix,
            }
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Optional["LoggingConfiguration"]:
        """Build a configuration from its serialized form.

        Returns ``None`` when ``LoggingEnabled`` is absent or empty, and also
        when the input has the wrong shape (for example a top-level list or
        ``"LoggingEnabled": true``) instead of raising ``AttributeError`` on
        hand-edited or corrupted files.
        """
        if not isinstance(data, dict):
            return None
        logging_enabled = data.get("LoggingEnabled")
        if not logging_enabled or not isinstance(logging_enabled, dict):
            return None
        return cls(
            target_bucket=logging_enabled.get("TargetBucket", ""),
            target_prefix=logging_enabled.get("TargetPrefix", ""),
            enabled=True,
        )
class AccessLoggingService:
    """Buffers per-bucket access-log entries and writes them out as objects.

    Entries are grouped by ``"<target bucket>:<target prefix>"`` and flushed
    either when a group reaches ``max_buffer_size`` or every ``flush_interval``
    seconds by a daemon thread started in the constructor.
    """

    def __init__(self, storage_root: Path, flush_interval: int = 60, max_buffer_size: int = 1000):
        self.storage_root = storage_root
        self.flush_interval = flush_interval
        self.max_buffer_size = max_buffer_size
        self._configs: Dict[str, LoggingConfiguration] = {}
        self._buffer: Dict[str, List[AccessLogEntry]] = {}
        self._buffer_lock = threading.Lock()
        self._shutdown = threading.Event()
        self._storage = None
        self._flush_thread = threading.Thread(target=self._flush_loop, name="access-log-flush", daemon=True)
        self._flush_thread.start()

    def set_storage(self, storage: Any) -> None:
        """Attach the storage backend whose ``put_object`` receives flushed logs."""
        self._storage = storage

    def _config_path(self, bucket_name: str) -> Path:
        """Location of the bucket's persisted logging configuration."""
        return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "logging.json"

    def get_bucket_logging(self, bucket_name: str) -> "Optional[LoggingConfiguration]":
        """Return the bucket's logging configuration, or ``None`` if unset/unreadable.

        Successful loads are cached in memory; misses are not cached.
        """
        cached = self._configs.get(bucket_name)
        if cached is not None:
            return cached
        config_path = self._config_path(bucket_name)
        if not config_path.exists():
            return None
        try:
            data = json.loads(config_path.read_text(encoding="utf-8"))
            config = LoggingConfiguration.from_dict(data)
            if config:
                self._configs[bucket_name] = config
            return config
        except (json.JSONDecodeError, OSError, AttributeError, TypeError) as e:
            # AttributeError/TypeError: syntactically valid JSON with the wrong
            # shape must not take down request handling.
            logger.warning("Failed to load logging config for %s: %s", bucket_name, e)
            return None

    def set_bucket_logging(self, bucket_name: str, config: "LoggingConfiguration") -> None:
        """Persist ``config`` for the bucket and refresh the in-memory cache."""
        config_path = self._config_path(bucket_name)
        config_path.parent.mkdir(parents=True, exist_ok=True)
        config_path.write_text(json.dumps(config.to_dict(), indent=2), encoding="utf-8")
        self._configs[bucket_name] = config

    def delete_bucket_logging(self, bucket_name: str) -> None:
        """Remove the bucket's logging configuration (best effort on disk)."""
        config_path = self._config_path(bucket_name)
        try:
            if config_path.exists():
                config_path.unlink()
        except OSError as e:
            # Still best effort, but leave a trace: if the file survives, the
            # next get_bucket_logging() call will load it again.
            logger.warning("Failed to remove logging config for %s: %s", bucket_name, e)
        self._configs.pop(bucket_name, None)

    def log_request(
        self,
        bucket_name: str,
        *,
        operation: str,
        key: str = "-",
        remote_ip: str = "-",
        requester: str = "-",
        request_uri: str = "-",
        http_status: int = 200,
        error_code: str = "",
        bytes_sent: int = 0,
        object_size: int = 0,
        total_time_ms: int = 0,
        referrer: str = "-",
        user_agent: str = "-",
        version_id: str = "-",
        request_id: str = "",
    ) -> None:
        """Buffer one access-log entry if logging is enabled for ``bucket_name``.

        Does nothing when the bucket has no (enabled) logging configuration.
        Triggers an immediate flush of the target's buffer once it holds at
        least ``max_buffer_size`` entries.
        """
        config = self.get_bucket_logging(bucket_name)
        if not config or not config.enabled:
            return

        entry = AccessLogEntry(
            bucket_owner="local-owner",
            bucket=bucket_name,
            remote_ip=remote_ip,
            requester=requester,
            request_id=request_id or uuid.uuid4().hex[:16].upper(),
            operation=operation,
            key=key,
            request_uri=request_uri,
            http_status=http_status,
            error_code=error_code,
            bytes_sent=bytes_sent,
            object_size=object_size,
            total_time_ms=total_time_ms,
            referrer=referrer,
            user_agent=user_agent,
            version_id=version_id,
        )

        target_key = f"{config.target_bucket}:{config.target_prefix}"
        with self._buffer_lock:
            pending = self._buffer.setdefault(target_key, [])
            pending.append(entry)
            should_flush = len(pending) >= self.max_buffer_size

        # Flush outside the lock; _flush_buffer takes the lock itself.
        if should_flush:
            self._flush_buffer(target_key)

    def _flush_loop(self) -> None:
        """Background loop: flush everything every ``flush_interval`` seconds until shutdown."""
        while not self._shutdown.is_set():
            self._shutdown.wait(timeout=self.flush_interval)
            if not self._shutdown.is_set():
                self._flush_all()

    def _flush_all(self) -> None:
        """Flush every target that currently has a buffer."""
        with self._buffer_lock:
            targets = list(self._buffer.keys())
        for target_key in targets:
            self._flush_buffer(target_key)

    def _flush_buffer(self, target_key: str) -> None:
        """Write one target's buffered entries as a single log object.

        Entries stay buffered when no storage backend is attached yet, and are
        put back at the front of the buffer when the write fails, so they are
        retried on the next flush.
        """
        storage = self._storage
        if not storage:
            # Check before popping: previously the entries were removed first
            # and then silently discarded when storage was not set.
            return
        with self._buffer_lock:
            entries = self._buffer.pop(target_key, [])
        if not entries:
            return

        try:
            bucket_name, prefix = target_key.split(":", 1)
        except ValueError:
            logger.error("Invalid target key: %s", target_key)
            return

        now = datetime.now(timezone.utc)
        log_key = f"{prefix}{now.strftime('%Y-%m-%d-%H-%M-%S')}-{uuid.uuid4().hex[:8]}"
        log_content = "\n".join(entry.to_log_line() for entry in entries) + "\n"

        try:
            stream = io.BytesIO(log_content.encode("utf-8"))
            storage.put_object(bucket_name, log_key, stream, enforce_quota=False)
            logger.info("Flushed %d access log entries to %s/%s", len(entries), bucket_name, log_key)
        except Exception as e:
            logger.error("Failed to write access log to %s/%s: %s", bucket_name, log_key, e)
            with self._buffer_lock:
                # Older entries go first so ordering is kept on retry.
                self._buffer[target_key] = entries + self._buffer.get(target_key, [])

    def flush(self) -> None:
        """Flush all buffered entries now."""
        self._flush_all()

    def shutdown(self) -> None:
        """Stop the background thread after a final flush (waits up to 5 s)."""
        self._shutdown.set()
        self._flush_all()
        self._flush_thread.join(timeout=5.0)

    def get_stats(self) -> Dict[str, Any]:
        """Return the number of buffered entries and of targets with a buffer."""
        with self._buffer_lock:
            buffered = sum(len(entries) for entries in self._buffer.values())
            return {
                "buffered_entries": buffered,
                "target_buckets": len(self._buffer),
            }

204
app/acl.py Normal file
View File

@@ -0,0 +1,204 @@
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
# Permission names that may appear in an ACL grant.
ACL_PERMISSION_FULL_CONTROL = "FULL_CONTROL"
ACL_PERMISSION_WRITE = "WRITE"
ACL_PERMISSION_WRITE_ACP = "WRITE_ACP"
ACL_PERMISSION_READ = "READ"
ACL_PERMISSION_READ_ACP = "READ_ACP"
# Set of every permission name defined above.
ALL_PERMISSIONS = {
ACL_PERMISSION_FULL_CONTROL,
ACL_PERMISSION_WRITE,
ACL_PERMISSION_WRITE_ACP,
ACL_PERMISSION_READ,
ACL_PERMISSION_READ_ACP,
}
# Maps each permission to the action names it allows; Acl.get_allowed_actions
# unions these sets for every grant that applies to the caller.
# Note that READ_ACP and WRITE_ACP both map to the single "share" action.
PERMISSION_TO_ACTIONS = {
ACL_PERMISSION_FULL_CONTROL: {"read", "write", "delete", "list", "share"},
ACL_PERMISSION_WRITE: {"write", "delete"},
ACL_PERMISSION_WRITE_ACP: {"share"},
ACL_PERMISSION_READ: {"read", "list"},
ACL_PERMISSION_READ_ACP: {"share"},
}
# Special grantee values: "*" matches every caller, "authenticated" matches
# callers for which is_authenticated is true.
GRANTEE_ALL_USERS = "*"
GRANTEE_AUTHENTICATED_USERS = "authenticated"
@dataclass
class AclGrant:
    """Pairs a grantee identifier with the permission granted to it."""

    grantee: str
    permission: str

    def to_dict(self) -> Dict[str, str]:
        """Serialize the grant as a two-key mapping."""
        serialized: Dict[str, str] = {}
        serialized["grantee"] = self.grantee
        serialized["permission"] = self.permission
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, str]) -> "AclGrant":
        """Rebuild a grant from ``to_dict`` output; both keys are required."""
        who = data["grantee"]
        what = data["permission"]
        return cls(who, what)
@dataclass
class Acl:
    """An owner plus the list of grants attached to a bucket or object."""

    owner: str
    grants: List[AclGrant] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the owner and every grant into plain dicts."""
        serialized_grants = []
        for grant in self.grants:
            serialized_grants.append(grant.to_dict())
        return {"owner": self.owner, "grants": serialized_grants}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Acl":
        """Rebuild an ACL; a missing owner becomes "" and missing grants an empty list."""
        owner = data.get("owner", "")
        parsed_grants = list(map(AclGrant.from_dict, data.get("grants", [])))
        return cls(owner=owner, grants=parsed_grants)

    def get_allowed_actions(self, principal_id: Optional[str], is_authenticated: bool = True) -> Set[str]:
        """Collect the action names this ACL allows for the given caller.

        The owner gets the FULL_CONTROL actions. A grant contributes its
        actions when its grantee is the all-users marker, the
        authenticated-users marker (and the caller is authenticated), or the
        caller's own id. Unknown permission names contribute nothing.
        """
        allowed: Set[str] = set()
        if principal_id and principal_id == self.owner:
            allowed |= PERMISSION_TO_ACTIONS[ACL_PERMISSION_FULL_CONTROL]
        for grant in self.grants:
            grantee = grant.grantee
            applies = (
                grantee == GRANTEE_ALL_USERS
                or (grantee == GRANTEE_AUTHENTICATED_USERS and is_authenticated)
                or bool(principal_id and grantee == principal_id)
            )
            if applies:
                allowed |= PERMISSION_TO_ACTIONS.get(grant.permission, set())
        return allowed
def _canned_acl_factory(*extra_grants):
    """Return a factory ``owner -> Acl`` for one canned ACL.

    The produced ACL always starts with a FULL_CONTROL grant for the owner,
    followed by one grant per ``(grantee, permission)`` pair in ``extra_grants``.
    """
    def build(owner):
        grants = [AclGrant(grantee=owner, permission=ACL_PERMISSION_FULL_CONTROL)]
        for grantee, permission in extra_grants:
            grants.append(AclGrant(grantee=grantee, permission=permission))
        return Acl(owner=owner, grants=grants)

    return build


# Canned ACL name -> factory taking the owner id and returning a fresh Acl.
# "bucket-owner-read" and "bucket-owner-full-control" grant nothing beyond the
# owner's own FULL_CONTROL.
CANNED_ACLS = {
    "private": _canned_acl_factory(),
    "public-read": _canned_acl_factory(
        (GRANTEE_ALL_USERS, ACL_PERMISSION_READ),
    ),
    "public-read-write": _canned_acl_factory(
        (GRANTEE_ALL_USERS, ACL_PERMISSION_READ),
        (GRANTEE_ALL_USERS, ACL_PERMISSION_WRITE),
    ),
    "authenticated-read": _canned_acl_factory(
        (GRANTEE_AUTHENTICATED_USERS, ACL_PERMISSION_READ),
    ),
    "bucket-owner-read": _canned_acl_factory(),
    "bucket-owner-full-control": _canned_acl_factory(),
}
def create_canned_acl(canned_acl: str, owner: str) -> Acl:
    """Build the ACL for a canned name; unknown names fall back to "private"."""
    builder = CANNED_ACLS.get(canned_acl)
    if builder:
        return builder(owner)
    return CANNED_ACLS["private"](owner)
class AclService:
    """Loads, stores and evaluates bucket ACLs on disk and object ACLs in metadata."""

    def __init__(self, storage_root: Path):
        self.storage_root = storage_root
        # Bucket name -> last ACL loaded or written through this service.
        self._bucket_acl_cache: Dict[str, Acl] = {}

    def _bucket_acl_path(self, bucket_name: str) -> Path:
        """Location of the bucket's persisted ACL file."""
        return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / ".acl.json"

    def get_bucket_acl(self, bucket_name: str) -> "Optional[Acl]":
        """Return the bucket's ACL, or ``None`` when absent, unreadable or malformed.

        Malformed covers both invalid JSON and valid JSON with the wrong shape
        (missing grant keys, non-dict values); those used to escape as
        ``KeyError``/``TypeError``/``AttributeError`` and fail the request,
        whereas ``get_object_acl`` already treated them as "no ACL".
        """
        cached = self._bucket_acl_cache.get(bucket_name)
        if cached is not None:
            return cached
        acl_path = self._bucket_acl_path(bucket_name)
        if not acl_path.exists():
            return None
        try:
            data = json.loads(acl_path.read_text(encoding="utf-8"))
            acl = Acl.from_dict(data)
        except (OSError, json.JSONDecodeError, KeyError, TypeError, AttributeError):
            return None
        self._bucket_acl_cache[bucket_name] = acl
        return acl

    def set_bucket_acl(self, bucket_name: str, acl: "Acl") -> None:
        """Write ``acl`` to the bucket's ACL file and cache it."""
        acl_path = self._bucket_acl_path(bucket_name)
        acl_path.parent.mkdir(parents=True, exist_ok=True)
        acl_path.write_text(json.dumps(acl.to_dict(), indent=2), encoding="utf-8")
        self._bucket_acl_cache[bucket_name] = acl

    def set_bucket_canned_acl(self, bucket_name: str, canned_acl: str, owner: str) -> "Acl":
        """Create the named canned ACL for ``owner``, store it and return it."""
        acl = create_canned_acl(canned_acl, owner)
        self.set_bucket_acl(bucket_name, acl)
        return acl

    def delete_bucket_acl(self, bucket_name: str) -> None:
        """Delete the bucket's ACL file (if any) and drop it from the cache."""
        # missing_ok avoids the exists()/unlink() race when two requests
        # delete the same ACL concurrently.
        self._bucket_acl_path(bucket_name).unlink(missing_ok=True)
        self._bucket_acl_cache.pop(bucket_name, None)

    def evaluate_bucket_acl(
        self,
        bucket_name: str,
        principal_id: Optional[str],
        action: str,
        is_authenticated: bool = True,
    ) -> bool:
        """Return whether the bucket ACL allows ``action``; ``False`` without an ACL."""
        acl = self.get_bucket_acl(bucket_name)
        if not acl:
            return False
        return action in acl.get_allowed_actions(principal_id, is_authenticated)

    def get_object_acl(self, bucket_name: str, object_key: str, object_metadata: Dict[str, Any]) -> "Optional[Acl]":
        """Extract the ACL stored under ``"__acl__"`` in the object metadata.

        ``bucket_name`` and ``object_key`` are accepted but not used here.
        Returns ``None`` when the entry is missing or malformed.
        """
        acl_data = object_metadata.get("__acl__")
        if not acl_data:
            return None
        try:
            return Acl.from_dict(acl_data)
        except (TypeError, KeyError, AttributeError):
            # AttributeError: the stored value is not a mapping (e.g. a string).
            return None

    def create_object_acl_metadata(self, acl: "Acl") -> Dict[str, Any]:
        """Wrap ``acl`` in the metadata fragment that ``get_object_acl`` reads."""
        return {"__acl__": acl.to_dict()}

    def evaluate_object_acl(
        self,
        object_metadata: Dict[str, Any],
        principal_id: Optional[str],
        action: str,
        is_authenticated: bool = True,
    ) -> bool:
        """Return whether the object's embedded ACL allows ``action``; ``False`` without one."""
        acl = self.get_object_acl("", "", object_metadata)
        if not acl:
            return False
        return action in acl.get_allowed_actions(principal_id, is_authenticated)

984
app/admin_api.py Normal file
View File

@@ -0,0 +1,984 @@
from __future__ import annotations
import ipaddress
import json
import logging
import re
import socket
import time
from typing import Any, Dict, Optional, Tuple
from urllib.parse import urlparse
import requests
from flask import Blueprint, Response, current_app, jsonify, request
from .connections import ConnectionStore
from .extensions import limiter
from .gc import GarbageCollector
from .integrity import IntegrityChecker
from .iam import IamError, Principal
from .replication import ReplicationManager
from .site_registry import PeerSite, SiteInfo, SiteRegistry
from .website_domains import WebsiteDomainStore, normalize_domain, is_valid_domain
def _is_safe_url(url: str, allow_internal: bool = False) -> bool:
"""Check if a URL is safe to make requests to (not internal/private).
Args:
url: The URL to check.
allow_internal: If True, allows internal/private IP addresses.
Use for self-hosted deployments on internal networks.
"""
try:
parsed = urlparse(url)
hostname = parsed.hostname
if not hostname:
return False
cloud_metadata_hosts = {
"metadata.google.internal",
"169.254.169.254",
}
if hostname.lower() in cloud_metadata_hosts:
return False
if allow_internal:
return True
blocked_hosts = {
"localhost",
"127.0.0.1",
"0.0.0.0",
"::1",
"[::1]",
}
if hostname.lower() in blocked_hosts:
return False
try:
resolved_ip = socket.gethostbyname(hostname)
ip = ipaddress.ip_address(resolved_ip)
if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
return False
except (socket.gaierror, ValueError):
return False
return True
except Exception:
return False
def _validate_endpoint(endpoint: str) -> Optional[str]:
"""Validate endpoint URL format. Returns error message or None."""
try:
parsed = urlparse(endpoint)
if not parsed.scheme or parsed.scheme not in ("http", "https"):
return "Endpoint must be http or https URL"
if not parsed.netloc:
return "Endpoint must have a host"
return None
except Exception:
return "Invalid endpoint URL"
def _validate_priority(priority: Any) -> Optional[str]:
"""Validate priority value. Returns error message or None."""
try:
p = int(priority)
if p < 0 or p > 1000:
return "Priority must be between 0 and 1000"
return None
except (TypeError, ValueError):
return "Priority must be an integer"
def _validate_region(region: str) -> Optional[str]:
"""Validate region format. Returns error message or None."""
if not re.match(r"^[a-z]{2,}-[a-z]+-\d+$", region):
return "Region must match format like us-east-1"
return None
def _validate_site_id(site_id: str) -> Optional[str]:
"""Validate site_id format. Returns error message or None."""
if not site_id or len(site_id) > 63:
return "site_id must be 1-63 characters"
if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9_-]*$', site_id):
return "site_id must start with alphanumeric and contain only alphanumeric, hyphens, underscores"
return None
# Module-level logger used by the admin handlers below.
logger = logging.getLogger(__name__)
# Blueprint holding every admin route; all of its URLs start with /admin.
admin_api_bp = Blueprint("admin_api", __name__, url_prefix="/admin")
def _require_principal() -> Tuple[Optional[Principal], Optional[Tuple[Dict[str, Any], int]]]:
    """Delegate authentication to the S3 API's ``_require_principal``.

    The import happens at call time rather than at module import.
    """
    from .s3_api import _require_principal as authenticate_request

    outcome = authenticate_request()
    return outcome
def _require_admin() -> Tuple[Optional[Principal], Optional[Tuple[Dict[str, Any], int]]]:
    """Authenticate the caller and require the ``iam:*`` permission.

    Returns ``(principal, None)`` on success, otherwise ``(None, error)`` where
    ``error`` is either the authentication error or a 403 AccessDenied payload.
    """
    principal, auth_error = _require_principal()
    if auth_error:
        return None, auth_error

    try:
        _iam().authorize(principal, None, "iam:*")
    except IamError:
        denied = _json_error("AccessDenied", "Admin access required", 403)
        return None, denied
    return principal, None
def _site_registry() -> SiteRegistry:
    """Fetch the object registered under ``"site_registry"`` in the app extensions."""
    extensions = current_app.extensions
    return extensions["site_registry"]
def _connections() -> ConnectionStore:
    """Fetch the object registered under ``"connections"`` in the app extensions."""
    extensions = current_app.extensions
    return extensions["connections"]
def _replication() -> ReplicationManager:
    """Fetch the object registered under ``"replication"`` in the app extensions."""
    extensions = current_app.extensions
    return extensions["replication"]
def _iam():
    """Fetch the object registered under ``"iam"`` in the app extensions."""
    extensions = current_app.extensions
    return extensions["iam"]
def _json_error(code: str, message: str, status: int) -> Tuple[Dict[str, Any], int]:
return {"error": {"code": code, "message": message}}, status
def _get_admin_rate_limit() -> str:
    """Read ``RATE_LIMIT_ADMIN`` from the app config, defaulting to "60 per minute"."""
    settings = current_app.config
    return settings.get("RATE_LIMIT_ADMIN", "60 per minute")
@admin_api_bp.route("/site", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def get_local_site():
    """Return the local site: the registry entry if present, else one built from config.

    Responds 404 when neither the registry nor ``SITE_ID`` in the app config
    describes a local site.
    """
    _principal, denied = _require_admin()
    if denied:
        return denied

    stored_site = _site_registry().get_local_site()
    if stored_site:
        return jsonify(stored_site.to_dict())

    settings = current_app.config
    env_site_id = settings.get("SITE_ID")
    env_endpoint = settings.get("SITE_ENDPOINT")
    if not env_site_id:
        return _json_error("NotFound", "Local site not configured", 404)

    # "source" marks this answer as coming from configuration, not the registry.
    return jsonify({
        "site_id": env_site_id,
        "endpoint": env_endpoint or "",
        "region": settings.get("SITE_REGION", "us-east-1"),
        "priority": settings.get("SITE_PRIORITY", 100),
        "display_name": env_site_id,
        "source": "environment",
    })
@admin_api_bp.route("/site", methods=["PUT"])
@limiter.limit(lambda: _get_admin_rate_limit())
def update_local_site():
    """Create or replace the local site record from the JSON request body.

    ``site_id`` is required; ``endpoint``, ``priority`` and ``region`` are
    validated when supplied. Omitted fields fall back to defaults (empty
    endpoint, ``us-east-1``, priority 100, display name = site id). The
    existing record's ``created_at`` is carried over.
    """
    principal, error = _require_admin()
    if error:
        return error

    payload = request.get_json(silent=True) or {}
    if not isinstance(payload, dict):
        # A JSON array/string/number body would otherwise crash on .get().
        return _json_error("ValidationError", "Request body must be a JSON object", 400)

    site_id = payload.get("site_id")
    endpoint = payload.get("endpoint")

    if not site_id:
        return _json_error("ValidationError", "site_id is required", 400)

    site_id_error = _validate_site_id(site_id)
    if site_id_error:
        return _json_error("ValidationError", site_id_error, 400)

    if endpoint:
        endpoint_error = _validate_endpoint(endpoint)
        if endpoint_error:
            return _json_error("ValidationError", endpoint_error, 400)

    priority = 100
    if "priority" in payload:
        priority_error = _validate_priority(payload["priority"])
        if priority_error:
            return _json_error("ValidationError", priority_error, 400)
        # Validation accepts numeric strings such as "5"; store a real int so
        # later numeric sorting by priority (see get_topology) cannot fail on
        # mixed str/int values.
        priority = int(payload["priority"])

    if "region" in payload:
        region_error = _validate_region(payload["region"])
        if region_error:
            return _json_error("ValidationError", region_error, 400)

    registry = _site_registry()
    existing = registry.get_local_site()

    site = SiteInfo(
        site_id=site_id,
        endpoint=endpoint or "",
        region=payload.get("region", "us-east-1"),
        priority=priority,
        display_name=payload.get("display_name", site_id),
        created_at=existing.created_at if existing else None,
    )

    registry.set_local_site(site)

    logger.info("Local site updated", extra={"site_id": site_id, "principal": principal.access_key})
    return jsonify(site.to_dict())
@admin_api_bp.route("/sites", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def list_all_sites():
    """Return the local site (or null) together with every registered peer."""
    _principal, denied = _require_admin()
    if denied:
        return denied

    registry = _site_registry()
    local_site = registry.get_local_site()
    peer_sites = registry.list_peers()

    serialized_peers = []
    for peer in peer_sites:
        serialized_peers.append(peer.to_dict())

    return jsonify({
        "local": local_site.to_dict() if local_site else None,
        "peers": serialized_peers,
        "total_peers": len(peer_sites),
    })
@admin_api_bp.route("/sites", methods=["POST"])
@limiter.limit(lambda: _get_admin_rate_limit())
def register_peer_site():
    """Register a new peer site described by the JSON request body.

    ``site_id`` and ``endpoint`` are required; ``region`` defaults to
    ``us-east-1`` and ``priority`` to 100, and both are validated. Responds
    409 if the peer already exists, 400 if a referenced connection is unknown,
    and 201 with the new peer on success.
    """
    principal, denied = _require_admin()
    if denied:
        return denied

    def invalid(message):
        # Every validation failure in this handler uses the same code/status.
        return _json_error("ValidationError", message, 400)

    body = request.get_json(silent=True) or {}
    site_id = body.get("site_id")
    endpoint = body.get("endpoint")

    # The checks run in a fixed order; the first failure is the one reported.
    if not site_id:
        return invalid("site_id is required")
    if (problem := _validate_site_id(site_id)):
        return invalid(problem)
    if not endpoint:
        return invalid("endpoint is required")
    if (problem := _validate_endpoint(endpoint)):
        return invalid(problem)

    region = body.get("region", "us-east-1")
    if (problem := _validate_region(region)):
        return invalid(problem)

    priority = body.get("priority", 100)
    if (problem := _validate_priority(priority)):
        return invalid(problem)

    registry = _site_registry()
    if registry.get_peer(site_id):
        return _json_error("AlreadyExists", f"Peer site '{site_id}' already exists", 409)

    connection_id = body.get("connection_id")
    if connection_id and not _connections().get(connection_id):
        return invalid(f"Connection '{connection_id}' not found")

    new_peer = PeerSite(
        site_id=site_id,
        endpoint=endpoint,
        region=region,
        priority=int(priority),
        display_name=body.get("display_name", site_id),
        connection_id=connection_id,
    )
    registry.add_peer(new_peer)

    logger.info("Peer site registered", extra={"site_id": site_id, "principal": principal.access_key})
    return jsonify(new_peer.to_dict()), 201
@admin_api_bp.route("/sites/<site_id>", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def get_peer_site(site_id: str):
    """Return one registered peer site, or 404 when the id is unknown."""
    _principal, denied = _require_admin()
    if denied:
        return denied

    found = _site_registry().get_peer(site_id)
    if found:
        return jsonify(found.to_dict())
    return _json_error("NotFound", f"Peer site '{site_id}' not found", 404)
@admin_api_bp.route("/sites/<site_id>", methods=["PUT"])
@limiter.limit(lambda: _get_admin_rate_limit())
def update_peer_site(site_id: str):
    """Update an existing peer site with the fields present in the JSON body.

    Only supplied fields are validated and changed; everything else, including
    ``created_at`` and the recorded health state, is copied from the existing
    record. Responds 404 when the peer does not exist.
    """
    principal, error = _require_admin()
    if error:
        return error

    registry = _site_registry()
    existing = registry.get_peer(site_id)

    if not existing:
        return _json_error("NotFound", f"Peer site '{site_id}' not found", 404)

    payload = request.get_json(silent=True) or {}
    if not isinstance(payload, dict):
        # A JSON array/string/number body would otherwise crash on the
        # membership tests and .get() calls below.
        return _json_error("ValidationError", "Request body must be a JSON object", 400)

    if "endpoint" in payload:
        endpoint_error = _validate_endpoint(payload["endpoint"])
        if endpoint_error:
            return _json_error("ValidationError", endpoint_error, 400)

    priority = existing.priority
    if "priority" in payload:
        priority_error = _validate_priority(payload["priority"])
        if priority_error:
            return _json_error("ValidationError", priority_error, 400)
        # Validation accepts numeric strings such as "5"; store a real int
        # (as register_peer_site does) so numeric sorting by priority in
        # get_topology cannot fail on mixed str/int values.
        priority = int(payload["priority"])

    if "region" in payload:
        region_error = _validate_region(payload["region"])
        if region_error:
            return _json_error("ValidationError", region_error, 400)

    if "connection_id" in payload:
        if payload["connection_id"] and not _connections().get(payload["connection_id"]):
            return _json_error("ValidationError", f"Connection '{payload['connection_id']}' not found", 400)

    peer = PeerSite(
        site_id=site_id,
        endpoint=payload.get("endpoint", existing.endpoint),
        region=payload.get("region", existing.region),
        priority=priority,
        display_name=payload.get("display_name", existing.display_name),
        connection_id=payload.get("connection_id", existing.connection_id),
        created_at=existing.created_at,
        is_healthy=existing.is_healthy,
        last_health_check=existing.last_health_check,
    )

    registry.update_peer(peer)

    logger.info("Peer site updated", extra={"site_id": site_id, "principal": principal.access_key})
    return jsonify(peer.to_dict())
@admin_api_bp.route("/sites/<site_id>", methods=["DELETE"])
@limiter.limit(lambda: _get_admin_rate_limit())
def delete_peer_site(site_id: str):
    """Remove a peer site; 204 on success, 404 when the registry reports no deletion."""
    principal, denied = _require_admin()
    if denied:
        return denied

    removed = _site_registry().delete_peer(site_id)
    if not removed:
        return _json_error("NotFound", f"Peer site '{site_id}' not found", 404)

    audit_fields = {"site_id": site_id, "principal": principal.access_key}
    logger.info("Peer site deleted", extra=audit_fields)
    return Response(status=204)
@admin_api_bp.route("/sites/<site_id>/health", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def check_peer_health(site_id: str):
    """Probe a peer through its configured connection and record the outcome.

    The registry's health flag is updated on every call, including when the
    peer has no usable connection (recorded as unhealthy). The response holds
    ``site_id``, ``is_healthy``, ``checked_at`` and, when the probe could not
    be attempted, an ``error`` message.
    """
    principal, error = _require_admin()
    if error:
        return error

    registry = _site_registry()
    peer = registry.get_peer(site_id)
    if not peer:
        return _json_error("NotFound", f"Peer site '{site_id}' not found", 404)

    is_healthy = False
    error_message = None
    if not peer.connection_id:
        error_message = "No connection configured for this peer"
    else:
        connection = _connections().get(peer.connection_id)
        if connection:
            is_healthy = _replication().check_endpoint_health(connection)
        else:
            error_message = f"Connection '{peer.connection_id}' not found"

    registry.update_health(site_id, is_healthy)

    result = {"site_id": site_id, "is_healthy": is_healthy, "checked_at": time.time()}
    if error_message:
        result["error"] = error_message
    return jsonify(result)
@admin_api_bp.route("/topology", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def get_topology():
    """Return the local site plus all peers, ordered by ascending priority.

    The local site (when configured) is always reported as healthy; peers
    report whatever their ``to_dict()`` exposes. Entries lacking a
    ``priority`` key sort as if it were 100.
    """
    principal, error = _require_admin()
    if error:
        return error

    registry = _site_registry()
    local = registry.get_local_site()
    peers = registry.list_peers()

    sites = []
    if local:
        sites.append(dict(local.to_dict(), is_local=True, is_healthy=True))
    sites.extend(dict(peer.to_dict(), is_local=False) for peer in peers)

    ordered = sorted(sites, key=lambda entry: entry.get("priority", 100))
    healthy = [entry for entry in ordered if entry.get("is_healthy")]
    return jsonify({
        "sites": ordered,
        "total": len(ordered),
        "healthy_count": len(healthy),
    })
@admin_api_bp.route("/sites/<site_id>/bidirectional-status", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def check_bidirectional_status(site_id: str):
    """Diagnose whether bidirectional replication with a peer is set up on both sides.

    Local configuration is inspected first (site identity, endpoint, connection,
    bidirectional rules, sync worker flag). If the peer's connection is healthy
    and its endpoint passes the URL safety check, the peer's ``/admin/sites``
    endpoint is queried to see whether it lists this site as a peer with a
    connection of its own.

    The JSON result collects every finding in ``issues`` (each with ``code``,
    ``message``, ``severity``). ``is_fully_configured`` is true only when no
    issue has severity ``error`` and at least one local bidirectional rule
    exists; responses returned early leave it false.
    """
    principal, error = _require_admin()
    if error:
        return error

    registry = _site_registry()
    peer = registry.get_peer(site_id)
    if not peer:
        return _json_error("NotFound", f"Peer site '{site_id}' not found", 404)

    local_site = registry.get_local_site()
    replication = _replication()
    local_bidir_rules = [
        {
            "bucket_name": rule.bucket_name,
            "target_bucket": rule.target_bucket,
            "enabled": rule.enabled,
        }
        for rule in replication.list_rules()
        if rule.target_connection_id == peer.connection_id and rule.mode == "bidirectional"
    ]

    result = {
        "site_id": site_id,
        "local_site_id": local_site.site_id if local_site else None,
        "local_endpoint": local_site.endpoint if local_site else None,
        "local_bidirectional_rules": local_bidir_rules,
        "local_site_sync_enabled": current_app.config.get("SITE_SYNC_ENABLED", False),
        "remote_status": None,
        "issues": [],
        "is_fully_configured": False,
    }

    def add_issue(code: str, message: str, severity: str) -> None:
        """Append one finding to the response."""
        result["issues"].append({"code": code, "message": message, "severity": severity})

    # --- Local configuration checks -------------------------------------
    if not local_site or not local_site.site_id:
        add_issue("NO_LOCAL_SITE_ID", "Local site identity not configured", "error")
    if not local_site or not local_site.endpoint:
        add_issue(
            "NO_LOCAL_ENDPOINT",
            "Local site endpoint not configured (remote site cannot reach back)",
            "error",
        )
    if not peer.connection_id:
        add_issue("NO_CONNECTION", "No connection configured for this peer", "error")
        return jsonify(result)

    connection = _connections().get(peer.connection_id)
    if not connection:
        add_issue("CONNECTION_NOT_FOUND", f"Connection '{peer.connection_id}' not found", "error")
        return jsonify(result)

    if not local_bidir_rules:
        add_issue(
            "NO_LOCAL_BIDIRECTIONAL_RULES",
            "No bidirectional replication rules configured on this site",
            "warning",
        )
    if not result["local_site_sync_enabled"]:
        add_issue(
            "SITE_SYNC_DISABLED",
            "Site sync worker is disabled (SITE_SYNC_ENABLED=false). Pull operations will not work.",
            "warning",
        )

    # --- Remote reachability --------------------------------------------
    if not replication.check_endpoint_health(connection):
        add_issue("REMOTE_UNREACHABLE", "Remote endpoint is not reachable", "error")
        return jsonify(result)

    allow_internal = current_app.config.get("ALLOW_INTERNAL_ENDPOINTS", False)
    if not _is_safe_url(peer.endpoint, allow_internal=allow_internal):
        add_issue(
            "ENDPOINT_NOT_ALLOWED",
            "Peer endpoint points to cloud metadata service (SSRF protection)",
            "error",
        )
        return jsonify(result)

    # --- Remote configuration check -------------------------------------
    try:
        admin_url = peer.endpoint.rstrip("/") + "/admin/sites"
        resp = requests.get(
            admin_url,
            timeout=10,
            # Only peer.endpoint was vetted above. Following a redirect would
            # let the peer steer this request (and the credential headers
            # below) to an unvetted host, so a 3xx is reported as an API error.
            allow_redirects=False,
            headers={
                "Accept": "application/json",
                "X-Access-Key": connection.access_key,
                "X-Secret-Key": connection.secret_key,
            },
        )
        if resp.status_code == 200:
            try:
                remote_data = resp.json()
                if not isinstance(remote_data, dict):
                    raise ValueError("Expected JSON object")
                remote_local = remote_data.get("local")
                if remote_local is not None and not isinstance(remote_local, dict):
                    raise ValueError("Expected 'local' to be an object")
                remote_peers = remote_data.get("peers", [])
                if not isinstance(remote_peers, list):
                    raise ValueError("Expected 'peers' to be a list")
            except (ValueError, json.JSONDecodeError) as e:
                logger.warning("Invalid JSON from remote admin API: %s", e)
                result["remote_status"] = {"reachable": True, "invalid_response": True}
                add_issue("REMOTE_INVALID_RESPONSE", "Remote admin API returned invalid JSON", "warning")
                return jsonify(result)

            remote_status = {
                "reachable": True,
                "local_site": remote_local,
                "site_sync_enabled": None,
                "has_peer_for_us": False,
                "peer_connection_configured": False,
                "has_bidirectional_rules_for_us": False,
            }
            result["remote_status"] = remote_status

            # The remote knows us if any of its peers matches our site id or endpoint.
            for rp in remote_peers:
                if not isinstance(rp, dict):
                    continue
                if local_site and (
                    rp.get("site_id") == local_site.site_id
                    or rp.get("endpoint") == local_site.endpoint
                ):
                    remote_status["has_peer_for_us"] = True
                    remote_status["peer_connection_configured"] = bool(rp.get("connection_id"))
                    break

            if not remote_status["has_peer_for_us"]:
                add_issue(
                    "REMOTE_NO_PEER_FOR_US",
                    "Remote site does not have this site registered as a peer",
                    "error",
                )
            elif not remote_status["peer_connection_configured"]:
                add_issue(
                    "REMOTE_NO_CONNECTION_FOR_US",
                    "Remote site has us as peer but no connection configured (cannot push back)",
                    "error",
                )
        elif resp.status_code in (401, 403):
            result["remote_status"] = {
                "reachable": True,
                "admin_access_denied": True,
            }
            add_issue(
                "REMOTE_ADMIN_ACCESS_DENIED",
                "Cannot verify remote configuration (admin access denied)",
                "warning",
            )
        else:
            result["remote_status"] = {
                "reachable": True,
                "admin_api_error": resp.status_code,
            }
            add_issue(
                "REMOTE_ADMIN_API_ERROR",
                f"Remote admin API returned status {resp.status_code}",
                "warning",
            )
    except requests.RequestException as e:
        logger.warning("Remote admin API unreachable: %s", e)
        result["remote_status"] = {
            "reachable": False,
            "error": "Connection failed",
        }
        add_issue("REMOTE_ADMIN_UNREACHABLE", "Could not reach remote admin API", "warning")
    except Exception as e:
        # Best-effort diagnostics: never let verification break the endpoint.
        logger.warning("Error checking remote bidirectional status: %s", e, exc_info=True)
        add_issue("VERIFICATION_ERROR", "Internal error during verification", "warning")

    error_issues = [i for i in result["issues"] if i["severity"] == "error"]
    result["is_fully_configured"] = len(error_issues) == 0 and len(local_bidir_rules) > 0
    return jsonify(result)
def _website_domains() -> WebsiteDomainStore:
    """Return the object registered under the app's ``website_domains`` extension key."""
    extensions = current_app.extensions
    return extensions["website_domains"]
def _storage():
    """Return the object registered under the app's ``object_storage`` extension key."""
    extensions = current_app.extensions
    return extensions["object_storage"]
def _require_iam_action(action: str):
    """Authenticate the caller and check that it may perform ``action``.

    Returns ``(principal, None)`` on success. Otherwise returns
    ``(None, error_response)``: the authentication error as-is, or a 403
    ``AccessDenied`` response when IAM refuses the action.
    """
    principal, error = _require_principal()
    if error:
        return None, error
    try:
        _iam().authorize(principal, None, action)
    except IamError:
        return None, _json_error("AccessDenied", f"Requires {action} permission", 403)
    return principal, None
@admin_api_bp.route("/iam/users", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def iam_list_users():
    """List IAM users; requires the ``iam:list_users`` action."""
    principal, error = _require_iam_action("iam:list_users")
    if error:
        return error
    users = _iam().list_users()
    return jsonify({"users": users})
@admin_api_bp.route("/iam/users/<identifier>", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def iam_get_user(identifier):
    """Fetch one IAM user; requires ``iam:get_user``. Any IamError maps to 404."""
    principal, error = _require_iam_action("iam:get_user")
    if error:
        return error
    iam = _iam()
    try:
        # The URL identifier is resolved to a user id before the lookup.
        user = iam.get_user_by_id(iam.resolve_user_id(identifier))
        return jsonify(user)
    except IamError as exc:
        return _json_error("NotFound", str(exc), 404)
@admin_api_bp.route("/iam/users/<identifier>/policies", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def iam_get_user_policies(identifier):
    """Return a user's policies; requires ``iam:get_policy``. Any IamError maps to 404."""
    principal, error = _require_iam_action("iam:get_policy")
    if error:
        return error
    try:
        policies = _iam().get_user_policies(identifier)
        return jsonify({"policies": policies})
    except IamError as exc:
        return _json_error("NotFound", str(exc), 404)
@admin_api_bp.route("/iam/users/<identifier>/keys", methods=["POST"])
@limiter.limit(lambda: _get_admin_rate_limit())
def iam_create_access_key(identifier):
    """Create an access key for a user; requires ``iam:create_key``.

    Responds 201 with whatever the IAM service returns, or 400 on IamError.
    """
    principal, error = _require_iam_action("iam:create_key")
    if error:
        return error
    try:
        created = _iam().create_access_key(identifier)
        logger.info("Access key created for %s by %s", identifier, principal.access_key)
        return jsonify(created), 201
    except IamError as exc:
        return _json_error("InvalidRequest", str(exc), 400)
@admin_api_bp.route("/iam/users/<identifier>/keys/<access_key>", methods=["DELETE"])
@limiter.limit(lambda: _get_admin_rate_limit())
def iam_delete_access_key(identifier, access_key):
    """Delete an access key; requires ``iam:delete_key``. 204 on success, 400 on IamError."""
    principal, error = _require_iam_action("iam:delete_key")
    if error:
        return error
    # NOTE(review): `identifier` from the URL is not used; the key is deleted by
    # access_key alone. Confirm whether ownership by that user should be checked.
    try:
        _iam().delete_access_key(access_key)
        logger.info("Access key %s deleted by %s", access_key, principal.access_key)
        return "", 204
    except IamError as exc:
        return _json_error("InvalidRequest", str(exc), 400)
@admin_api_bp.route("/iam/users/<identifier>/disable", methods=["POST"])
@limiter.limit(lambda: _get_admin_rate_limit())
def iam_disable_user(identifier):
    """Disable a user; requires ``iam:disable_user``. 400 on IamError."""
    principal, error = _require_iam_action("iam:disable_user")
    if error:
        return error
    try:
        _iam().disable_user(identifier)
        logger.info("User %s disabled by %s", identifier, principal.access_key)
        body = {"status": "disabled"}
        return jsonify(body)
    except IamError as exc:
        return _json_error("InvalidRequest", str(exc), 400)
@admin_api_bp.route("/iam/users/<identifier>/enable", methods=["POST"])
@limiter.limit(lambda: _get_admin_rate_limit())
def iam_enable_user(identifier):
    """Re-enable a user. 400 on IamError."""
    # NOTE(review): enabling is gated on the "iam:disable_user" action, the same
    # one used for disabling. Presumably intentional (one permission for both
    # directions); confirm against the IAM action list.
    principal, error = _require_iam_action("iam:disable_user")
    if error:
        return error
    try:
        _iam().enable_user(identifier)
        logger.info("User %s enabled by %s", identifier, principal.access_key)
        body = {"status": "enabled"}
        return jsonify(body)
    except IamError as exc:
        return _json_error("InvalidRequest", str(exc), 400)
@admin_api_bp.route("/website-domains", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def list_website_domains():
    """Return all domain mappings; 400 when website hosting is switched off."""
    principal, error = _require_admin()
    if error:
        return error
    hosting_enabled = current_app.config.get("WEBSITE_HOSTING_ENABLED", False)
    if not hosting_enabled:
        return _json_error("InvalidRequest", "Website hosting is not enabled", 400)
    mappings = _website_domains().list_all()
    return jsonify(mappings)
@admin_api_bp.route("/website-domains", methods=["POST"])
@limiter.limit(lambda: _get_admin_rate_limit())
def create_website_domain():
    """Map a domain to an existing bucket.

    Expects a JSON object with string fields ``domain`` and ``bucket``.
    Responds 201 with the stored mapping, 400 for a malformed or invalid
    request (or when hosting is disabled), 404 when the bucket does not
    exist, and 409 when the domain is already mapped.
    """
    principal, error = _require_admin()
    if error:
        return error
    if not current_app.config.get("WEBSITE_HOSTING_ENABLED", False):
        return _json_error("InvalidRequest", "Website hosting is not enabled", 400)

    payload = request.get_json(silent=True) or {}
    # Valid JSON that is not an object (e.g. a list) has no .get().
    if not isinstance(payload, dict):
        return _json_error("ValidationError", "Request body must be a JSON object", 400)

    raw_domain = payload.get("domain") or ""
    raw_bucket = payload.get("bucket") or ""
    # Non-string values (numbers, lists, ...) would otherwise raise in
    # normalize_domain()/.strip() and surface as a server error.
    if not isinstance(raw_domain, str) or not isinstance(raw_bucket, str):
        return _json_error("ValidationError", "domain and bucket must be strings", 400)

    domain = normalize_domain(raw_domain)
    bucket = raw_bucket.strip()
    if not domain:
        return _json_error("ValidationError", "domain is required", 400)
    if not is_valid_domain(domain):
        return _json_error("ValidationError", f"Invalid domain: '{domain}'", 400)
    if not bucket:
        return _json_error("ValidationError", "bucket is required", 400)

    storage = _storage()
    if not storage.bucket_exists(bucket):
        return _json_error("NoSuchBucket", f"Bucket '{bucket}' does not exist", 404)

    store = _website_domains()
    existing = store.get_bucket(domain)
    if existing:
        return _json_error("Conflict", f"Domain '{domain}' is already mapped to bucket '{existing}'", 409)

    store.set_mapping(domain, bucket)
    logger.info("Website domain mapping created: %s -> %s", domain, bucket)
    return jsonify({"domain": domain, "bucket": bucket}), 201
@admin_api_bp.route("/website-domains/<domain>", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def get_website_domain(domain: str):
    """Look up the bucket mapped to a domain; 404 when no mapping exists."""
    principal, error = _require_admin()
    if error:
        return error
    hosting_enabled = current_app.config.get("WEBSITE_HOSTING_ENABLED", False)
    if not hosting_enabled:
        return _json_error("InvalidRequest", "Website hosting is not enabled", 400)

    domain = normalize_domain(domain)
    bucket = _website_domains().get_bucket(domain)
    if bucket:
        return jsonify({"domain": domain, "bucket": bucket})
    return _json_error("NotFound", f"No mapping found for domain '{domain}'", 404)
@admin_api_bp.route("/website-domains/<domain>", methods=["PUT"])
@limiter.limit(lambda: _get_admin_rate_limit())
def update_website_domain(domain: str):
    """Point an existing domain mapping at a different bucket.

    Expects a JSON object with a string ``bucket``. Responds with the updated
    mapping, 400 for a malformed request (or when hosting is disabled), or
    404 when either the bucket or the mapping does not exist.
    """
    principal, error = _require_admin()
    if error:
        return error
    if not current_app.config.get("WEBSITE_HOSTING_ENABLED", False):
        return _json_error("InvalidRequest", "Website hosting is not enabled", 400)

    domain = normalize_domain(domain)
    payload = request.get_json(silent=True) or {}
    # Valid JSON that is not an object (e.g. a list) has no .get().
    if not isinstance(payload, dict):
        return _json_error("ValidationError", "Request body must be a JSON object", 400)

    raw_bucket = payload.get("bucket") or ""
    # A non-string bucket would otherwise raise on .strip().
    if not isinstance(raw_bucket, str):
        return _json_error("ValidationError", "bucket must be a string", 400)
    bucket = raw_bucket.strip()
    if not bucket:
        return _json_error("ValidationError", "bucket is required", 400)

    storage = _storage()
    if not storage.bucket_exists(bucket):
        return _json_error("NoSuchBucket", f"Bucket '{bucket}' does not exist", 404)

    store = _website_domains()
    if not store.get_bucket(domain):
        return _json_error("NotFound", f"No mapping found for domain '{domain}'", 404)

    store.set_mapping(domain, bucket)
    logger.info("Website domain mapping updated: %s -> %s", domain, bucket)
    return jsonify({"domain": domain, "bucket": bucket})
@admin_api_bp.route("/website-domains/<domain>", methods=["DELETE"])
@limiter.limit(lambda: _get_admin_rate_limit())
def delete_website_domain(domain: str):
    """Delete a domain mapping: 204 on success, 404 when none exists."""
    principal, error = _require_admin()
    if error:
        return error
    hosting_enabled = current_app.config.get("WEBSITE_HOSTING_ENABLED", False)
    if not hosting_enabled:
        return _json_error("InvalidRequest", "Website hosting is not enabled", 400)

    domain = normalize_domain(domain)
    deleted = _website_domains().delete_mapping(domain)
    if deleted:
        logger.info("Website domain mapping deleted: %s", domain)
        return Response(status=204)
    return _json_error("NotFound", f"No mapping found for domain '{domain}'", 404)
def _gc() -> Optional[GarbageCollector]:
    """Return the app's ``gc`` extension, or None when it is not registered."""
    extensions = current_app.extensions
    return extensions.get("gc")
@admin_api_bp.route("/gc/status", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def gc_status():
    """Report GC status, or a disabled notice when no collector is registered."""
    principal, error = _require_admin()
    if error:
        return error
    gc = _gc()
    if gc:
        return jsonify(gc.get_status())
    return jsonify({"enabled": False, "message": "GC is not enabled. Set GC_ENABLED=true to enable."})
@admin_api_bp.route("/gc/run", methods=["POST"])
@limiter.limit(lambda: _get_admin_rate_limit())
def gc_run_now():
    """Start a garbage-collection run in the background.

    The optional JSON body may carry ``dry_run``, forwarded to the collector
    (``None`` when omitted). Responds 400 when GC is disabled or the body is
    not a JSON object, and 409 when a run is already in progress.
    """
    principal, error = _require_admin()
    if error:
        return error
    gc = _gc()
    if not gc:
        return _json_error("InvalidRequest", "GC is not enabled", 400)

    payload = request.get_json(silent=True) or {}
    # Valid JSON that is not an object (e.g. a list) has no .get().
    if not isinstance(payload, dict):
        return _json_error("ValidationError", "Request body must be a JSON object", 400)

    started = gc.run_async(dry_run=payload.get("dry_run"))
    if not started:
        return _json_error("Conflict", "GC is already in progress", 409)
    # Log only runs that actually started, so the audit trail is not misleading.
    logger.info("GC manual run by %s", principal.access_key)
    return jsonify({"status": "started"})
@admin_api_bp.route("/gc/history", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def gc_history():
    """Return past GC executions, paginated by ``limit`` and ``offset``.

    ``limit`` defaults to 50 and is clamped to 0..200; ``offset`` defaults to
    0 and is clamped to be non-negative. Non-integer values yield a 400.
    When GC is disabled an empty list is returned.
    """
    principal, error = _require_admin()
    if error:
        return error
    gc = _gc()
    if not gc:
        return jsonify({"executions": []})

    try:
        limit = int(request.args.get("limit", 50))
        offset = int(request.args.get("offset", 0))
    except ValueError:
        # Query strings are user input; a bad number is a client error.
        return _json_error("ValidationError", "limit and offset must be integers", 400)

    limit = max(0, min(limit, 200))
    offset = max(0, offset)
    records = gc.get_history(limit=limit, offset=offset)
    return jsonify({"executions": records})
def _integrity() -> Optional[IntegrityChecker]:
    """Return the app's ``integrity`` extension, or None when it is not registered."""
    extensions = current_app.extensions
    return extensions.get("integrity")
@admin_api_bp.route("/integrity/status", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def integrity_status():
    """Report integrity-checker status, or a disabled notice when none is registered."""
    principal, error = _require_admin()
    if error:
        return error
    checker = _integrity()
    if checker:
        return jsonify(checker.get_status())
    return jsonify({"enabled": False, "message": "Integrity checker is not enabled. Set INTEGRITY_ENABLED=true to enable."})
@admin_api_bp.route("/integrity/run", methods=["POST"])
@limiter.limit(lambda: _get_admin_rate_limit())
def integrity_run_now():
    """Start an integrity scan in the background.

    The optional JSON body may carry ``dry_run`` and ``auto_heal``; each is
    forwarded to the checker as given, or as ``None`` when omitted. Responds
    400 when the checker is disabled or the body is not a JSON object, and
    409 when a scan is already running.
    """
    principal, error = _require_admin()
    if error:
        return error
    checker = _integrity()
    if not checker:
        return _json_error("InvalidRequest", "Integrity checker is not enabled", 400)

    payload = request.get_json(silent=True) or {}
    # Valid JSON that is not an object (e.g. a list) has no .get().
    if not isinstance(payload, dict):
        return _json_error("ValidationError", "Request body must be a JSON object", 400)

    # dict.get() already yields None for a missing key, so no extra
    # "x if x is not None else None" guard is needed.
    started = checker.run_async(
        auto_heal=payload.get("auto_heal"),
        dry_run=payload.get("dry_run"),
    )
    if not started:
        return _json_error("Conflict", "A scan is already in progress", 409)
    # Log only scans that actually started.
    logger.info("Integrity manual run by %s", principal.access_key)
    return jsonify({"status": "started"})
@admin_api_bp.route("/integrity/history", methods=["GET"])
@limiter.limit(lambda: _get_admin_rate_limit())
def integrity_history():
    """Return past integrity scans, paginated by ``limit`` and ``offset``.

    ``limit`` defaults to 50 and is clamped to 0..200; ``offset`` defaults to
    0 and is clamped to be non-negative. Non-integer values yield a 400.
    When the checker is disabled an empty list is returned.
    """
    principal, error = _require_admin()
    if error:
        return error
    checker = _integrity()
    if not checker:
        return jsonify({"executions": []})

    try:
        limit = int(request.args.get("limit", 50))
        offset = int(request.args.get("offset", 0))
    except ValueError:
        # Query strings are user input; a bad number is a client error.
        return _json_error("ValidationError", "limit and offset must be integers", 400)

    limit = max(0, min(limit, 200))
    offset = max(0, offset)
    records = checker.get_history(limit=limit, offset=offset)
    return jsonify({"executions": records})

View File

@@ -1,27 +1,130 @@
"""Bucket policy loader/enforcer with a subset of AWS semantics."""
from __future__ import annotations
import ipaddress
import json
from dataclasses import dataclass
from fnmatch import fnmatch
import os
import re
import time
from dataclasses import dataclass, field
from fnmatch import fnmatch, translate
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence
from typing import Any, Dict, Iterable, List, Optional, Pattern, Sequence, Tuple
RESOURCE_PREFIX = "arn:aws:s3:::"
@lru_cache(maxsize=256)
def _compile_pattern(pattern: str) -> Pattern[str]:
return re.compile(translate(pattern), re.IGNORECASE)
def _match_string_like(value: str, pattern: str) -> bool:
    """Return True when ``value`` matches the wildcard ``pattern`` (case-insensitive)."""
    return _compile_pattern(pattern).match(value) is not None
def _ip_in_cidr(ip_str: str, cidr: str) -> bool:
try:
ip = ipaddress.ip_address(ip_str)
network = ipaddress.ip_network(cidr, strict=False)
return ip in network
except ValueError:
return False
def _evaluate_condition_operator(
operator: str,
condition_key: str,
condition_values: List[str],
context: Dict[str, Any],
) -> bool:
context_value = context.get(condition_key)
op_lower = operator.lower()
if_exists = op_lower.endswith("ifexists")
if if_exists:
op_lower = op_lower[:-8]
if context_value is None:
return if_exists
context_value_str = str(context_value)
context_value_lower = context_value_str.lower()
if op_lower == "stringequals":
return context_value_str in condition_values
elif op_lower == "stringnotequals":
return context_value_str not in condition_values
elif op_lower == "stringequalsignorecase":
return context_value_lower in [v.lower() for v in condition_values]
elif op_lower == "stringnotequalsignorecase":
return context_value_lower not in [v.lower() for v in condition_values]
elif op_lower == "stringlike":
return any(_match_string_like(context_value_str, p) for p in condition_values)
elif op_lower == "stringnotlike":
return not any(_match_string_like(context_value_str, p) for p in condition_values)
elif op_lower == "ipaddress":
return any(_ip_in_cidr(context_value_str, cidr) for cidr in condition_values)
elif op_lower == "notipaddress":
return not any(_ip_in_cidr(context_value_str, cidr) for cidr in condition_values)
elif op_lower == "bool":
bool_val = context_value_lower in ("true", "1", "yes")
return str(bool_val).lower() in [v.lower() for v in condition_values]
elif op_lower == "null":
is_null = context_value is None or context_value == ""
expected_null = condition_values[0].lower() in ("true", "1", "yes") if condition_values else True
return is_null == expected_null
return False
# Maps lower-cased S3 action names to the internal permission category they
# require. Each action is listed exactly once (duplicate dict keys are legal
# in Python but silently shadow one another).
ACTION_ALIASES = {
    # list
    "s3:listbucket": "list",
    "s3:listallmybuckets": "list",
    "s3:listbucketversions": "list",
    "s3:listmultipartuploads": "list",
    "s3:listparts": "list",
    # read
    "s3:getobject": "read",
    "s3:getobjectversion": "read",
    "s3:getobjecttagging": "read",
    "s3:getobjectversiontagging": "read",
    "s3:getobjectacl": "read",
    "s3:getbucketversioning": "read",
    "s3:headobject": "read",
    "s3:headbucket": "read",
    # write
    "s3:putobject": "write",
    "s3:createbucket": "write",
    "s3:putobjecttagging": "write",
    "s3:putbucketversioning": "write",
    "s3:createmultipartupload": "write",
    "s3:uploadpart": "write",
    "s3:completemultipartupload": "write",
    "s3:abortmultipartupload": "write",
    "s3:copyobject": "write",
    # delete
    "s3:deleteobject": "delete",
    "s3:deleteobjectversion": "delete",
    "s3:deletebucket": "delete",
    "s3:deleteobjecttagging": "delete",
    # share (ACLs)
    "s3:putobjectacl": "share",
    "s3:putbucketacl": "share",
    "s3:getbucketacl": "share",
    # policy
    "s3:putbucketpolicy": "policy",
    "s3:getbucketpolicy": "policy",
    "s3:deletebucketpolicy": "policy",
    # replication
    "s3:getreplicationconfiguration": "replication",
    "s3:putreplicationconfiguration": "replication",
    "s3:deletereplicationconfiguration": "replication",
    "s3:replicateobject": "replication",
    "s3:replicatetags": "replication",
    "s3:replicatedelete": "replication",
    # lifecycle
    "s3:getlifecycleconfiguration": "lifecycle",
    "s3:putlifecycleconfiguration": "lifecycle",
    "s3:deletelifecycleconfiguration": "lifecycle",
    "s3:getbucketlifecycle": "lifecycle",
    "s3:putbucketlifecycle": "lifecycle",
    # cors
    "s3:getbucketcors": "cors",
    "s3:putbucketcors": "cors",
    "s3:deletebucketcors": "cors",
}
@@ -99,7 +202,20 @@ class BucketPolicyStatement:
effect: str
principals: List[str] | str
actions: List[str]
resources: List[tuple[str | None, str | None]]
resources: List[Tuple[str | None, str | None]]
conditions: Dict[str, Dict[str, List[str]]] = field(default_factory=dict)
_compiled_patterns: List[Tuple[str | None, Optional[Pattern[str]]]] | None = None
def _get_compiled_patterns(self) -> List[Tuple[str | None, Optional[Pattern[str]]]]:
if self._compiled_patterns is None:
self._compiled_patterns = []
for resource_bucket, key_pattern in self.resources:
if key_pattern is None:
self._compiled_patterns.append((resource_bucket, None))
else:
regex_pattern = translate(key_pattern)
self._compiled_patterns.append((resource_bucket, re.compile(regex_pattern)))
return self._compiled_patterns
def matches_principal(self, access_key: Optional[str]) -> bool:
if self.principals == "*":
@@ -115,18 +231,29 @@ class BucketPolicyStatement:
def matches_resource(self, bucket: Optional[str], object_key: Optional[str]) -> bool:
bucket = (bucket or "*").lower()
key = object_key or ""
for resource_bucket, key_pattern in self.resources:
for resource_bucket, compiled_pattern in self._get_compiled_patterns():
resource_bucket = (resource_bucket or "*").lower()
if resource_bucket not in {"*", bucket}:
continue
if key_pattern is None:
if compiled_pattern is None:
if not key:
return True
continue
if fnmatch(key, key_pattern):
if compiled_pattern.match(key):
return True
return False
def matches_condition(self, context: Optional[Dict[str, Any]]) -> bool:
    """Return True when every condition on this statement holds for ``context``.

    A statement without conditions always matches. A ``None`` context is
    treated as empty. Evaluation stops at the first failing condition.
    """
    if not self.conditions:
        return True
    effective_context = {} if context is None else context
    return all(
        _evaluate_condition_operator(operator, key, values, effective_context)
        for operator, key_values in self.conditions.items()
        for key, values in key_values.items()
    )
class BucketPolicyStore:
"""Loads bucket policies from disk and evaluates statements."""
@@ -140,8 +267,16 @@ class BucketPolicyStore:
self._policies: Dict[str, List[BucketPolicyStatement]] = {}
self._load()
self._last_mtime = self._current_mtime()
# Performance: Avoid stat() on every request
self._last_stat_check = 0.0
self._stat_check_interval = float(os.environ.get("BUCKET_POLICY_STAT_CHECK_INTERVAL_SECONDS", "2.0"))
def maybe_reload(self) -> None:
# Performance: Skip stat check if we checked recently
now = time.time()
if now - self._last_stat_check < self._stat_check_interval:
return
self._last_stat_check = now
current = self._current_mtime()
if current is None or current == self._last_mtime:
return
@@ -154,13 +289,13 @@ class BucketPolicyStore:
except FileNotFoundError:
return None
# ------------------------------------------------------------------
def evaluate(
self,
access_key: Optional[str],
bucket: Optional[str],
object_key: Optional[str],
action: str,
context: Optional[Dict[str, Any]] = None,
) -> str | None:
bucket = (bucket or "").lower()
statements = self._policies.get(bucket) or []
@@ -172,6 +307,8 @@ class BucketPolicyStore:
continue
if not statement.matches_resource(bucket, object_key):
continue
if not statement.matches_condition(context):
continue
if statement.effect == "deny":
return "deny"
decision = "allow"
@@ -195,7 +332,6 @@ class BucketPolicyStore:
self._policies.pop(bucket, None)
self._persist()
# ------------------------------------------------------------------
def _load(self) -> None:
try:
content = self.policy_path.read_text(encoding='utf-8')
@@ -237,6 +373,7 @@ class BucketPolicyStore:
if not resources:
continue
effect = statement.get("Effect", "Allow").lower()
conditions = self._normalize_conditions(statement.get("Condition", {}))
statements.append(
BucketPolicyStatement(
sid=statement.get("Sid"),
@@ -244,6 +381,24 @@ class BucketPolicyStore:
principals=principals,
actions=actions or ["*"],
resources=resources,
conditions=conditions,
)
)
return statements
def _normalize_conditions(self, condition_block: Dict[str, Any]) -> Dict[str, Dict[str, List[str]]]:
if not condition_block or not isinstance(condition_block, dict):
return {}
normalized: Dict[str, Dict[str, List[str]]] = {}
for operator, key_values in condition_block.items():
if not isinstance(key_values, dict):
continue
normalized[operator] = {}
for cond_key, cond_values in key_values.items():
if isinstance(cond_values, str):
normalized[operator][cond_key] = [cond_values]
elif isinstance(cond_values, list):
normalized[operator][cond_key] = [str(v) for v in cond_values]
else:
normalized[operator][cond_key] = [str(cond_values)]
return normalized

109
app/compression.py Normal file
View File

@@ -0,0 +1,109 @@
from __future__ import annotations
import gzip
import io
from typing import Callable, Iterable, List, Tuple
# Media types (without parameters) whose bodies are worth gzip-compressing.
COMPRESSIBLE_MIMES = frozenset([
    'application/json',
    'application/javascript',
    'application/xml',
    'text/html',
    'text/css',
    'text/plain',
    'text/xml',
    'text/javascript',
    'application/x-ndjson',
])

# Bodies smaller than this many bytes are sent uncompressed.
MIN_SIZE_FOR_COMPRESSION = 500


class GzipMiddleware:
    """WSGI middleware that gzip-compresses eligible responses.

    A response is compressed when the client advertises ``gzip`` in
    ``Accept-Encoding``, its content type is in ``COMPRESSIBLE_MIMES``, the
    body is at least ``min_size`` bytes, and compression actually shrinks it.

    Responses that already carry ``Content-Encoding``, that set an
    ``X-Stream-Response`` header, or whose content type is not compressible
    are passed through untouched and unbuffered. Everything else is buffered
    in memory before being sent.
    """

    def __init__(self, app: Callable, compression_level: int = 6, min_size: int = MIN_SIZE_FOR_COMPRESSION):
        self.app = app
        self.compression_level = compression_level
        self.min_size = min_size

    def __call__(self, environ: dict, start_response: Callable) -> Iterable[bytes]:
        accept_encoding = environ.get('HTTP_ACCEPT_ENCODING', '')
        if 'gzip' not in accept_encoding.lower():
            return self.app(environ, start_response)

        response_started = False
        status_line = ''
        response_headers: List[Tuple[str, str]] = []
        should_compress = False
        passthrough = False
        exc_info_holder = [None]

        def custom_start_response(status: str, headers: List[Tuple[str, str]], exc_info=None):
            """Capture status/headers; forward immediately only for passthrough responses."""
            nonlocal response_started, status_line, response_headers, should_compress, passthrough
            response_started = True
            # Keep the application's exact status line (code AND reason phrase)
            # so it can be replayed verbatim once the body has been processed.
            status_line = status
            response_headers = list(headers)
            exc_info_holder[0] = exc_info

            content_type = None
            content_length = None
            for name, value in headers:
                name_lower = name.lower()
                if name_lower == 'content-type':
                    content_type = value.split(';')[0].strip().lower()
                elif name_lower == 'content-length':
                    try:
                        content_length = int(value)
                    except (ValueError, TypeError):
                        pass
                elif name_lower in ('content-encoding', 'x-stream-response'):
                    # Already encoded, or explicitly marked as streaming.
                    passthrough = True
                    return start_response(status, headers, exc_info)

            if content_type and content_type in COMPRESSIBLE_MIMES:
                if content_length is None or content_length >= self.min_size:
                    should_compress = True
            else:
                passthrough = True
                return start_response(status, headers, exc_info)
            # Buffered mode: the real start_response is deferred, so no
            # write() callable is available to the application here.
            return None

        app_iter = self.app(environ, custom_start_response)
        if passthrough:
            return app_iter

        try:
            response_body = b''.join(app_iter)
        finally:
            # WSGI requires close() to be called on the application's
            # iterable once it has been consumed, if it defines one.
            close = getattr(app_iter, 'close', None)
            if close is not None:
                close()

        if not response_started:
            return [response_body]

        if should_compress and len(response_body) >= self.min_size:
            buf = io.BytesIO()
            with gzip.GzipFile(fileobj=buf, mode='wb', compresslevel=self.compression_level) as gz:
                gz.write(response_body)
            compressed = buf.getvalue()
            # Only switch to the gzip body when it is actually smaller.
            if len(compressed) < len(response_body):
                response_body = compressed
                new_headers = [
                    (name, value)
                    for name, value in response_headers
                    if name.lower() not in ('content-length', 'content-encoding')
                ]
                new_headers.append(('Content-Encoding', 'gzip'))
                new_headers.append(('Content-Length', str(len(response_body))))
                new_headers.append(('Vary', 'Accept-Encoding'))
                response_headers = new_headers

        start_response(status_line, response_headers, exc_info_holder[0])
        return [response_body]

View File

@@ -1,15 +1,45 @@
"""Configuration helpers for the S3 clone application."""
from __future__ import annotations
import os
import re
import secrets
import shutil
import sys
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional
PROJECT_ROOT = Path(__file__).resolve().parent.parent
import psutil
def _calculate_auto_threads() -> int:
    """Pick a worker-thread count: twice the logical CPU count, clamped to 1..64.

    Falls back to assuming 4 CPUs when psutil cannot report a count.
    """
    logical_cpus = psutil.cpu_count(logical=True) or 4
    threads = min(logical_cpus * 2, 64)
    return max(1, threads)
def _calculate_auto_connection_limit() -> int:
    """Derive a connection limit from available memory: one per 5 MiB, clamped to 20..1000."""
    bytes_per_mib = 1024 * 1024
    available_mb = psutil.virtual_memory().available / bytes_per_mib
    calculated = int(available_mb / 5)
    return max(20, min(calculated, 1000))
def _calculate_auto_backlog(connection_limit: int) -> int:
return max(128, min(connection_limit * 2, 4096))
def _validate_rate_limit(value: str) -> str:
pattern = r"^\d+\s+per\s+(second|minute|hour|day)$"
if not re.match(pattern, value):
raise ValueError(f"Invalid rate limit format: {value}. Expected format: '200 per minute'")
return value
# PyInstaller bundles mark themselves with sys.frozen; in that case the root is
# sys._MEIPASS. In a normal Python environment it is the directory two levels
# above this file.
PROJECT_ROOT = (
    Path(sys._MEIPASS)
    if getattr(sys, "frozen", False)
    else Path(__file__).resolve().parent.parent
)
def _prepare_config_file(active_path: Path, legacy_path: Optional[Path] = None) -> Path:
@@ -39,19 +69,25 @@ class AppConfig:
secret_key: str
iam_config_path: Path
bucket_policy_path: Path
api_base_url: str
api_base_url: Optional[str]
aws_region: str
aws_service: str
ui_enforce_bucket_policies: bool
log_level: str
log_to_file: bool
log_path: Path
log_max_bytes: int
log_backup_count: int
ratelimit_default: str
ratelimit_storage_uri: str
ratelimit_list_buckets: str
ratelimit_bucket_ops: str
ratelimit_object_ops: str
ratelimit_head_ops: str
cors_origins: list[str]
cors_methods: list[str]
cors_allow_headers: list[str]
cors_expose_headers: list[str]
session_lifetime_days: int
auth_max_attempts: int
auth_lockout_minutes: int
@@ -59,6 +95,76 @@ class AppConfig:
secret_ttl_seconds: int
stream_chunk_size: int
multipart_min_part_size: int
bucket_stats_cache_ttl: int
object_cache_ttl: int
encryption_enabled: bool
encryption_master_key_path: Path
kms_enabled: bool
kms_keys_path: Path
default_encryption_algorithm: str
display_timezone: str
lifecycle_enabled: bool
lifecycle_interval_seconds: int
metrics_history_enabled: bool
metrics_history_retention_hours: int
metrics_history_interval_minutes: int
operation_metrics_enabled: bool
operation_metrics_interval_minutes: int
operation_metrics_retention_hours: int
server_threads: int
server_connection_limit: int
server_backlog: int
server_channel_timeout: int
server_max_buffer_size: int
server_threads_auto: bool
server_connection_limit_auto: bool
server_backlog_auto: bool
site_sync_enabled: bool
site_sync_interval_seconds: int
site_sync_batch_size: int
sigv4_timestamp_tolerance_seconds: int
presigned_url_min_expiry_seconds: int
presigned_url_max_expiry_seconds: int
replication_connect_timeout_seconds: int
replication_read_timeout_seconds: int
replication_max_retries: int
replication_streaming_threshold_bytes: int
replication_max_failures_per_bucket: int
site_sync_connect_timeout_seconds: int
site_sync_read_timeout_seconds: int
site_sync_max_retries: int
site_sync_clock_skew_tolerance_seconds: float
object_key_max_length_bytes: int
object_cache_max_size: int
meta_read_cache_max: int
bucket_config_cache_ttl_seconds: float
object_tag_limit: int
encryption_chunk_size_bytes: int
kms_generate_data_key_min_bytes: int
kms_generate_data_key_max_bytes: int
lifecycle_max_history_per_bucket: int
site_id: Optional[str]
site_endpoint: Optional[str]
site_region: str
site_priority: int
ratelimit_admin: str
num_trusted_proxies: int
allowed_redirect_hosts: list[str]
allow_internal_endpoints: bool
website_hosting_enabled: bool
gc_enabled: bool
gc_interval_hours: float
gc_temp_file_max_age_hours: float
gc_multipart_max_age_days: int
gc_lock_file_max_age_hours: float
gc_dry_run: bool
gc_io_throttle_ms: int
integrity_enabled: bool
integrity_interval_hours: float
integrity_batch_size: int
integrity_auto_heal: bool
integrity_dry_run: bool
integrity_io_throttle_ms: int
@classmethod
def from_env(cls, overrides: Optional[Dict[str, Any]] = None) -> "AppConfig":
@@ -68,7 +174,7 @@ class AppConfig:
return overrides.get(name, os.getenv(name, default))
storage_root = Path(_get("STORAGE_ROOT", PROJECT_ROOT / "data")).resolve()
max_upload_size = int(_get("MAX_UPLOAD_SIZE", 1024 * 1024 * 1024)) # 1 GiB default
max_upload_size = int(_get("MAX_UPLOAD_SIZE", 1024 * 1024 * 1024))
ui_page_size = int(_get("UI_PAGE_SIZE", 100))
auth_max_attempts = int(_get("AUTH_MAX_ATTEMPTS", 5))
auth_lockout_minutes = int(_get("AUTH_LOCKOUT_MINUTES", 15))
@@ -76,42 +182,67 @@ class AppConfig:
secret_ttl_seconds = int(_get("SECRET_TTL_SECONDS", 300))
stream_chunk_size = int(_get("STREAM_CHUNK_SIZE", 64 * 1024))
multipart_min_part_size = int(_get("MULTIPART_MIN_PART_SIZE", 5 * 1024 * 1024))
lifecycle_enabled = _get("LIFECYCLE_ENABLED", "false").lower() in ("true", "1", "yes")
lifecycle_interval_seconds = int(_get("LIFECYCLE_INTERVAL_SECONDS", 3600))
default_secret = "dev-secret-key"
secret_key = str(_get("SECRET_KEY", default_secret))
if not secret_key or secret_key == default_secret:
secret_file = storage_root / ".myfsio.sys" / "config" / ".secret"
if secret_file.exists():
secret_key = secret_file.read_text().strip()
else:
generated = secrets.token_urlsafe(32)
if secret_key == default_secret:
warnings.warn("Using insecure default SECRET_KEY. A random value has been generated; set SECRET_KEY for production", RuntimeWarning)
warnings.warn("Using insecure default SECRET_KEY. A random value has been generated and persisted; set SECRET_KEY for production", RuntimeWarning)
try:
secret_file.parent.mkdir(parents=True, exist_ok=True)
secret_file.write_text(generated)
try:
os.chmod(secret_file, 0o600)
except OSError:
pass
secret_key = generated
except OSError:
secret_key = generated
iam_env_override = "IAM_CONFIG" in overrides or "IAM_CONFIG" in os.environ
bucket_policy_override = "BUCKET_POLICY_PATH" in overrides or "BUCKET_POLICY_PATH" in os.environ
default_iam_path = PROJECT_ROOT / "data" / ".myfsio.sys" / "config" / "iam.json"
default_bucket_policy_path = PROJECT_ROOT / "data" / ".myfsio.sys" / "config" / "bucket_policies.json"
default_iam_path = storage_root / ".myfsio.sys" / "config" / "iam.json"
default_bucket_policy_path = storage_root / ".myfsio.sys" / "config" / "bucket_policies.json"
iam_config_path = Path(_get("IAM_CONFIG", default_iam_path)).resolve()
bucket_policy_path = Path(_get("BUCKET_POLICY_PATH", default_bucket_policy_path)).resolve()
iam_config_path = _prepare_config_file(
iam_config_path,
legacy_path=None if iam_env_override else PROJECT_ROOT / "data" / "iam.json",
legacy_path=None if iam_env_override else storage_root / "iam.json",
)
bucket_policy_path = _prepare_config_file(
bucket_policy_path,
legacy_path=None if bucket_policy_override else PROJECT_ROOT / "data" / "bucket_policies.json",
legacy_path=None if bucket_policy_override else storage_root / "bucket_policies.json",
)
api_base_url = str(_get("API_BASE_URL", "http://127.0.0.1:5000"))
api_base_url = _get("API_BASE_URL", None)
if api_base_url:
api_base_url = str(api_base_url)
aws_region = str(_get("AWS_REGION", "us-east-1"))
aws_service = str(_get("AWS_SERVICE", "s3"))
enforce_ui_policies = str(_get("UI_ENFORCE_BUCKET_POLICIES", "0")).lower() in {"1", "true", "yes", "on"}
log_level = str(_get("LOG_LEVEL", "INFO")).upper()
log_dir = Path(_get("LOG_DIR", PROJECT_ROOT / "logs")).resolve()
log_to_file = str(_get("LOG_TO_FILE", "1")).lower() in {"1", "true", "yes", "on"}
log_dir = Path(_get("LOG_DIR", storage_root.parent / "logs")).resolve()
log_dir.mkdir(parents=True, exist_ok=True)
log_path = log_dir / str(_get("LOG_FILE", "app.log"))
log_max_bytes = int(_get("LOG_MAX_BYTES", 5 * 1024 * 1024))
log_backup_count = int(_get("LOG_BACKUP_COUNT", 3))
ratelimit_default = str(_get("RATE_LIMIT_DEFAULT", "200 per minute"))
ratelimit_default = _validate_rate_limit(str(_get("RATE_LIMIT_DEFAULT", "200 per minute")))
ratelimit_storage_uri = str(_get("RATE_LIMIT_STORAGE_URI", "memory://"))
ratelimit_list_buckets = _validate_rate_limit(str(_get("RATE_LIMIT_LIST_BUCKETS", "60 per minute")))
ratelimit_bucket_ops = _validate_rate_limit(str(_get("RATE_LIMIT_BUCKET_OPS", "120 per minute")))
ratelimit_object_ops = _validate_rate_limit(str(_get("RATE_LIMIT_OBJECT_OPS", "240 per minute")))
ratelimit_head_ops = _validate_rate_limit(str(_get("RATE_LIMIT_HEAD_OPS", "100 per minute")))
def _csv(value: str, default: list[str]) -> list[str]:
if not value:
@@ -120,19 +251,104 @@ class AppConfig:
return parts or default
cors_origins = _csv(str(_get("CORS_ORIGINS", "*")), ["*"])
cors_methods = _csv(str(_get("CORS_METHODS", "GET,PUT,POST,DELETE,OPTIONS")), ["GET", "PUT", "POST", "DELETE", "OPTIONS"])
cors_allow_headers = _csv(str(_get("CORS_ALLOW_HEADERS", "Content-Type,X-Access-Key,X-Secret-Key,X-Amz-Algorithm,X-Amz-Credential,X-Amz-Date,X-Amz-Expires,X-Amz-SignedHeaders,X-Amz-Signature")), [
"Content-Type",
"X-Access-Key",
"X-Secret-Key",
"X-Amz-Algorithm",
"X-Amz-Credential",
"X-Amz-Date",
"X-Amz-Expires",
"X-Amz-SignedHeaders",
"X-Amz-Signature",
])
cors_methods = _csv(str(_get("CORS_METHODS", "GET,PUT,POST,DELETE,OPTIONS,HEAD")), ["GET", "PUT", "POST", "DELETE", "OPTIONS", "HEAD"])
cors_allow_headers = _csv(str(_get("CORS_ALLOW_HEADERS", "*")), ["*"])
cors_expose_headers = _csv(str(_get("CORS_EXPOSE_HEADERS", "*")), ["*"])
session_lifetime_days = int(_get("SESSION_LIFETIME_DAYS", 30))
bucket_stats_cache_ttl = int(_get("BUCKET_STATS_CACHE_TTL", 60))
object_cache_ttl = int(_get("OBJECT_CACHE_TTL", 60))
encryption_enabled = str(_get("ENCRYPTION_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
encryption_keys_dir = storage_root / ".myfsio.sys" / "keys"
encryption_master_key_path = Path(_get("ENCRYPTION_MASTER_KEY_PATH", encryption_keys_dir / "master.key")).resolve()
kms_enabled = str(_get("KMS_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
kms_keys_path = Path(_get("KMS_KEYS_PATH", encryption_keys_dir / "kms_keys.json")).resolve()
default_encryption_algorithm = str(_get("DEFAULT_ENCRYPTION_ALGORITHM", "AES256"))
display_timezone = str(_get("DISPLAY_TIMEZONE", "UTC"))
metrics_history_enabled = str(_get("METRICS_HISTORY_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
metrics_history_retention_hours = int(_get("METRICS_HISTORY_RETENTION_HOURS", 24))
metrics_history_interval_minutes = int(_get("METRICS_HISTORY_INTERVAL_MINUTES", 5))
operation_metrics_enabled = str(_get("OPERATION_METRICS_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
operation_metrics_interval_minutes = int(_get("OPERATION_METRICS_INTERVAL_MINUTES", 5))
operation_metrics_retention_hours = int(_get("OPERATION_METRICS_RETENTION_HOURS", 24))
_raw_threads = int(_get("SERVER_THREADS", 0))
if _raw_threads == 0:
server_threads = _calculate_auto_threads()
server_threads_auto = True
else:
server_threads = _raw_threads
server_threads_auto = False
_raw_conn_limit = int(_get("SERVER_CONNECTION_LIMIT", 0))
if _raw_conn_limit == 0:
server_connection_limit = _calculate_auto_connection_limit()
server_connection_limit_auto = True
else:
server_connection_limit = _raw_conn_limit
server_connection_limit_auto = False
_raw_backlog = int(_get("SERVER_BACKLOG", 0))
if _raw_backlog == 0:
server_backlog = _calculate_auto_backlog(server_connection_limit)
server_backlog_auto = True
else:
server_backlog = _raw_backlog
server_backlog_auto = False
server_channel_timeout = int(_get("SERVER_CHANNEL_TIMEOUT", 120))
server_max_buffer_size = int(_get("SERVER_MAX_BUFFER_SIZE", 1024 * 1024 * 128))
site_sync_enabled = str(_get("SITE_SYNC_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
site_sync_interval_seconds = int(_get("SITE_SYNC_INTERVAL_SECONDS", 60))
site_sync_batch_size = int(_get("SITE_SYNC_BATCH_SIZE", 100))
sigv4_timestamp_tolerance_seconds = int(_get("SIGV4_TIMESTAMP_TOLERANCE_SECONDS", 900))
presigned_url_min_expiry_seconds = int(_get("PRESIGNED_URL_MIN_EXPIRY_SECONDS", 1))
presigned_url_max_expiry_seconds = int(_get("PRESIGNED_URL_MAX_EXPIRY_SECONDS", 604800))
replication_connect_timeout_seconds = int(_get("REPLICATION_CONNECT_TIMEOUT_SECONDS", 5))
replication_read_timeout_seconds = int(_get("REPLICATION_READ_TIMEOUT_SECONDS", 30))
replication_max_retries = int(_get("REPLICATION_MAX_RETRIES", 2))
replication_streaming_threshold_bytes = int(_get("REPLICATION_STREAMING_THRESHOLD_BYTES", 10 * 1024 * 1024))
replication_max_failures_per_bucket = int(_get("REPLICATION_MAX_FAILURES_PER_BUCKET", 50))
site_sync_connect_timeout_seconds = int(_get("SITE_SYNC_CONNECT_TIMEOUT_SECONDS", 10))
site_sync_read_timeout_seconds = int(_get("SITE_SYNC_READ_TIMEOUT_SECONDS", 120))
site_sync_max_retries = int(_get("SITE_SYNC_MAX_RETRIES", 2))
site_sync_clock_skew_tolerance_seconds = float(_get("SITE_SYNC_CLOCK_SKEW_TOLERANCE_SECONDS", 1.0))
object_key_max_length_bytes = int(_get("OBJECT_KEY_MAX_LENGTH_BYTES", 1024))
object_cache_max_size = int(_get("OBJECT_CACHE_MAX_SIZE", 100))
meta_read_cache_max = int(_get("META_READ_CACHE_MAX", 2048))
bucket_config_cache_ttl_seconds = float(_get("BUCKET_CONFIG_CACHE_TTL_SECONDS", 30.0))
object_tag_limit = int(_get("OBJECT_TAG_LIMIT", 50))
encryption_chunk_size_bytes = int(_get("ENCRYPTION_CHUNK_SIZE_BYTES", 64 * 1024))
kms_generate_data_key_min_bytes = int(_get("KMS_GENERATE_DATA_KEY_MIN_BYTES", 1))
kms_generate_data_key_max_bytes = int(_get("KMS_GENERATE_DATA_KEY_MAX_BYTES", 1024))
lifecycle_max_history_per_bucket = int(_get("LIFECYCLE_MAX_HISTORY_PER_BUCKET", 50))
site_id_raw = _get("SITE_ID", None)
site_id = str(site_id_raw).strip() if site_id_raw else None
site_endpoint_raw = _get("SITE_ENDPOINT", None)
site_endpoint = str(site_endpoint_raw).strip() if site_endpoint_raw else None
site_region = str(_get("SITE_REGION", "us-east-1"))
site_priority = int(_get("SITE_PRIORITY", 100))
ratelimit_admin = _validate_rate_limit(str(_get("RATE_LIMIT_ADMIN", "60 per minute")))
num_trusted_proxies = int(_get("NUM_TRUSTED_PROXIES", 1))
allowed_redirect_hosts_raw = _get("ALLOWED_REDIRECT_HOSTS", "")
allowed_redirect_hosts = [h.strip() for h in str(allowed_redirect_hosts_raw).split(",") if h.strip()]
allow_internal_endpoints = str(_get("ALLOW_INTERNAL_ENDPOINTS", "0")).lower() in {"1", "true", "yes", "on"}
website_hosting_enabled = str(_get("WEBSITE_HOSTING_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
gc_enabled = str(_get("GC_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
gc_interval_hours = float(_get("GC_INTERVAL_HOURS", 6.0))
gc_temp_file_max_age_hours = float(_get("GC_TEMP_FILE_MAX_AGE_HOURS", 24.0))
gc_multipart_max_age_days = int(_get("GC_MULTIPART_MAX_AGE_DAYS", 7))
gc_lock_file_max_age_hours = float(_get("GC_LOCK_FILE_MAX_AGE_HOURS", 1.0))
gc_dry_run = str(_get("GC_DRY_RUN", "0")).lower() in {"1", "true", "yes", "on"}
gc_io_throttle_ms = int(_get("GC_IO_THROTTLE_MS", 10))
integrity_enabled = str(_get("INTEGRITY_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
integrity_interval_hours = float(_get("INTEGRITY_INTERVAL_HOURS", 24.0))
integrity_batch_size = int(_get("INTEGRITY_BATCH_SIZE", 1000))
integrity_auto_heal = str(_get("INTEGRITY_AUTO_HEAL", "0")).lower() in {"1", "true", "yes", "on"}
integrity_dry_run = str(_get("INTEGRITY_DRY_RUN", "0")).lower() in {"1", "true", "yes", "on"}
integrity_io_throttle_ms = int(_get("INTEGRITY_IO_THROTTLE_MS", 10))
return cls(storage_root=storage_root,
max_upload_size=max_upload_size,
@@ -145,21 +361,223 @@ class AppConfig:
aws_service=aws_service,
ui_enforce_bucket_policies=enforce_ui_policies,
log_level=log_level,
log_to_file=log_to_file,
log_path=log_path,
log_max_bytes=log_max_bytes,
log_backup_count=log_backup_count,
ratelimit_default=ratelimit_default,
ratelimit_storage_uri=ratelimit_storage_uri,
ratelimit_list_buckets=ratelimit_list_buckets,
ratelimit_bucket_ops=ratelimit_bucket_ops,
ratelimit_object_ops=ratelimit_object_ops,
ratelimit_head_ops=ratelimit_head_ops,
cors_origins=cors_origins,
cors_methods=cors_methods,
cors_allow_headers=cors_allow_headers,
cors_expose_headers=cors_expose_headers,
session_lifetime_days=session_lifetime_days,
auth_max_attempts=auth_max_attempts,
auth_lockout_minutes=auth_lockout_minutes,
bulk_delete_max_keys=bulk_delete_max_keys,
secret_ttl_seconds=secret_ttl_seconds,
stream_chunk_size=stream_chunk_size,
multipart_min_part_size=multipart_min_part_size)
multipart_min_part_size=multipart_min_part_size,
bucket_stats_cache_ttl=bucket_stats_cache_ttl,
object_cache_ttl=object_cache_ttl,
encryption_enabled=encryption_enabled,
encryption_master_key_path=encryption_master_key_path,
kms_enabled=kms_enabled,
kms_keys_path=kms_keys_path,
default_encryption_algorithm=default_encryption_algorithm,
display_timezone=display_timezone,
lifecycle_enabled=lifecycle_enabled,
lifecycle_interval_seconds=lifecycle_interval_seconds,
metrics_history_enabled=metrics_history_enabled,
metrics_history_retention_hours=metrics_history_retention_hours,
metrics_history_interval_minutes=metrics_history_interval_minutes,
operation_metrics_enabled=operation_metrics_enabled,
operation_metrics_interval_minutes=operation_metrics_interval_minutes,
operation_metrics_retention_hours=operation_metrics_retention_hours,
server_threads=server_threads,
server_connection_limit=server_connection_limit,
server_backlog=server_backlog,
server_channel_timeout=server_channel_timeout,
server_max_buffer_size=server_max_buffer_size,
server_threads_auto=server_threads_auto,
server_connection_limit_auto=server_connection_limit_auto,
server_backlog_auto=server_backlog_auto,
site_sync_enabled=site_sync_enabled,
site_sync_interval_seconds=site_sync_interval_seconds,
site_sync_batch_size=site_sync_batch_size,
sigv4_timestamp_tolerance_seconds=sigv4_timestamp_tolerance_seconds,
presigned_url_min_expiry_seconds=presigned_url_min_expiry_seconds,
presigned_url_max_expiry_seconds=presigned_url_max_expiry_seconds,
replication_connect_timeout_seconds=replication_connect_timeout_seconds,
replication_read_timeout_seconds=replication_read_timeout_seconds,
replication_max_retries=replication_max_retries,
replication_streaming_threshold_bytes=replication_streaming_threshold_bytes,
replication_max_failures_per_bucket=replication_max_failures_per_bucket,
site_sync_connect_timeout_seconds=site_sync_connect_timeout_seconds,
site_sync_read_timeout_seconds=site_sync_read_timeout_seconds,
site_sync_max_retries=site_sync_max_retries,
site_sync_clock_skew_tolerance_seconds=site_sync_clock_skew_tolerance_seconds,
object_key_max_length_bytes=object_key_max_length_bytes,
object_cache_max_size=object_cache_max_size,
meta_read_cache_max=meta_read_cache_max,
bucket_config_cache_ttl_seconds=bucket_config_cache_ttl_seconds,
object_tag_limit=object_tag_limit,
encryption_chunk_size_bytes=encryption_chunk_size_bytes,
kms_generate_data_key_min_bytes=kms_generate_data_key_min_bytes,
kms_generate_data_key_max_bytes=kms_generate_data_key_max_bytes,
lifecycle_max_history_per_bucket=lifecycle_max_history_per_bucket,
site_id=site_id,
site_endpoint=site_endpoint,
site_region=site_region,
site_priority=site_priority,
ratelimit_admin=ratelimit_admin,
num_trusted_proxies=num_trusted_proxies,
allowed_redirect_hosts=allowed_redirect_hosts,
allow_internal_endpoints=allow_internal_endpoints,
website_hosting_enabled=website_hosting_enabled,
gc_enabled=gc_enabled,
gc_interval_hours=gc_interval_hours,
gc_temp_file_max_age_hours=gc_temp_file_max_age_hours,
gc_multipart_max_age_days=gc_multipart_max_age_days,
gc_lock_file_max_age_hours=gc_lock_file_max_age_hours,
gc_dry_run=gc_dry_run,
gc_io_throttle_ms=gc_io_throttle_ms,
integrity_enabled=integrity_enabled,
integrity_interval_hours=integrity_interval_hours,
integrity_batch_size=integrity_batch_size,
integrity_auto_heal=integrity_auto_heal,
integrity_dry_run=integrity_dry_run,
integrity_io_throttle_ms=integrity_io_throttle_ms)
def validate_and_report(self) -> list[str]:
    """Collect startup configuration findings as severity-prefixed strings.

    Checks, in order: STORAGE_ROOT writability and a temp-directory
    heuristic, whether the IAM and bucket-policy files sit under
    STORAGE_ROOT, log directory writability and the same temp-directory
    heuristic, key-file locations when encryption or KMS is enabled, the
    default SECRET_KEY, wildcard CORS origins, server tuning ranges, the
    file-descriptor soft limit (non-Windows only) and a rough per-thread
    memory estimate (only when psutil can be imported).

    Side effects: creates and removes a ``.write_test`` file in STORAGE_ROOT
    and in the log directory, and creates the log directory if it is missing.

    Returns:
        The findings in check order, each prefixed with ``CRITICAL``,
        ``WARNING`` or ``INFO``; an empty list when nothing was flagged.
    """
    findings = []
    temp_markers = ("/tmp", "\\temp", "appdata\\local\\temp")

    def _looks_temporary(location) -> bool:
        # Substring heuristic on the lower-cased path text.
        lowered = str(location).lower()
        return any(marker in lowered for marker in temp_markers)

    def _outside_storage_root(candidate) -> bool:
        # relative_to raises ValueError when candidate is not under the root.
        try:
            candidate.relative_to(self.storage_root)
        except ValueError:
            return True
        return False

    # --- storage root -----------------------------------------------------
    try:
        probe = self.storage_root / ".write_test"
        probe.touch()
        probe.unlink()
    except (OSError, PermissionError) as e:
        findings.append(f"CRITICAL: STORAGE_ROOT '{self.storage_root}' is not writable: {e}")

    if _looks_temporary(self.storage_root):
        findings.append(f"WARNING: STORAGE_ROOT '{self.storage_root}' appears to be a temporary directory. Data may be lost on reboot!")

    # --- config files relative to the storage root -------------------------
    if _outside_storage_root(self.iam_config_path):
        findings.append(f"WARNING: IAM_CONFIG '{self.iam_config_path}' is outside STORAGE_ROOT '{self.storage_root}'. Consider setting IAM_CONFIG explicitly or ensuring paths are aligned.")

    if _outside_storage_root(self.bucket_policy_path):
        findings.append(f"WARNING: BUCKET_POLICY_PATH '{self.bucket_policy_path}' is outside STORAGE_ROOT '{self.storage_root}'. Consider setting BUCKET_POLICY_PATH explicitly.")

    # --- log directory ------------------------------------------------------
    try:
        self.log_path.parent.mkdir(parents=True, exist_ok=True)
        log_probe = self.log_path.parent / ".write_test"
        log_probe.touch()
        log_probe.unlink()
    except (OSError, PermissionError) as e:
        findings.append(f"WARNING: Log directory '{self.log_path.parent}' is not writable: {e}")

    if _looks_temporary(self.log_path):
        findings.append(f"WARNING: LOG_DIR '{self.log_path.parent}' appears to be a temporary directory. Logs may be lost on reboot!")

    # --- key material -------------------------------------------------------
    if self.encryption_enabled and _outside_storage_root(self.encryption_master_key_path):
        findings.append(f"WARNING: ENCRYPTION_MASTER_KEY_PATH '{self.encryption_master_key_path}' is outside STORAGE_ROOT. Ensure proper backup procedures.")

    if self.kms_enabled and _outside_storage_root(self.kms_keys_path):
        findings.append(f"WARNING: KMS_KEYS_PATH '{self.kms_keys_path}' is outside STORAGE_ROOT. Ensure proper backup procedures.")

    # --- secrets and CORS ---------------------------------------------------
    if self.secret_key == "dev-secret-key":
        findings.append("WARNING: Using default SECRET_KEY. Set SECRET_KEY environment variable for production.")

    if "*" in self.cors_origins:
        findings.append("INFO: CORS_ORIGINS is set to '*'. Consider restricting to specific domains in production.")

    # --- server tuning ranges -----------------------------------------------
    if not (1 <= self.server_threads <= 64):
        findings.append(f"CRITICAL: SERVER_THREADS={self.server_threads} is outside valid range (1-64). Server cannot start.")
    if not (10 <= self.server_connection_limit <= 1000):
        findings.append(f"CRITICAL: SERVER_CONNECTION_LIMIT={self.server_connection_limit} is outside valid range (10-1000). Server cannot start.")
    if not (128 <= self.server_backlog <= 4096):
        findings.append(f"CRITICAL: SERVER_BACKLOG={self.server_backlog} is outside valid range (128-4096). Server cannot start.")
    if not (10 <= self.server_channel_timeout <= 300):
        findings.append(f"CRITICAL: SERVER_CHANNEL_TIMEOUT={self.server_channel_timeout} is outside valid range (10-300). Server cannot start.")

    if self.server_max_buffer_size < 1024 * 1024:
        findings.append(f"WARNING: SERVER_MAX_BUFFER_SIZE={self.server_max_buffer_size} is less than 1MB. Large uploads will fail.")

    # --- file-descriptor budget (the resource module is skipped on win32) ---
    if sys.platform != "win32":
        try:
            import resource
            soft_limit = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
            if self.server_connection_limit > int(soft_limit * 0.8):
                findings.append(f"WARNING: SERVER_CONNECTION_LIMIT={self.server_connection_limit} exceeds 80% of system file descriptor limit (soft={soft_limit}). Consider running 'ulimit -n {self.server_connection_limit + 100}'.")
        except (ImportError, OSError):
            pass

    # --- memory estimate: 50 MB per thread against half of available RAM ----
    try:
        import psutil
        available_mb = psutil.virtual_memory().available / (1024 * 1024)
        estimated_mb = self.server_threads * 50
        if estimated_mb > available_mb * 0.5:
            findings.append(f"WARNING: SERVER_THREADS={self.server_threads} may require ~{estimated_mb}MB memory, exceeding 50% of available RAM ({int(available_mb)}MB).")
    except ImportError:
        pass

    return findings
def print_startup_summary(self) -> None:
    """Write the key configuration values and validation findings to stdout.

    Prints a banner, the main paths, optional feature lines (API base URL,
    encryption, KMS, website hosting), the server tuning values (marked
    "(auto)" when the matching ``*_auto`` flag is set), and finally the
    result of ``validate_and_report()``.
    """
    rule = "=" * 60

    def _auto(flag: bool) -> str:
        if flag:
            return " (auto)"
        return ""

    # Banner
    print("\n" + rule)
    print("MyFSIO Configuration Summary")
    print(rule)

    # Paths that are always reported
    print(f" STORAGE_ROOT: {self.storage_root}")
    print(f" IAM_CONFIG: {self.iam_config_path}")
    print(f" BUCKET_POLICY: {self.bucket_policy_path}")
    print(f" LOG_PATH: {self.log_path}")

    # Optional features, shown only when set/enabled
    if self.api_base_url:
        print(f" API_BASE_URL: {self.api_base_url}")
    if self.encryption_enabled:
        print(f" ENCRYPTION: Enabled (Master key: {self.encryption_master_key_path})")
    if self.kms_enabled:
        print(f" KMS: Enabled (Keys: {self.kms_keys_path})")
    if self.website_hosting_enabled:
        print(f" WEBSITE_HOSTING: Enabled")

    # Server tuning
    print(f" SERVER_THREADS: {self.server_threads}{_auto(self.server_threads_auto)}")
    print(f" CONNECTION_LIMIT: {self.server_connection_limit}{_auto(self.server_connection_limit_auto)}")
    print(f" BACKLOG: {self.server_backlog}{_auto(self.server_backlog_auto)}")
    print(f" CHANNEL_TIMEOUT: {self.server_channel_timeout}s")
    print(f" MAX_BUFFER_SIZE: {self.server_max_buffer_size // (1024 * 1024)}MB")
    print(rule)

    # Validation outcome
    issues = self.validate_and_report()
    if not issues:
        print(" ✓ Configuration validated successfully\n")
        return
    print("\nConfiguration Issues Detected:")
    for issue in issues:
        print(f"{issue}")
    print()
def to_flask_config(self) -> Dict[str, Any]:
return {
@@ -179,14 +597,87 @@ class AppConfig:
"SECRET_TTL_SECONDS": self.secret_ttl_seconds,
"STREAM_CHUNK_SIZE": self.stream_chunk_size,
"MULTIPART_MIN_PART_SIZE": self.multipart_min_part_size,
"BUCKET_STATS_CACHE_TTL": self.bucket_stats_cache_ttl,
"OBJECT_CACHE_TTL": self.object_cache_ttl,
"LOG_LEVEL": self.log_level,
"LOG_TO_FILE": self.log_to_file,
"LOG_FILE": str(self.log_path),
"LOG_MAX_BYTES": self.log_max_bytes,
"LOG_BACKUP_COUNT": self.log_backup_count,
"RATELIMIT_DEFAULT": self.ratelimit_default,
"RATELIMIT_STORAGE_URI": self.ratelimit_storage_uri,
"RATELIMIT_LIST_BUCKETS": self.ratelimit_list_buckets,
"RATELIMIT_BUCKET_OPS": self.ratelimit_bucket_ops,
"RATELIMIT_OBJECT_OPS": self.ratelimit_object_ops,
"RATELIMIT_HEAD_OPS": self.ratelimit_head_ops,
"CORS_ORIGINS": self.cors_origins,
"CORS_METHODS": self.cors_methods,
"CORS_ALLOW_HEADERS": self.cors_allow_headers,
"CORS_EXPOSE_HEADERS": self.cors_expose_headers,
"SESSION_LIFETIME_DAYS": self.session_lifetime_days,
"ENCRYPTION_ENABLED": self.encryption_enabled,
"ENCRYPTION_MASTER_KEY_PATH": str(self.encryption_master_key_path),
"KMS_ENABLED": self.kms_enabled,
"KMS_KEYS_PATH": str(self.kms_keys_path),
"DEFAULT_ENCRYPTION_ALGORITHM": self.default_encryption_algorithm,
"DISPLAY_TIMEZONE": self.display_timezone,
"LIFECYCLE_ENABLED": self.lifecycle_enabled,
"LIFECYCLE_INTERVAL_SECONDS": self.lifecycle_interval_seconds,
"METRICS_HISTORY_ENABLED": self.metrics_history_enabled,
"METRICS_HISTORY_RETENTION_HOURS": self.metrics_history_retention_hours,
"METRICS_HISTORY_INTERVAL_MINUTES": self.metrics_history_interval_minutes,
"OPERATION_METRICS_ENABLED": self.operation_metrics_enabled,
"OPERATION_METRICS_INTERVAL_MINUTES": self.operation_metrics_interval_minutes,
"OPERATION_METRICS_RETENTION_HOURS": self.operation_metrics_retention_hours,
"SERVER_THREADS": self.server_threads,
"SERVER_CONNECTION_LIMIT": self.server_connection_limit,
"SERVER_BACKLOG": self.server_backlog,
"SERVER_CHANNEL_TIMEOUT": self.server_channel_timeout,
"SERVER_MAX_BUFFER_SIZE": self.server_max_buffer_size,
"SITE_SYNC_ENABLED": self.site_sync_enabled,
"SITE_SYNC_INTERVAL_SECONDS": self.site_sync_interval_seconds,
"SITE_SYNC_BATCH_SIZE": self.site_sync_batch_size,
"SIGV4_TIMESTAMP_TOLERANCE_SECONDS": self.sigv4_timestamp_tolerance_seconds,
"PRESIGNED_URL_MIN_EXPIRY_SECONDS": self.presigned_url_min_expiry_seconds,
"PRESIGNED_URL_MAX_EXPIRY_SECONDS": self.presigned_url_max_expiry_seconds,
"REPLICATION_CONNECT_TIMEOUT_SECONDS": self.replication_connect_timeout_seconds,
"REPLICATION_READ_TIMEOUT_SECONDS": self.replication_read_timeout_seconds,
"REPLICATION_MAX_RETRIES": self.replication_max_retries,
"REPLICATION_STREAMING_THRESHOLD_BYTES": self.replication_streaming_threshold_bytes,
"REPLICATION_MAX_FAILURES_PER_BUCKET": self.replication_max_failures_per_bucket,
"SITE_SYNC_CONNECT_TIMEOUT_SECONDS": self.site_sync_connect_timeout_seconds,
"SITE_SYNC_READ_TIMEOUT_SECONDS": self.site_sync_read_timeout_seconds,
"SITE_SYNC_MAX_RETRIES": self.site_sync_max_retries,
"SITE_SYNC_CLOCK_SKEW_TOLERANCE_SECONDS": self.site_sync_clock_skew_tolerance_seconds,
"OBJECT_KEY_MAX_LENGTH_BYTES": self.object_key_max_length_bytes,
"OBJECT_CACHE_MAX_SIZE": self.object_cache_max_size,
"META_READ_CACHE_MAX": self.meta_read_cache_max,
"BUCKET_CONFIG_CACHE_TTL_SECONDS": self.bucket_config_cache_ttl_seconds,
"OBJECT_TAG_LIMIT": self.object_tag_limit,
"ENCRYPTION_CHUNK_SIZE_BYTES": self.encryption_chunk_size_bytes,
"KMS_GENERATE_DATA_KEY_MIN_BYTES": self.kms_generate_data_key_min_bytes,
"KMS_GENERATE_DATA_KEY_MAX_BYTES": self.kms_generate_data_key_max_bytes,
"LIFECYCLE_MAX_HISTORY_PER_BUCKET": self.lifecycle_max_history_per_bucket,
"SITE_ID": self.site_id,
"SITE_ENDPOINT": self.site_endpoint,
"SITE_REGION": self.site_region,
"SITE_PRIORITY": self.site_priority,
"RATE_LIMIT_ADMIN": self.ratelimit_admin,
"NUM_TRUSTED_PROXIES": self.num_trusted_proxies,
"ALLOWED_REDIRECT_HOSTS": self.allowed_redirect_hosts,
"ALLOW_INTERNAL_ENDPOINTS": self.allow_internal_endpoints,
"WEBSITE_HOSTING_ENABLED": self.website_hosting_enabled,
"GC_ENABLED": self.gc_enabled,
"GC_INTERVAL_HOURS": self.gc_interval_hours,
"GC_TEMP_FILE_MAX_AGE_HOURS": self.gc_temp_file_max_age_hours,
"GC_MULTIPART_MAX_AGE_DAYS": self.gc_multipart_max_age_days,
"GC_LOCK_FILE_MAX_AGE_HOURS": self.gc_lock_file_max_age_hours,
"GC_DRY_RUN": self.gc_dry_run,
"GC_IO_THROTTLE_MS": self.gc_io_throttle_ms,
"INTEGRITY_ENABLED": self.integrity_enabled,
"INTEGRITY_INTERVAL_HOURS": self.integrity_interval_hours,
"INTEGRITY_BATCH_SIZE": self.integrity_batch_size,
"INTEGRITY_AUTO_HEAL": self.integrity_auto_heal,
"INTEGRITY_DRY_RUN": self.integrity_dry_run,
"INTEGRITY_IO_THROTTLE_MS": self.integrity_io_throttle_ms,
}

View File

@@ -1,4 +1,3 @@
"""Manage remote S3 connections."""
from __future__ import annotations
import json

293
app/encrypted_storage.py Normal file
View File

@@ -0,0 +1,293 @@
from __future__ import annotations
import io
from pathlib import Path
from typing import Any, BinaryIO, Dict, Optional
from .encryption import EncryptionManager, EncryptionMetadata, EncryptionError
from .storage import ObjectStorage, ObjectMeta, StorageError
class EncryptedObjectStorage:
"""Object storage with transparent server-side encryption.
This class wraps ObjectStorage and provides transparent encryption/decryption
of objects based on bucket encryption configuration.
Encryption is applied when:
1. Bucket has default encryption configured (SSE-S3 or SSE-KMS)
2. Client explicitly requests encryption via headers
The encryption metadata is stored alongside object metadata.
"""
STREAMING_THRESHOLD = 64 * 1024
def __init__(self, storage: ObjectStorage, encryption_manager: EncryptionManager):
self.storage = storage
self.encryption = encryption_manager
@property
def root(self) -> Path:
return self.storage.root
def _should_encrypt(self, bucket_name: str,
server_side_encryption: str | None = None) -> tuple[bool, str, str | None]:
"""Determine if object should be encrypted.
Returns:
Tuple of (should_encrypt, algorithm, kms_key_id)
"""
if not self.encryption.enabled:
return False, "", None
if server_side_encryption:
if server_side_encryption == "AES256":
return True, "AES256", None
elif server_side_encryption.startswith("aws:kms"):
parts = server_side_encryption.split(":")
kms_key_id = parts[2] if len(parts) > 2 else None
return True, "aws:kms", kms_key_id
try:
encryption_config = self.storage.get_bucket_encryption(bucket_name)
if encryption_config and encryption_config.get("Rules"):
rule = encryption_config["Rules"][0]
# AWS format: Rules[].ApplyServerSideEncryptionByDefault.SSEAlgorithm
sse_default = rule.get("ApplyServerSideEncryptionByDefault", {})
algorithm = sse_default.get("SSEAlgorithm", "AES256")
kms_key_id = sse_default.get("KMSMasterKeyID")
return True, algorithm, kms_key_id
except StorageError:
pass
return False, "", None
def _is_encrypted(self, metadata: Dict[str, str]) -> bool:
"""Check if object is encrypted based on its metadata."""
return "x-amz-server-side-encryption" in metadata
def put_object(
self,
bucket_name: str,
object_key: str,
stream: BinaryIO,
*,
metadata: Optional[Dict[str, str]] = None,
server_side_encryption: Optional[str] = None,
kms_key_id: Optional[str] = None,
) -> ObjectMeta:
"""Store an object, optionally with encryption.
Args:
bucket_name: Name of the bucket
object_key: Key for the object
stream: Binary stream of object data
metadata: Optional user metadata
server_side_encryption: Encryption algorithm ("AES256" or "aws:kms")
kms_key_id: KMS key ID (for aws:kms encryption)
Returns:
ObjectMeta with object information
Performance: Uses streaming encryption for large files to reduce memory usage.
"""
should_encrypt, algorithm, detected_kms_key = self._should_encrypt(
bucket_name, server_side_encryption
)
if kms_key_id is None:
kms_key_id = detected_kms_key
if should_encrypt:
try:
# Performance: Use streaming encryption to avoid loading entire file into memory
encrypted_stream, enc_metadata = self.encryption.encrypt_stream(
stream,
algorithm=algorithm,
context={"bucket": bucket_name, "key": object_key},
)
combined_metadata = metadata.copy() if metadata else {}
combined_metadata.update(enc_metadata.to_dict())
result = self.storage.put_object(
bucket_name,
object_key,
encrypted_stream,
metadata=combined_metadata,
)
result.metadata = combined_metadata
return result
except EncryptionError as exc:
raise StorageError(f"Encryption failed: {exc}") from exc
else:
return self.storage.put_object(
bucket_name,
object_key,
stream,
metadata=metadata,
)
def get_object_data(self, bucket_name: str, object_key: str) -> tuple[bytes, Dict[str, str]]:
    """Read an object fully into memory, decrypting it when it is encrypted.

    Returns:
        Tuple of (data, metadata). The returned metadata omits every key that
        starts with ``x-amz-encryption`` as well as ``x-amz-encrypted-data-key``.

    Raises:
        StorageError: if decryption raises EncryptionError.
    """
    path = self.storage.get_object_path(bucket_name, object_key)
    metadata = self.storage.get_object_metadata(bucket_name, object_key)
    enc_metadata = EncryptionMetadata.from_dict(metadata)

    if not enc_metadata:
        # No usable encryption metadata: the stored bytes are returned as-is.
        with path.open("rb") as raw_file:
            data = raw_file.read()
    else:
        try:
            with path.open("rb") as cipher_file:
                plaintext_stream = self.encryption.decrypt_stream(cipher_file, enc_metadata)
                data = plaintext_stream.read()
        except EncryptionError as exc:
            raise StorageError(f"Decryption failed: {exc}") from exc

    # Strip internal key material before handing metadata back to the caller.
    clean_metadata = {}
    for name, value in metadata.items():
        if name.startswith("x-amz-encryption") or name == "x-amz-encrypted-data-key":
            continue
        clean_metadata[name] = value
    return data, clean_metadata
def get_object_stream(self, bucket_name: str, object_key: str) -> tuple[BinaryIO, Dict[str, str], int]:
    """Return the (decrypted) object as an in-memory stream.

    The payload is obtained through ``get_object_data`` and wrapped in a
    ``BytesIO``, so the whole object is held in memory.

    Returns:
        Tuple of (stream, metadata, original_size)
    """
    payload, object_metadata = self.get_object_data(bucket_name, object_key)
    buffer = io.BytesIO(payload)
    return buffer, object_metadata, len(payload)
def list_buckets(self):
    """Pass-through to ``self.storage.list_buckets()``."""
    return self.storage.list_buckets()
def bucket_exists(self, bucket_name: str) -> bool:
    """Pass-through to ``self.storage.bucket_exists``."""
    return self.storage.bucket_exists(bucket_name)
def create_bucket(self, bucket_name: str) -> None:
    """Pass-through to ``self.storage.create_bucket``."""
    return self.storage.create_bucket(bucket_name)
def delete_bucket(self, bucket_name: str) -> None:
    """Pass-through to ``self.storage.delete_bucket``."""
    return self.storage.delete_bucket(bucket_name)
def bucket_stats(self, bucket_name: str, cache_ttl: int = 60):
    """Pass-through to ``self.storage.bucket_stats``; ``cache_ttl`` is forwarded positionally."""
    return self.storage.bucket_stats(bucket_name, cache_ttl)
def list_objects(self, bucket_name: str, **kwargs):
    """Pass-through to ``self.storage.list_objects``; keyword arguments are forwarded unchanged."""
    return self.storage.list_objects(bucket_name, **kwargs)
def list_objects_shallow(self, bucket_name: str, **kwargs):
    """Pass-through to ``self.storage.list_objects_shallow``; keyword arguments are forwarded unchanged."""
    return self.storage.list_objects_shallow(bucket_name, **kwargs)
def iter_objects_shallow(self, bucket_name: str, **kwargs):
    """Pass-through to ``self.storage.iter_objects_shallow``; keyword arguments are forwarded unchanged."""
    return self.storage.iter_objects_shallow(bucket_name, **kwargs)
def search_objects(self, bucket_name: str, query: str, **kwargs):
    """Pass-through to ``self.storage.search_objects``; keyword arguments are forwarded unchanged."""
    return self.storage.search_objects(bucket_name, query, **kwargs)
def list_objects_all(self, bucket_name: str):
    """Pass-through to ``self.storage.list_objects_all``."""
    return self.storage.list_objects_all(bucket_name)
def get_object_path(self, bucket_name: str, object_key: str):
    """Pass-through to ``self.storage.get_object_path``; no decryption happens here."""
    return self.storage.get_object_path(bucket_name, object_key)
def get_object_metadata(self, bucket_name: str, object_key: str):
    """Pass-through to ``self.storage.get_object_metadata``; the result is returned unfiltered."""
    return self.storage.get_object_metadata(bucket_name, object_key)
def delete_object(self, bucket_name: str, object_key: str) -> None:
    """Pass-through to ``self.storage.delete_object``."""
    return self.storage.delete_object(bucket_name, object_key)
def purge_object(self, bucket_name: str, object_key: str) -> None:
    """Pass-through to ``self.storage.purge_object``."""
    return self.storage.purge_object(bucket_name, object_key)
def is_versioning_enabled(self, bucket_name: str) -> bool:
    """Pass-through to ``self.storage.is_versioning_enabled``."""
    return self.storage.is_versioning_enabled(bucket_name)
def set_bucket_versioning(self, bucket_name: str, enabled: bool) -> None:
    """Pass-through to ``self.storage.set_bucket_versioning``."""
    return self.storage.set_bucket_versioning(bucket_name, enabled)
def get_bucket_tags(self, bucket_name: str):
    """Pass-through to ``self.storage.get_bucket_tags``."""
    return self.storage.get_bucket_tags(bucket_name)
def set_bucket_tags(self, bucket_name: str, tags):
    """Pass-through to ``self.storage.set_bucket_tags``."""
    return self.storage.set_bucket_tags(bucket_name, tags)
def get_bucket_cors(self, bucket_name: str):
    """Pass-through to ``self.storage.get_bucket_cors``."""
    return self.storage.get_bucket_cors(bucket_name)
def set_bucket_cors(self, bucket_name: str, rules):
    """Pass-through to ``self.storage.set_bucket_cors``."""
    return self.storage.set_bucket_cors(bucket_name, rules)
def get_bucket_encryption(self, bucket_name: str):
    """Pass-through to ``self.storage.get_bucket_encryption``."""
    return self.storage.get_bucket_encryption(bucket_name)
def set_bucket_encryption(self, bucket_name: str, config_payload):
    """Pass-through to ``self.storage.set_bucket_encryption``."""
    return self.storage.set_bucket_encryption(bucket_name, config_payload)
def get_bucket_lifecycle(self, bucket_name: str):
    """Pass-through to ``self.storage.get_bucket_lifecycle``."""
    return self.storage.get_bucket_lifecycle(bucket_name)
def set_bucket_lifecycle(self, bucket_name: str, rules):
    """Pass-through to ``self.storage.set_bucket_lifecycle``."""
    return self.storage.set_bucket_lifecycle(bucket_name, rules)
def get_object_tags(self, bucket_name: str, object_key: str):
    """Pass-through to ``self.storage.get_object_tags``."""
    return self.storage.get_object_tags(bucket_name, object_key)
def set_object_tags(self, bucket_name: str, object_key: str, tags):
    """Pass-through to ``self.storage.set_object_tags``."""
    return self.storage.set_object_tags(bucket_name, object_key, tags)
def delete_object_tags(self, bucket_name: str, object_key: str):
    """Pass-through to ``self.storage.delete_object_tags``."""
    return self.storage.delete_object_tags(bucket_name, object_key)
def list_object_versions(self, bucket_name: str, object_key: str):
    """Pass-through to ``self.storage.list_object_versions``."""
    return self.storage.list_object_versions(bucket_name, object_key)
def restore_object_version(self, bucket_name: str, object_key: str, version_id: str):
    """Pass-through to ``self.storage.restore_object_version``."""
    return self.storage.restore_object_version(bucket_name, object_key, version_id)
def list_orphaned_objects(self, bucket_name: str):
    """Pass-through to ``self.storage.list_orphaned_objects``."""
    return self.storage.list_orphaned_objects(bucket_name)
def initiate_multipart_upload(self, bucket_name: str, object_key: str, *, metadata=None) -> str:
    """Pass-through to ``self.storage.initiate_multipart_upload``; ``metadata`` is forwarded as given."""
    return self.storage.initiate_multipart_upload(bucket_name, object_key, metadata=metadata)
def upload_multipart_part(self, bucket_name: str, upload_id: str, part_number: int, stream: BinaryIO) -> str:
    """Pass-through to ``self.storage.upload_multipart_part``; the part stream is forwarded as given."""
    return self.storage.upload_multipart_part(bucket_name, upload_id, part_number, stream)
def complete_multipart_upload(self, bucket_name: str, upload_id: str, ordered_parts):
    """Pass-through to ``self.storage.complete_multipart_upload``."""
    return self.storage.complete_multipart_upload(bucket_name, upload_id, ordered_parts)
def abort_multipart_upload(self, bucket_name: str, upload_id: str) -> None:
    """Pass-through to ``self.storage.abort_multipart_upload``."""
    return self.storage.abort_multipart_upload(bucket_name, upload_id)
def list_multipart_parts(self, bucket_name: str, upload_id: str):
    """Pass-through to ``self.storage.list_multipart_parts``."""
    return self.storage.list_multipart_parts(bucket_name, upload_id)
def get_bucket_quota(self, bucket_name: str):
    """Pass-through to ``self.storage.get_bucket_quota``."""
    return self.storage.get_bucket_quota(bucket_name)
def set_bucket_quota(self, bucket_name: str, *, max_bytes=None, max_objects=None):
    """Pass-through to ``self.storage.set_bucket_quota``; both limits are forwarded by keyword."""
    return self.storage.set_bucket_quota(bucket_name, max_bytes=max_bytes, max_objects=max_objects)
def get_bucket_website(self, bucket_name: str):
    """Pass-through to ``self.storage.get_bucket_website``."""
    return self.storage.get_bucket_website(bucket_name)
def set_bucket_website(self, bucket_name: str, website_config):
    """Pass-through to ``self.storage.set_bucket_website``."""
    return self.storage.set_bucket_website(bucket_name, website_config)
def _compute_etag(self, path: Path) -> str:
    """Pass-through to the wrapped storage's private ``_compute_etag`` for ``path``."""
    return self.storage._compute_etag(path)

653
app/encryption.py Normal file
View File

@@ -0,0 +1,653 @@
from __future__ import annotations
import base64
import io
import json
import logging
import os
import secrets
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, BinaryIO, Dict, Generator, Optional
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from cryptography.hazmat.primitives.kdf.hkdf import HKDF
from cryptography.hazmat.primitives import hashes
# fcntl is only imported off Windows; the master-key lock uses msvcrt on win32.
if sys.platform != "win32":
    import fcntl
# Optional native extension. It is used only when it exposes both chunked
# stream functions; otherwise it is treated exactly like a missing module.
try:
    import myfsio_core as _rc
    if not all(hasattr(_rc, f) for f in (
        "encrypt_stream_chunked", "decrypt_stream_chunked",
    )):
        raise ImportError("myfsio_core is outdated, rebuild with: cd myfsio_core && maturin develop --release")
    _HAS_RUST = True
except ImportError:
    # Fall back to the pure-Python code paths.
    _rc = None
    _HAS_RUST = False
logger = logging.getLogger(__name__)
def _set_secure_file_permissions(file_path: Path) -> None:
"""Set restrictive file permissions (owner read/write only)."""
if sys.platform == "win32":
try:
username = os.environ.get("USERNAME", "")
if username:
subprocess.run(
["icacls", str(file_path), "/inheritance:r",
"/grant:r", f"{username}:F"],
check=True, capture_output=True
)
else:
logger.warning("Could not set secure permissions on %s: USERNAME not set", file_path)
except (subprocess.SubprocessError, OSError) as exc:
logger.warning("Failed to set secure permissions on %s: %s", file_path, exc)
else:
os.chmod(file_path, 0o600)
class EncryptionError(Exception):
    """Raised when encryption/decryption fails.

    Raised by the providers and helpers in this module for failures such as
    an unreadable master key, an invalid wrapped data key, a malformed
    encrypted stream, or an authentication failure during decryption.
    """
@dataclass
class EncryptionResult:
    """Result of encrypting data."""
    # Output of the AEAD encrypt call.
    ciphertext: bytes
    # Nonce used for this encryption; must be supplied again to decrypt.
    nonce: bytes
    # Identifier of the key/provider that produced the ciphertext.
    key_id: str
    # Wrapped data key; empty for providers that do not wrap a data key.
    encrypted_data_key: bytes
@dataclass
class EncryptionMetadata:
    """Encryption parameters persisted next to an encrypted object.

    ``to_dict`` / ``from_dict`` convert between this dataclass and the
    string-only header-style mapping; binary fields are base64 encoded.
    """
    algorithm: str
    key_id: str
    nonce: bytes
    encrypted_data_key: bytes

    def to_dict(self) -> Dict[str, str]:
        """Serialize to the string mapping stored with the object."""
        nonce_b64 = base64.b64encode(self.nonce).decode()
        wrapped_key_b64 = base64.b64encode(self.encrypted_data_key).decode()
        return {
            "x-amz-server-side-encryption": self.algorithm,
            "x-amz-encryption-key-id": self.key_id,
            "x-amz-encryption-nonce": nonce_b64,
            "x-amz-encrypted-data-key": wrapped_key_b64,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, str]) -> Optional["EncryptionMetadata"]:
        """Rebuild metadata from a stored mapping.

        Returns None when the algorithm entry is missing or empty, and also
        when any field fails to decode.
        """
        algorithm = data.get("x-amz-server-side-encryption")
        if not algorithm:
            return None
        try:
            key_id = data.get("x-amz-encryption-key-id", "local")
            nonce = base64.b64decode(data.get("x-amz-encryption-nonce", ""))
            wrapped_key = base64.b64decode(data.get("x-amz-encrypted-data-key", ""))
            return cls(
                algorithm=algorithm,
                key_id=key_id,
                nonce=nonce,
                encrypted_data_key=wrapped_key,
            )
        except Exception:
            return None
class EncryptionProvider:
    """Base class for encryption providers.

    Every method here raises NotImplementedError; concrete providers
    override the operations they support.
    """
    def encrypt(self, plaintext: bytes, context: Dict[str, str] | None = None) -> EncryptionResult:
        """Encrypt ``plaintext`` and return the ciphertext with its key material.

        Args:
            plaintext: Data to encrypt
            context: Optional string mapping supplied alongside the data
        """
        raise NotImplementedError
    def decrypt(self, ciphertext: bytes, nonce: bytes, encrypted_data_key: bytes,
                key_id: str, context: Dict[str, str] | None = None) -> bytes:
        """Decrypt ``ciphertext`` produced by ``encrypt`` and return the plaintext.

        Args:
            ciphertext: Encrypted data
            nonce: Nonce that was used for encryption
            encrypted_data_key: Wrapped data key stored with the object
            key_id: Identifier of the key that produced the ciphertext
            context: Optional string mapping supplied alongside the data
        """
        raise NotImplementedError
    def generate_data_key(self) -> tuple[bytes, bytes]:
        """Generate a data key and its encrypted form.
        Returns:
        Tuple of (plaintext_key, encrypted_key)
        """
        raise NotImplementedError
    def decrypt_data_key(self, encrypted_data_key: bytes, key_id: str | None = None) -> bytes:
        """Decrypt an encrypted data key.
        Args:
        encrypted_data_key: The encrypted data key bytes
        key_id: Optional key identifier (used by KMS providers)
        Returns:
        The decrypted data key
        """
        raise NotImplementedError
class LocalKeyEncryption(EncryptionProvider):
    """SSE-S3 style encryption using a local master key.

    Uses envelope encryption:
    1. Generate a unique data key for each object
    2. Encrypt the data with the data key (AES-256-GCM)
    3. Encrypt the data key with the master key
    4. Store the encrypted data key alongside the ciphertext

    The master key is read lazily from ``master_key_path`` on first use; if
    the file does not exist a new 32-byte key is generated and saved there
    base64-encoded.
    """

    KEY_ID = "local"
    # Associated data authenticated together with every wrapped data key.
    DATA_KEY_AAD = b'{"purpose":"data_key","version":1}'
    # Key sizes (in bytes) that AES-GCM accepts.
    _VALID_KEY_LENGTHS = (16, 24, 32)

    def __init__(self, master_key_path: Path):
        """Remember where the master key lives; nothing is read until first use."""
        self.master_key_path = master_key_path
        self._master_key: bytes | None = None

    @property
    def master_key(self) -> bytes:
        """The master key bytes, loaded (or created) once and then cached."""
        if self._master_key is None:
            self._master_key = self._load_or_create_master_key()
        return self._master_key

    def _read_master_key(self) -> bytes:
        """Read and validate the existing master key file.

        Raises:
            EncryptionError: if the file cannot be read or decoded, or if the
                decoded key is not a valid AES key length.
        """
        try:
            key = base64.b64decode(self.master_key_path.read_text().strip())
        except Exception as exc:
            raise EncryptionError(f"Failed to load master key: {exc}") from exc
        # Fail here with a clear error instead of a later ValueError from AESGCM
        # (e.g. when the key file is empty or truncated).
        if len(key) not in self._VALID_KEY_LENGTHS:
            raise EncryptionError(
                f"Failed to load master key: unexpected key length ({len(key)} bytes)"
            )
        return key

    def _create_master_key(self) -> bytes:
        """Generate a new 32-byte master key and persist it base64-encoded.

        Raises:
            EncryptionError: if the key file cannot be written or secured.
        """
        key = secrets.token_bytes(32)
        try:
            # Create the file with owner-only permissions from the start so the
            # key is never readable under the default umask, not even briefly.
            fd = os.open(
                self.master_key_path,
                os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
                0o600,
            )
            with os.fdopen(fd, "w") as key_file:
                key_file.write(base64.b64encode(key).decode())
            # Still applied afterwards: this is what restricts access on Windows.
            _set_secure_file_permissions(self.master_key_path)
        except OSError as exc:
            raise EncryptionError(f"Failed to save master key: {exc}") from exc
        return key

    def _load_or_create_master_key(self) -> bytes:
        """Load master key from file or generate a new one (with file locking).

        An exclusive lock on a sibling ``.lock`` file is held while the key
        file is checked, read or created.

        Raises:
            EncryptionError: on load/save failures, or if the lock file cannot
                be opened or locked.
        """
        lock_path = self.master_key_path.with_suffix(".lock")
        lock_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(lock_path, "w") as lock_file:
                if sys.platform == "win32":
                    import msvcrt
                    msvcrt.locking(lock_file.fileno(), msvcrt.LK_LOCK, 1)
                else:
                    fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
                try:
                    if self.master_key_path.exists():
                        return self._read_master_key()
                    return self._create_master_key()
                finally:
                    if sys.platform == "win32":
                        import msvcrt
                        msvcrt.locking(lock_file.fileno(), msvcrt.LK_UNLCK, 1)
                    else:
                        fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
        except OSError as exc:
            raise EncryptionError(f"Failed to acquire lock for master key: {exc}") from exc

    def _encrypt_data_key(self, data_key: bytes) -> bytes:
        """Encrypt the data key with the master key.

        Returns the 12-byte nonce followed by the AES-GCM output.
        """
        aesgcm = AESGCM(self.master_key)
        nonce = secrets.token_bytes(12)
        encrypted = aesgcm.encrypt(nonce, data_key, self.DATA_KEY_AAD)
        return nonce + encrypted

    def _decrypt_data_key(self, encrypted_data_key: bytes) -> bytes:
        """Decrypt the data key using the master key.

        Raises:
            EncryptionError: if the blob is too short or cannot be decrypted.
        """
        if len(encrypted_data_key) < 12 + 32 + 16:  # nonce + key + tag
            raise EncryptionError("Invalid encrypted data key")
        aesgcm = AESGCM(self.master_key)
        nonce = encrypted_data_key[:12]
        ciphertext = encrypted_data_key[12:]
        try:
            return aesgcm.decrypt(nonce, ciphertext, self.DATA_KEY_AAD)
        except Exception:
            # Retry without associated data.
            # NOTE(review): presumably for data keys wrapped before the AAD was introduced - confirm.
            try:
                return aesgcm.decrypt(nonce, ciphertext, None)
            except Exception as exc:
                raise EncryptionError(f"Failed to decrypt data key: {exc}") from exc

    def decrypt_data_key(self, encrypted_data_key: bytes, key_id: str | None = None) -> bytes:
        """Decrypt an encrypted data key (key_id ignored for local encryption)."""
        return self._decrypt_data_key(encrypted_data_key)

    def generate_data_key(self) -> tuple[bytes, bytes]:
        """Generate a data key and its encrypted form.

        Returns:
            Tuple of (plaintext_key, encrypted_key); the plaintext key is 32
            random bytes.
        """
        plaintext_key = secrets.token_bytes(32)
        encrypted_key = self._encrypt_data_key(plaintext_key)
        return plaintext_key, encrypted_key

    def encrypt(self, plaintext: bytes, context: Dict[str, str] | None = None) -> EncryptionResult:
        """Encrypt data using envelope encryption.

        A non-empty ``context`` is serialized as sorted-key JSON and used as
        associated data, so the same context is required to decrypt.
        """
        data_key, encrypted_data_key = self.generate_data_key()
        aesgcm = AESGCM(data_key)
        nonce = secrets.token_bytes(12)
        aad = json.dumps(context, sort_keys=True).encode() if context else None
        ciphertext = aesgcm.encrypt(nonce, plaintext, aad)
        return EncryptionResult(
            ciphertext=ciphertext,
            nonce=nonce,
            key_id=self.KEY_ID,
            encrypted_data_key=encrypted_data_key,
        )

    def decrypt(self, ciphertext: bytes, nonce: bytes, encrypted_data_key: bytes,
                key_id: str, context: Dict[str, str] | None = None) -> bytes:
        """Decrypt data using envelope encryption.

        Raises:
            EncryptionError: if the data key or the payload cannot be decrypted.
        """
        data_key = self._decrypt_data_key(encrypted_data_key)
        aesgcm = AESGCM(data_key)
        aad = json.dumps(context, sort_keys=True).encode() if context else None
        try:
            return aesgcm.decrypt(nonce, ciphertext, aad)
        except Exception as exc:
            raise EncryptionError("Failed to decrypt data") from exc
class StreamingEncryptor:
    """Encrypts/decrypts data chunk by chunk.

    Each chunk is encrypted with the same data key but a unique nonce derived
    from the base nonce + chunk index.

    Encrypted layout: a 4-byte big-endian chunk count, then for every chunk a
    ``HEADER_SIZE``-byte big-endian length followed by that many bytes of
    AES-GCM output.
    """
    CHUNK_SIZE = 64 * 1024
    HEADER_SIZE = 4

    def __init__(self, provider: EncryptionProvider, chunk_size: int = CHUNK_SIZE):
        """Bind the provider that supplies data keys and the plaintext chunk size."""
        self.provider = provider
        self.chunk_size = chunk_size

    def _derive_chunk_nonce(self, base_nonce: bytes, chunk_index: int) -> bytes:
        """Derive a unique nonce for each chunk using HKDF."""
        hkdf = HKDF(
            algorithm=hashes.SHA256(),
            length=12,
            salt=base_nonce,
            info=chunk_index.to_bytes(4, "big"),
        )
        return hkdf.derive(b"chunk_nonce")

    def _build_metadata(self, base_nonce: bytes, encrypted_data_key: bytes) -> EncryptionMetadata:
        """Describe a finished encryption; key id falls back to "local" if the provider has none."""
        return EncryptionMetadata(
            algorithm="AES256",
            key_id=getattr(self.provider, "KEY_ID", "local"),
            nonce=base_nonce,
            encrypted_data_key=encrypted_data_key,
        )

    def _encrypt_chunks(self, source: BinaryIO, sink: BinaryIO, aesgcm: AESGCM,
                        base_nonce: bytes) -> None:
        """Write the framed, encrypted form of ``source`` to ``sink`` (must be seekable)."""
        sink.write(b"\x00\x00\x00\x00")  # Placeholder for chunk count
        chunk_index = 0
        while True:
            chunk = source.read(self.chunk_size)
            if not chunk:
                break
            chunk_nonce = self._derive_chunk_nonce(base_nonce, chunk_index)
            encrypted_chunk = aesgcm.encrypt(chunk_nonce, chunk, None)
            sink.write(len(encrypted_chunk).to_bytes(self.HEADER_SIZE, "big"))
            sink.write(encrypted_chunk)
            chunk_index += 1
        # Go back and fill in the real chunk count now that it is known.
        sink.seek(0)
        sink.write(chunk_index.to_bytes(4, "big"))

    def _read_chunk_count(self, source: BinaryIO) -> int:
        """Read the leading 4-byte chunk count; raise EncryptionError if it is missing."""
        chunk_count_bytes = source.read(4)
        if len(chunk_count_bytes) < 4:
            raise EncryptionError("Invalid encrypted stream: missing header")
        return int.from_bytes(chunk_count_bytes, "big")

    def _decrypt_chunks(self, source: BinaryIO, sink: BinaryIO, aesgcm: AESGCM,
                        base_nonce: bytes, chunk_count: int) -> None:
        """Decrypt ``chunk_count`` framed chunks from ``source`` into ``sink``.

        Raises:
            EncryptionError: on a truncated frame or a chunk that fails to decrypt.
        """
        for chunk_index in range(chunk_count):
            size_bytes = source.read(self.HEADER_SIZE)
            if len(size_bytes) < self.HEADER_SIZE:
                raise EncryptionError(f"Invalid encrypted stream: truncated at chunk {chunk_index}")
            chunk_size = int.from_bytes(size_bytes, "big")
            encrypted_chunk = source.read(chunk_size)
            if len(encrypted_chunk) < chunk_size:
                raise EncryptionError(f"Invalid encrypted stream: incomplete chunk {chunk_index}")
            chunk_nonce = self._derive_chunk_nonce(base_nonce, chunk_index)
            try:
                decrypted_chunk = aesgcm.decrypt(chunk_nonce, encrypted_chunk, None)
                sink.write(decrypted_chunk)
            except Exception as exc:
                raise EncryptionError(f"Failed to decrypt chunk {chunk_index}: {exc}") from exc

    def encrypt_stream(self, stream: BinaryIO,
                       context: Dict[str, str] | None = None) -> tuple[BinaryIO, EncryptionMetadata]:
        """Encrypt a stream and return encrypted stream + metadata.

        The ciphertext is collected in an in-memory ``BytesIO`` that is
        returned rewound to position 0. ``context`` is accepted but is not
        used as associated data (chunks are encrypted with no AAD).
        """
        data_key, encrypted_data_key = self.provider.generate_data_key()
        base_nonce = secrets.token_bytes(12)
        aesgcm = AESGCM(data_key)
        output = io.BytesIO()
        self._encrypt_chunks(stream, output, aesgcm, base_nonce)
        output.seek(0)
        return output, self._build_metadata(base_nonce, encrypted_data_key)

    def decrypt_stream(self, stream: BinaryIO, metadata: EncryptionMetadata) -> BinaryIO:
        """Decrypt a stream using the provided metadata.

        Returns an in-memory ``BytesIO`` holding the plaintext, rewound to 0.

        Raises:
            EncryptionError: if the stream is malformed or a chunk fails to decrypt.
        """
        data_key = self.provider.decrypt_data_key(metadata.encrypted_data_key, metadata.key_id)
        aesgcm = AESGCM(data_key)
        chunk_count = self._read_chunk_count(stream)
        output = io.BytesIO()
        self._decrypt_chunks(stream, output, aesgcm, metadata.nonce, chunk_count)
        output.seek(0)
        return output

    def encrypt_file(self, input_path: str, output_path: str) -> EncryptionMetadata:
        """Encrypt ``input_path`` into ``output_path`` and return the metadata.

        Uses the native extension when it is available, otherwise the
        pure-Python chunk loop.
        """
        data_key, encrypted_data_key = self.provider.generate_data_key()
        base_nonce = secrets.token_bytes(12)
        if _HAS_RUST:
            _rc.encrypt_stream_chunked(
                input_path, output_path, data_key, base_nonce, self.chunk_size
            )
        else:
            with open(input_path, "rb") as stream:
                aesgcm = AESGCM(data_key)
                with open(output_path, "wb") as out:
                    self._encrypt_chunks(stream, out, aesgcm, base_nonce)
        return self._build_metadata(base_nonce, encrypted_data_key)

    def decrypt_file(self, input_path: str, output_path: str,
                     metadata: EncryptionMetadata) -> None:
        """Decrypt ``input_path`` into ``output_path`` using ``metadata``.

        Uses the native extension when it is available, otherwise the
        pure-Python chunk loop. In the Python path the output file is only
        opened after the input header has been read successfully.
        """
        data_key = self.provider.decrypt_data_key(metadata.encrypted_data_key, metadata.key_id)
        base_nonce = metadata.nonce
        if _HAS_RUST:
            _rc.decrypt_stream_chunked(input_path, output_path, data_key, base_nonce)
        else:
            with open(input_path, "rb") as stream:
                chunk_count = self._read_chunk_count(stream)
                aesgcm = AESGCM(data_key)
                with open(output_path, "wb") as out:
                    self._decrypt_chunks(stream, out, aesgcm, base_nonce, chunk_count)
class EncryptionManager:
    """Front door for encryption: selects providers and runs encrypt/decrypt calls.

    Providers and the streaming encryptor are created lazily on first use and
    then reused.
    """

    def __init__(self, config: Dict[str, Any]):
        """Store the configuration mapping; no provider is created yet."""
        self.config = config
        self._local_provider: LocalKeyEncryption | None = None
        self._kms_provider: Any = None  # Set by KMS module
        self._streaming_encryptor: StreamingEncryptor | None = None

    @property
    def enabled(self) -> bool:
        """Value of ``encryption_enabled`` in the config (False when absent)."""
        return self.config.get("encryption_enabled", False)

    @property
    def default_algorithm(self) -> str:
        """Value of ``default_encryption_algorithm`` in the config ("AES256" when absent)."""
        return self.config.get("default_encryption_algorithm", "AES256")

    def get_local_provider(self) -> LocalKeyEncryption:
        """Return the shared local-key provider, building it on first call."""
        if self._local_provider is not None:
            return self._local_provider
        configured_path = self.config.get(
            "encryption_master_key_path", "data/.myfsio.sys/keys/master.key"
        )
        self._local_provider = LocalKeyEncryption(Path(configured_path))
        return self._local_provider

    def set_kms_provider(self, kms_provider: Any) -> None:
        """Set the KMS provider (injected from kms module)."""
        self._kms_provider = kms_provider

    def get_provider(self, algorithm: str, kms_key_id: str | None = None) -> EncryptionProvider:
        """Return the provider for ``algorithm``.

        Raises:
            EncryptionError: for an unknown algorithm, or for "aws:kms" when
                no KMS provider has been set.
        """
        if algorithm == "AES256":
            return self.get_local_provider()
        if algorithm != "aws:kms":
            raise EncryptionError(f"Unsupported encryption algorithm: {algorithm}")
        if self._kms_provider is None:
            raise EncryptionError("KMS is not configured")
        return self._kms_provider.get_provider(kms_key_id)

    def get_streaming_encryptor(self) -> StreamingEncryptor:
        """Return the shared streaming encryptor (backed by the local provider)."""
        if self._streaming_encryptor is not None:
            return self._streaming_encryptor
        configured_chunk = self.config.get("encryption_chunk_size_bytes", 64 * 1024)
        self._streaming_encryptor = StreamingEncryptor(
            self.get_local_provider(), chunk_size=configured_chunk
        )
        return self._streaming_encryptor

    def encrypt_object(self, data: bytes, algorithm: str = "AES256",
                       kms_key_id: str | None = None,
                       context: Dict[str, str] | None = None) -> tuple[bytes, EncryptionMetadata]:
        """Encrypt ``data`` in one shot; returns (ciphertext, metadata)."""
        outcome = self.get_provider(algorithm, kms_key_id).encrypt(data, context)
        described = EncryptionMetadata(
            algorithm=algorithm,
            key_id=outcome.key_id,
            nonce=outcome.nonce,
            encrypted_data_key=outcome.encrypted_data_key,
        )
        return outcome.ciphertext, described

    def decrypt_object(self, ciphertext: bytes, metadata: EncryptionMetadata,
                       context: Dict[str, str] | None = None) -> bytes:
        """Decrypt ``ciphertext`` with the provider named by ``metadata``."""
        chosen = self.get_provider(metadata.algorithm, metadata.key_id)
        return chosen.decrypt(
            ciphertext,
            metadata.nonce,
            metadata.encrypted_data_key,
            metadata.key_id,
            context,
        )

    def encrypt_stream(self, stream: BinaryIO, algorithm: str = "AES256",
                       context: Dict[str, str] | None = None) -> tuple[BinaryIO, EncryptionMetadata]:
        """Encrypt a stream with the shared streaming encryptor.

        NOTE(review): ``algorithm`` is accepted but not consulted here.
        """
        return self.get_streaming_encryptor().encrypt_stream(stream, context)

    def decrypt_stream(self, stream: BinaryIO, metadata: EncryptionMetadata) -> BinaryIO:
        """Decrypt a stream with the shared streaming encryptor."""
        return self.get_streaming_encryptor().decrypt_stream(stream, metadata)
class SSECEncryption(EncryptionProvider):
    """SSE-C: Server-Side Encryption with Customer-Provided Keys.

    The client provides the encryption key with each request.
    Server encrypts/decrypts but never stores the key.

    Required headers for PUT:
    - x-amz-server-side-encryption-customer-algorithm: AES256
    - x-amz-server-side-encryption-customer-key: Base64-encoded 256-bit key
    - x-amz-server-side-encryption-customer-key-MD5: Base64-encoded MD5 of key
    """
    KEY_ID = "customer-provided"

    def __init__(self, customer_key: bytes):
        """Wrap a raw 32-byte customer key.

        Raises:
            EncryptionError: if the key is not exactly 32 bytes.
        """
        if len(customer_key) != 32:
            raise EncryptionError("Customer key must be exactly 256 bits (32 bytes)")
        self.customer_key = customer_key

    @classmethod
    def from_headers(cls, headers: Dict[str, str]) -> "SSECEncryption":
        """Build a provider from SSE-C request headers.

        Header names are looked up in lower case. The key MD5 is verified
        only when that header is present.

        Raises:
            EncryptionError: for an unsupported algorithm, a missing or
                undecodable key, a wrong key length, or an MD5 mismatch.
        """
        algorithm = headers.get("x-amz-server-side-encryption-customer-algorithm", "")
        if algorithm.upper() != "AES256":
            raise EncryptionError(f"Unsupported SSE-C algorithm: {algorithm}. Only AES256 is supported.")
        key_b64 = headers.get("x-amz-server-side-encryption-customer-key", "")
        if not key_b64:
            raise EncryptionError("Missing x-amz-server-side-encryption-customer-key header")
        key_md5_b64 = headers.get("x-amz-server-side-encryption-customer-key-md5", "")
        try:
            customer_key = base64.b64decode(key_b64)
        except Exception as e:
            raise EncryptionError(f"Invalid base64 in customer key: {e}") from e
        if len(customer_key) != 32:
            raise EncryptionError(f"Customer key must be 256 bits, got {len(customer_key) * 8} bits")
        if key_md5_b64:
            import hashlib
            expected_md5 = base64.b64encode(hashlib.md5(customer_key).digest()).decode()
            if key_md5_b64 != expected_md5:
                raise EncryptionError("Customer key MD5 mismatch")
        return cls(customer_key)

    def encrypt(self, plaintext: bytes, context: Dict[str, str] | None = None) -> EncryptionResult:
        """Encrypt ``plaintext`` directly with the customer key.

        A non-empty ``context`` is serialized as sorted-key JSON and used as
        associated data. No wrapped data key is produced.
        """
        aesgcm = AESGCM(self.customer_key)
        nonce = secrets.token_bytes(12)
        aad = json.dumps(context, sort_keys=True).encode() if context else None
        ciphertext = aesgcm.encrypt(nonce, plaintext, aad)
        return EncryptionResult(
            ciphertext=ciphertext,
            nonce=nonce,
            key_id=self.KEY_ID,
            encrypted_data_key=b"",
        )

    def decrypt(self, ciphertext: bytes, nonce: bytes, encrypted_data_key: bytes,
                key_id: str, context: Dict[str, str] | None = None) -> bytes:
        """Decrypt with the customer key; ``encrypted_data_key`` and ``key_id`` are unused.

        Raises:
            EncryptionError: if decryption fails.
        """
        aesgcm = AESGCM(self.customer_key)
        aad = json.dumps(context, sort_keys=True).encode() if context else None
        try:
            return aesgcm.decrypt(nonce, ciphertext, aad)
        except Exception as exc:
            raise EncryptionError("SSE-C decryption failed") from exc

    def generate_data_key(self) -> tuple[bytes, bytes]:
        """Return the customer key as the data key, with an empty wrapped form."""
        return self.customer_key, b""

    def decrypt_data_key(self, encrypted_data_key: bytes, key_id: str | None = None) -> bytes:
        """Return the customer key; the counterpart of ``generate_data_key``.

        Both arguments are ignored because nothing is wrapped for SSE-C.
        Without this override, chunked decryption through this provider would
        hit the base-class NotImplementedError.
        """
        return self.customer_key
@dataclass
class SSECMetadata:
    """SSE-C parameters stored with an object (never the key itself).

    ``to_dict`` / ``from_dict`` convert to and from the string mapping; the
    nonce is base64 encoded.
    """
    algorithm: str = "AES256"
    nonce: bytes = b""
    key_md5: str = ""

    def to_dict(self) -> Dict[str, str]:
        """Serialize to the string mapping stored with the object."""
        nonce_b64 = base64.b64encode(self.nonce).decode()
        return {
            "x-amz-server-side-encryption-customer-algorithm": self.algorithm,
            "x-amz-encryption-nonce": nonce_b64,
            "x-amz-server-side-encryption-customer-key-MD5": self.key_md5,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, str]) -> Optional["SSECMetadata"]:
        """Rebuild from a stored mapping.

        Returns None when the algorithm entry is missing or empty, and also
        when the nonce fails to decode.
        """
        algorithm = data.get("x-amz-server-side-encryption-customer-algorithm")
        if not algorithm:
            return None
        try:
            decoded_nonce = base64.b64decode(data.get("x-amz-encryption-nonce", ""))
            stored_md5 = data.get("x-amz-server-side-encryption-customer-key-MD5", "")
            return cls(algorithm=algorithm, nonce=decoded_nonce, key_md5=stored_md5)
        except Exception:
            return None
class ClientEncryptionHelper:
    """Utilities for clients that encrypt data themselves.

    Offers key generation plus AES-GCM encrypt/decrypt helpers that work on
    base64-encoded keys.
    """

    @staticmethod
    def _context_aad(context: Dict[str, str] | None) -> bytes | None:
        """Serialize a non-empty context as sorted-key JSON bytes; otherwise None."""
        if context:
            return json.dumps(context, sort_keys=True).encode()
        return None

    @staticmethod
    def generate_client_key() -> Dict[str, str]:
        """Create a fresh 32-byte key, returned base64-encoded with algorithm and UTC timestamp."""
        from datetime import datetime, timezone
        raw_key = secrets.token_bytes(32)
        encoded_key = base64.b64encode(raw_key).decode()
        created = datetime.now(timezone.utc).isoformat()
        return {
            "key": encoded_key,
            "algorithm": "AES-256-GCM",
            "created_at": created,
        }

    @staticmethod
    def encrypt_with_key(plaintext: bytes, key_b64: str, context: Dict[str, str] | None = None) -> Dict[str, str]:
        """Encrypt ``plaintext`` with a base64-encoded 32-byte key.

        Returns a mapping with base64 ``ciphertext`` and ``nonce`` plus the
        ``algorithm`` name.

        Raises:
            EncryptionError: if the decoded key is not 32 bytes.
        """
        raw_key = base64.b64decode(key_b64)
        if len(raw_key) != 32:
            raise EncryptionError("Key must be 256 bits (32 bytes)")
        cipher = AESGCM(raw_key)
        fresh_nonce = secrets.token_bytes(12)
        aad = ClientEncryptionHelper._context_aad(context)
        sealed = cipher.encrypt(fresh_nonce, plaintext, aad)
        return {
            "ciphertext": base64.b64encode(sealed).decode(),
            "nonce": base64.b64encode(fresh_nonce).decode(),
            "algorithm": "AES-256-GCM",
        }

    @staticmethod
    def decrypt_with_key(ciphertext_b64: str, nonce_b64: str, key_b64: str, context: Dict[str, str] | None = None) -> bytes:
        """Decrypt base64 ``ciphertext_b64`` with a base64-encoded 32-byte key.

        Raises:
            EncryptionError: if the decoded key is not 32 bytes or decryption fails.
        """
        raw_key = base64.b64decode(key_b64)
        raw_nonce = base64.b64decode(nonce_b64)
        sealed = base64.b64decode(ciphertext_b64)
        if len(raw_key) != 32:
            raise EncryptionError("Key must be 256 bits (32 bytes)")
        cipher = AESGCM(raw_key)
        aad = ClientEncryptionHelper._context_aad(context)
        try:
            return cipher.decrypt(raw_nonce, sealed, aad)
        except Exception as exc:
            raise EncryptionError("Decryption failed") from exc

207
app/errors.py Normal file
View File

@@ -0,0 +1,207 @@
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Optional, Dict, Any
from xml.etree.ElementTree import Element, SubElement, tostring
from flask import Response, jsonify, request, flash, redirect, url_for, g
from flask_limiter import RateLimitExceeded
logger = logging.getLogger(__name__)
@dataclass
class AppError(Exception):
    """Root of the application's error hierarchy.

    Carries an error ``code``, a human-readable ``message``, the HTTP
    ``status_code`` and optional ``details``, and can render itself as XML,
    JSON or a flash message.
    """
    code: str
    message: str
    status_code: int = 500
    details: Optional[Dict[str, Any]] = field(default=None)

    def __post_init__(self):
        # Initialise the Exception side so str(error) yields the message.
        super().__init__(self.message)

    def to_xml_response(self) -> Response:
        """Render as an ``<Error>`` XML document with Code, Message and RequestId."""
        root = Element("Error")
        for tag, text in (("Code", self.code), ("Message", self.message)):
            SubElement(root, tag).text = text
        request_id = getattr(g, 'request_id', None) if g else None
        SubElement(root, "RequestId").text = request_id or "unknown"
        body = tostring(root, encoding="utf-8")
        return Response(body, status=self.status_code, mimetype="application/xml")

    def to_json_response(self) -> tuple[Response, int]:
        """Render as a ``{"success": false, "error": {...}}`` JSON body plus status code."""
        error_body: Dict[str, Any] = {
            "code": self.code,
            "message": self.message
        }
        if self.details:
            error_body["details"] = self.details
        envelope: Dict[str, Any] = {"success": False, "error": error_body}
        return jsonify(envelope), self.status_code

    def to_flash_message(self) -> str:
        """Text shown to the user in a flash banner (the message itself)."""
        return self.message
@dataclass
class BucketNotFoundError(AppError):
    """Bucket does not exist.

    Defaults: code ``NoSuchBucket``, HTTP status 404.
    """
    code: str = "NoSuchBucket"
    message: str = "The specified bucket does not exist"
    status_code: int = 404
@dataclass
class BucketAlreadyExistsError(AppError):
    """Bucket already exists.

    Defaults: code ``BucketAlreadyExists``, HTTP status 409.
    """
    code: str = "BucketAlreadyExists"
    message: str = "The requested bucket name is not available"
    status_code: int = 409
@dataclass
class BucketNotEmptyError(AppError):
    """Bucket is not empty.

    Defaults: code ``BucketNotEmpty``, HTTP status 409.
    """
    code: str = "BucketNotEmpty"
    message: str = "The bucket you tried to delete is not empty"
    status_code: int = 409
@dataclass
class ObjectNotFoundError(AppError):
    """Object does not exist.

    Defaults: code ``NoSuchKey``, HTTP status 404.
    """
    code: str = "NoSuchKey"
    message: str = "The specified key does not exist"
    status_code: int = 404
@dataclass
class InvalidObjectKeyError(AppError):
    """Invalid object key.

    Defaults: code ``InvalidKey``, HTTP status 400.
    """
    code: str = "InvalidKey"
    message: str = "The specified key is not valid"
    status_code: int = 400
@dataclass
class AccessDeniedError(AppError):
    """Access denied.

    Defaults: code ``AccessDenied``, HTTP status 403.
    """
    code: str = "AccessDenied"
    message: str = "Access Denied"
    status_code: int = 403
@dataclass
class InvalidCredentialsError(AppError):
    """Invalid credentials.

    Defaults: code ``InvalidAccessKeyId``, HTTP status 403.
    """
    code: str = "InvalidAccessKeyId"
    message: str = "The access key ID you provided does not exist"
    status_code: int = 403
@dataclass
class MalformedRequestError(AppError):
    """Malformed request.

    Defaults: code ``MalformedXML``, HTTP status 400.
    """
    code: str = "MalformedXML"
    message: str = "The XML you provided was not well-formed"
    status_code: int = 400
@dataclass
class InvalidArgumentError(AppError):
    """Invalid argument.

    Defaults: code ``InvalidArgument``, HTTP status 400.
    """
    code: str = "InvalidArgument"
    message: str = "Invalid argument"
    status_code: int = 400
@dataclass
class EntityTooLargeError(AppError):
    """Entity too large.

    Defaults: code ``EntityTooLarge``, HTTP status 413.
    """
    code: str = "EntityTooLarge"
    message: str = "Your proposed upload exceeds the maximum allowed size"
    status_code: int = 413
@dataclass
class QuotaExceededAppError(AppError):
    """Bucket quota exceeded.

    Defaults: code ``QuotaExceeded``, HTTP status 403. When ``quota`` and/or
    ``usage`` are given they are exposed under those keys in ``details``.
    """
    code: str = "QuotaExceeded"
    message: str = "The bucket quota has been exceeded"
    status_code: int = 403
    quota: Optional[Dict[str, Any]] = None
    usage: Optional[Dict[str, int]] = None

    def __post_init__(self):
        if self.quota or self.usage:
            # Start from any details the caller passed in, so they are kept
            # rather than discarded; copy so the caller's dict is untouched.
            merged = dict(self.details) if self.details else {}
            if self.quota:
                merged["quota"] = self.quota
            if self.usage:
                merged["usage"] = self.usage
            self.details = merged
        super().__post_init__()
def _referrer_matches_host(referrer: Optional[str], host: str) -> bool:
    """Return True only when ``referrer`` parses to a URL whose host part equals ``host``."""
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    if not referrer:
        return False
    try:
        referrer_netloc = urlsplit(referrer).netloc
    except ValueError:
        # Malformed Referer header: treat it as foreign.
        return False
    return referrer_netloc.lower() == host.lower()


def handle_app_error(error: AppError) -> Response:
    """Handle application errors with appropriate response format.

    The error is logged first. Requests whose path starts with ``/ui`` get
    JSON when the client asks for it (JSON body, ``X-Requested-With:
    XMLHttpRequest`` or an ``application/json`` Accept entry); otherwise the
    message is flashed and the client is redirected back to the referrer, or
    to the buckets overview when the referrer is absent or on another host.
    Every other path gets the XML error document.
    """
    log_extra = {"error_code": error.code}
    if error.details:
        log_extra["details"] = error.details
    logger.error(f"{error.code}: {error.message}", extra=log_extra)

    if not request.path.startswith('/ui'):
        return error.to_xml_response()

    wants_json = (
        request.is_json or
        request.headers.get('X-Requested-With') == 'XMLHttpRequest' or
        'application/json' in request.accept_mimetypes.values()
    )
    if wants_json:
        return error.to_json_response()

    flash(error.to_flash_message(), 'danger')
    referrer = request.referrer
    # Compare the parsed host rather than searching for a substring, so a
    # Referer such as "https://other.example/?x=<our-host>" is not followed.
    if _referrer_matches_host(referrer, request.host):
        return redirect(referrer)
    return redirect(url_for('ui.buckets_overview'))
def handle_rate_limit_exceeded(e: RateLimitExceeded) -> Response:
    """Answer a rate-limit violation with HTTP 429 and the ``SlowDown`` code.

    Records the code on ``g.s3_error_code``. Requests under ``/ui`` or
    ``/buckets`` that ask for JSON receive a JSON body; everything else
    receives the XML error document.
    """
    g.s3_error_code = "SlowDown"

    ui_request = request.path.startswith("/ui") or request.path.startswith("/buckets")
    if ui_request:
        if (
            request.is_json
            or request.headers.get("X-Requested-With") == "XMLHttpRequest"
            or "application/json" in request.accept_mimetypes.values()
        ):
            return jsonify({"success": False, "error": {"code": "SlowDown", "message": "Please reduce your request rate."}}), 429

    root = Element("Error")
    fields = (
        ("Code", "SlowDown"),
        ("Message", "Please reduce your request rate."),
        ("Resource", request.path),
        ("RequestId", getattr(g, "request_id", "")),
    )
    for tag, text in fields:
        SubElement(root, tag).text = text
    return Response(
        tostring(root, encoding="utf-8"),
        status="429 Too Many Requests",
        mimetype="application/xml",
    )
def register_error_handlers(app):
    """Attach this module's error handlers to the Flask ``app``.

    ``AppError`` and the listed concrete error classes are all routed to
    ``handle_app_error``; ``RateLimitExceeded`` goes to
    ``handle_rate_limit_exceeded``.
    """
    app_error_types = (
        BucketNotFoundError, BucketAlreadyExistsError, BucketNotEmptyError,
        ObjectNotFoundError, InvalidObjectKeyError,
        AccessDeniedError, InvalidCredentialsError,
        MalformedRequestError, InvalidArgumentError, EntityTooLargeError,
        QuotaExceededAppError,
    )
    registrations = [
        (AppError, handle_app_error),
        (RateLimitExceeded, handle_rate_limit_exceeded),
    ]
    registrations.extend((exc_type, handle_app_error) for exc_type in app_error_types)
    for exc_type, handler in registrations:
        app.register_error_handler(exc_type, handler)

View File

@@ -1,10 +1,16 @@
"""Application-wide extension instances."""
from flask import g
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from flask_wtf import CSRFProtect
def get_rate_limit_key():
    """Return the key used to bucket rate limits.

    Uses the access key of ``g.principal`` when a truthy principal is set on
    the request context, and the remote address otherwise.
    """
    principal = getattr(g, 'principal', None)
    if not principal:
        return get_remote_address()
    return principal.access_key
# Shared rate limiter instance; configured in app factory.
# Keyed per authenticated principal, falling back to the remote address
# (see get_rate_limit_key). The earlier address-only Limiter assignment was
# dead code: it was overwritten immediately by this one.
limiter = Limiter(key_func=get_rate_limit_key)
# Global CSRF protection for UI routes.
csrf = CSRFProtect()

596
app/gc.py Normal file
View File

@@ -0,0 +1,596 @@
from __future__ import annotations
import json
import logging
import os
import shutil
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
@dataclass
class GCResult:
    """Counters and error messages accumulated during one GC run."""
    temp_files_deleted: int = 0
    temp_bytes_freed: int = 0
    multipart_uploads_deleted: int = 0
    multipart_bytes_freed: int = 0
    lock_files_deleted: int = 0
    orphaned_metadata_deleted: int = 0
    orphaned_versions_deleted: int = 0
    orphaned_version_bytes_freed: int = 0
    empty_dirs_removed: int = 0
    errors: List[str] = field(default_factory=list)
    execution_time_seconds: float = 0.0

    def to_dict(self) -> dict:
        """Return all fields as a dict; ``errors`` is the live list, not a copy."""
        exported = (
            "temp_files_deleted",
            "temp_bytes_freed",
            "multipart_uploads_deleted",
            "multipart_bytes_freed",
            "lock_files_deleted",
            "orphaned_metadata_deleted",
            "orphaned_versions_deleted",
            "orphaned_version_bytes_freed",
            "empty_dirs_removed",
            "errors",
            "execution_time_seconds",
        )
        return {name: getattr(self, name) for name in exported}

    @property
    def total_bytes_freed(self) -> int:
        """Bytes freed across temp files, multipart uploads and orphaned versions."""
        freed = self.temp_bytes_freed
        freed += self.multipart_bytes_freed
        freed += self.orphaned_version_bytes_freed
        return freed

    @property
    def has_work(self) -> bool:
        """True when at least one deletion/removal counter is positive."""
        counters = (
            self.temp_files_deleted,
            self.multipart_uploads_deleted,
            self.lock_files_deleted,
            self.orphaned_metadata_deleted,
            self.orphaned_versions_deleted,
            self.empty_dirs_removed,
        )
        return any(count > 0 for count in counters)
@dataclass
class GCExecutionRecord:
    """One persisted GC run: when it happened, its result dict, and the dry-run flag."""
    timestamp: float
    result: dict
    dry_run: bool

    def to_dict(self) -> dict:
        """Serialise the record into a plain dict."""
        return dict(
            timestamp=self.timestamp,
            result=self.result,
            dry_run=self.dry_run,
        )

    @classmethod
    def from_dict(cls, data: dict) -> GCExecutionRecord:
        """Rebuild a record from ``to_dict`` output; ``dry_run`` defaults to False.

        Raises ``KeyError`` when ``timestamp`` or ``result`` is missing.
        """
        when = data["timestamp"]
        outcome = data["result"]
        return cls(when, outcome, data.get("dry_run", False))
class GCHistoryStore:
    """JSON-file backed, newest-first list of GC execution records.

    The file lives at ``<storage_root>/.myfsio.sys/config/gc_history.json``
    and holds at most ``max_records`` entries.
    """

    def __init__(self, storage_root: Path, max_records: int = 50) -> None:
        # Coerce to Path: callers may forward a plain string, and the path
        # joins in _get_path() would otherwise fail with a TypeError.
        self.storage_root = Path(storage_root)
        self.max_records = max_records
        self._lock = threading.Lock()

    def _get_path(self) -> Path:
        """Return the location of the history file."""
        return self.storage_root / ".myfsio.sys" / "config" / "gc_history.json"

    def load(self) -> List[GCExecutionRecord]:
        """Read all stored records; returns ``[]`` when the file is missing or unusable."""
        path = self._get_path()
        if not path.exists():
            return []
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            return [GCExecutionRecord.from_dict(d) for d in data.get("executions", [])]
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            # TypeError/AttributeError cover valid JSON with the wrong shape
            # (e.g. a top-level list, or non-dict entries).
            logger.error("Failed to load GC history: %s", e)
            return []

    def save(self, records: List[GCExecutionRecord]) -> None:
        """Persist the first ``max_records`` records, replacing the file atomically."""
        path = self._get_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        data = {"executions": [r.to_dict() for r in records[: self.max_records]]}
        # Write to a sibling temp file first so an interrupted write cannot
        # leave a truncated history file behind.
        tmp_path = path.with_name(path.name + ".tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            os.replace(tmp_path, path)
        except OSError as e:
            logger.error("Failed to save GC history: %s", e)
            try:
                tmp_path.unlink()
            except OSError:
                pass

    def add(self, record: GCExecutionRecord) -> None:
        """Insert ``record`` at the front of the history and save it."""
        with self._lock:
            records = self.load()
            records.insert(0, record)
            self.save(records)

    def get_history(self, limit: int = 50, offset: int = 0) -> List[GCExecutionRecord]:
        """Return up to ``limit`` records starting at ``offset`` (newest first)."""
        return self.load()[offset : offset + limit]
def _dir_size(path: Path) -> int:
total = 0
try:
for f in path.rglob("*"):
if f.is_file():
try:
total += f.stat().st_size
except OSError:
pass
except OSError:
pass
return total
def _file_age_hours(path: Path) -> float:
try:
mtime = path.stat().st_mtime
return (time.time() - mtime) / 3600.0
except OSError:
return 0.0
class GarbageCollector:
    """Periodic cleaner for leftovers under a storage root.

    One run removes, in this order: aged temp files, aged multipart upload
    directories, aged ``.lock`` files, metadata whose object file is gone,
    version files whose object is gone, and empty directories. Runs are
    scheduled with a ``threading.Timer`` and can also be triggered manually.
    In dry-run mode nothing is deleted but the counters are still updated.
    """

    SYSTEM_ROOT = ".myfsio.sys"
    SYSTEM_TMP_DIR = "tmp"
    SYSTEM_MULTIPART_DIR = "multipart"
    SYSTEM_BUCKETS_DIR = "buckets"
    BUCKET_META_DIR = "meta"
    BUCKET_VERSIONS_DIR = "versions"
    INTERNAL_FOLDERS = {".meta", ".versions", ".multipart"}

    def __init__(
        self,
        storage_root: Path,
        interval_hours: float = 6.0,
        temp_file_max_age_hours: float = 24.0,
        multipart_max_age_days: int = 7,
        lock_file_max_age_hours: float = 1.0,
        dry_run: bool = False,
        max_history: int = 50,
        io_throttle_ms: int = 10,
    ) -> None:
        """Store the configuration; no timer is started until ``start()``.

        ``io_throttle_ms`` is the pause inserted between filesystem entries
        (negative values are clamped to 0).
        """
        self.storage_root = Path(storage_root)
        self.interval_seconds = interval_hours * 3600.0
        self.temp_file_max_age_hours = temp_file_max_age_hours
        self.multipart_max_age_days = multipart_max_age_days
        self.lock_file_max_age_hours = lock_file_max_age_hours
        self.dry_run = dry_run
        self._timer: Optional[threading.Timer] = None
        self._shutdown = False
        self._lock = threading.Lock()
        self._scanning = False
        self._scan_start_time: Optional[float] = None
        self._io_throttle = max(0, io_throttle_ms) / 1000.0
        # Pass the normalised Path, not the raw argument: a str storage_root
        # would break the path joins inside the history store.
        self.history_store = GCHistoryStore(self.storage_root, max_records=max_history)

    def start(self) -> None:
        """Schedule the first GC cycle; a no-op when a timer already exists."""
        if self._timer is not None:
            return
        self._shutdown = False
        self._schedule_next()
        logger.info(
            "GC started: interval=%.1fh, temp_max_age=%.1fh, multipart_max_age=%dd, lock_max_age=%.1fh, dry_run=%s",
            self.interval_seconds / 3600.0,
            self.temp_file_max_age_hours,
            self.multipart_max_age_days,
            self.lock_file_max_age_hours,
            self.dry_run,
        )

    def stop(self) -> None:
        """Flag shutdown (checked by the cleaners) and cancel the pending timer."""
        self._shutdown = True
        if self._timer:
            self._timer.cancel()
            self._timer = None
        logger.info("GC stopped")

    def _schedule_next(self) -> None:
        """Arm a daemon timer for the next cycle unless shutting down."""
        if self._shutdown:
            return
        self._timer = threading.Timer(self.interval_seconds, self._run_cycle)
        self._timer.daemon = True
        self._timer.start()

    def _run_cycle(self) -> None:
        """Timer callback: run one GC pass, log any failure, then reschedule."""
        if self._shutdown:
            return
        try:
            self.run_now()
        except Exception as e:
            logger.error("GC cycle failed: %s", e)
        finally:
            self._schedule_next()

    def run_now(self, dry_run: Optional[bool] = None) -> GCResult:
        """Run one GC pass synchronously and record it in the history store.

        ``dry_run`` overrides the configured mode for this run only.
        Raises ``RuntimeError`` if another run currently holds the lock.
        """
        if not self._lock.acquire(blocking=False):
            raise RuntimeError("GC is already in progress")
        effective_dry_run = dry_run if dry_run is not None else self.dry_run
        try:
            self._scanning = True
            self._scan_start_time = time.time()
            start = self._scan_start_time
            result = GCResult()
            # The cleaners read self.dry_run, so swap it for the duration of
            # the run and restore the configured value afterwards.
            original_dry_run = self.dry_run
            self.dry_run = effective_dry_run
            try:
                self._clean_temp_files(result)
                self._clean_orphaned_multipart(result)
                self._clean_stale_locks(result)
                self._clean_orphaned_metadata(result)
                self._clean_orphaned_versions(result)
                self._clean_empty_dirs(result)
            finally:
                self.dry_run = original_dry_run
            result.execution_time_seconds = time.time() - start
            if result.has_work or result.errors:
                logger.info(
                    "GC completed in %.2fs: temp=%d (%.1f MB), multipart=%d (%.1f MB), "
                    "locks=%d, meta=%d, versions=%d (%.1f MB), dirs=%d, errors=%d%s",
                    result.execution_time_seconds,
                    result.temp_files_deleted,
                    result.temp_bytes_freed / (1024 * 1024),
                    result.multipart_uploads_deleted,
                    result.multipart_bytes_freed / (1024 * 1024),
                    result.lock_files_deleted,
                    result.orphaned_metadata_deleted,
                    result.orphaned_versions_deleted,
                    result.orphaned_version_bytes_freed / (1024 * 1024),
                    result.empty_dirs_removed,
                    len(result.errors),
                    " (dry run)" if effective_dry_run else "",
                )
            record = GCExecutionRecord(
                timestamp=time.time(),
                result=result.to_dict(),
                dry_run=effective_dry_run,
            )
            self.history_store.add(record)
            return result
        finally:
            self._scanning = False
            self._scan_start_time = None
            self._lock.release()

    def run_async(self, dry_run: Optional[bool] = None) -> bool:
        """Start ``run_now`` on a daemon thread; returns False if a scan is flagged as running."""
        if self._scanning:
            return False
        t = threading.Thread(target=self.run_now, args=(dry_run,), daemon=True)
        t.start()
        return True

    def _system_path(self) -> Path:
        """Return ``<storage_root>/.myfsio.sys``."""
        return self.storage_root / self.SYSTEM_ROOT

    def _throttle(self) -> bool:
        """Sleep for the configured I/O pause; returns True when shutdown was requested."""
        if self._shutdown:
            return True
        if self._io_throttle > 0:
            time.sleep(self._io_throttle)
        return self._shutdown

    def _list_bucket_names(self) -> List[str]:
        """List directory names directly under the storage root, excluding the system root."""
        names = []
        try:
            for entry in self.storage_root.iterdir():
                if entry.is_dir() and entry.name != self.SYSTEM_ROOT:
                    names.append(entry.name)
        except OSError:
            pass
        return names

    def _clean_temp_files(self, result: GCResult) -> None:
        """Delete files in the system tmp dir that are at least ``temp_file_max_age_hours`` old."""
        tmp_dir = self._system_path() / self.SYSTEM_TMP_DIR
        if not tmp_dir.exists():
            return
        try:
            for entry in tmp_dir.iterdir():
                if self._throttle():
                    return
                if not entry.is_file():
                    continue
                age = _file_age_hours(entry)
                if age < self.temp_file_max_age_hours:
                    continue
                try:
                    size = entry.stat().st_size
                    if not self.dry_run:
                        entry.unlink()
                    result.temp_files_deleted += 1
                    result.temp_bytes_freed += size
                except OSError as e:
                    result.errors.append(f"temp file {entry.name}: {e}")
        except OSError as e:
            result.errors.append(f"scan tmp dir: {e}")

    def _clean_orphaned_multipart(self, result: GCResult) -> None:
        """Scan both multipart locations of every bucket for aged upload directories."""
        cutoff_hours = self.multipart_max_age_days * 24.0
        bucket_names = self._list_bucket_names()
        for bucket_name in bucket_names:
            if self._shutdown:
                return
            for multipart_root in (
                self._system_path() / self.SYSTEM_MULTIPART_DIR / bucket_name,
                self.storage_root / bucket_name / ".multipart",
            ):
                if not multipart_root.exists():
                    continue
                try:
                    for upload_dir in multipart_root.iterdir():
                        if self._throttle():
                            return
                        if not upload_dir.is_dir():
                            continue
                        self._maybe_clean_upload(upload_dir, cutoff_hours, result)
                except OSError as e:
                    result.errors.append(f"scan multipart {bucket_name}: {e}")

    def _maybe_clean_upload(self, upload_dir: Path, cutoff_hours: float, result: GCResult) -> None:
        """Remove ``upload_dir`` when its manifest (or the dir itself) is older than the cutoff."""
        manifest_path = upload_dir / "manifest.json"
        age = _file_age_hours(manifest_path) if manifest_path.exists() else _file_age_hours(upload_dir)
        if age < cutoff_hours:
            return
        dir_bytes = _dir_size(upload_dir)
        try:
            if not self.dry_run:
                # Let rmtree raise: with ignore_errors=True a failed removal
                # was silently counted as a successful deletion.
                shutil.rmtree(upload_dir)
            result.multipart_uploads_deleted += 1
            result.multipart_bytes_freed += dir_bytes
        except OSError as e:
            result.errors.append(f"multipart {upload_dir.name}: {e}")

    def _clean_stale_locks(self, result: GCResult) -> None:
        """Delete ``*.lock`` files in each bucket's ``locks`` dir once they exceed the max age."""
        buckets_root = self._system_path() / self.SYSTEM_BUCKETS_DIR
        if not buckets_root.exists():
            return
        try:
            for bucket_dir in buckets_root.iterdir():
                if self._shutdown:
                    return
                if not bucket_dir.is_dir():
                    continue
                locks_dir = bucket_dir / "locks"
                if not locks_dir.exists():
                    continue
                try:
                    for lock_file in locks_dir.iterdir():
                        if self._throttle():
                            return
                        if not lock_file.is_file() or not lock_file.name.endswith(".lock"):
                            continue
                        age = _file_age_hours(lock_file)
                        if age < self.lock_file_max_age_hours:
                            continue
                        try:
                            if not self.dry_run:
                                lock_file.unlink(missing_ok=True)
                            result.lock_files_deleted += 1
                        except OSError as e:
                            result.errors.append(f"lock {lock_file.name}: {e}")
                except OSError as e:
                    result.errors.append(f"scan locks {bucket_dir.name}: {e}")
        except OSError as e:
            result.errors.append(f"scan buckets for locks: {e}")

    def _clean_orphaned_metadata(self, result: GCResult) -> None:
        """Clean both metadata layouts (legacy ``.meta`` and system-root index) per bucket."""
        bucket_names = self._list_bucket_names()
        for bucket_name in bucket_names:
            if self._shutdown:
                return
            legacy_meta = self.storage_root / bucket_name / ".meta"
            if legacy_meta.exists():
                self._clean_legacy_metadata(bucket_name, legacy_meta, result)
            new_meta = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
            if new_meta.exists():
                self._clean_index_metadata(bucket_name, new_meta, result)

    def _clean_legacy_metadata(self, bucket_name: str, meta_root: Path, result: GCResult) -> None:
        """Delete ``*.meta.json`` files whose corresponding object path no longer exists."""
        bucket_path = self.storage_root / bucket_name
        try:
            for meta_file in meta_root.rglob("*.meta.json"):
                if self._throttle():
                    return
                if not meta_file.is_file():
                    continue
                try:
                    rel = meta_file.relative_to(meta_root)
                    object_key = rel.as_posix().removesuffix(".meta.json")
                    object_path = bucket_path / object_key
                    if not object_path.exists():
                        if not self.dry_run:
                            meta_file.unlink(missing_ok=True)
                        result.orphaned_metadata_deleted += 1
                except (OSError, ValueError) as e:
                    result.errors.append(f"legacy meta {bucket_name}/{meta_file.name}: {e}")
        except OSError as e:
            result.errors.append(f"scan legacy meta {bucket_name}: {e}")

    def _clean_index_metadata(self, bucket_name: str, meta_root: Path, result: GCResult) -> None:
        """Drop entries from ``_index.json`` files whose object path no longer exists.

        An index that becomes empty is deleted. Unreadable indexes, and
        indexes whose JSON root is not an object, are skipped.
        """
        bucket_path = self.storage_root / bucket_name
        try:
            for index_file in meta_root.rglob("_index.json"):
                if self._throttle():
                    return
                if not index_file.is_file():
                    continue
                try:
                    with open(index_file, "r", encoding="utf-8") as f:
                        index_data = json.load(f)
                except (OSError, json.JSONDecodeError):
                    continue
                if not isinstance(index_data, dict):
                    # A list/scalar root would make the pop() below raise an
                    # uncaught TypeError and abort the whole GC run.
                    continue
                # The key prefix depends only on the index file's directory,
                # so compute it once per file rather than once per key.
                rel_dir = index_file.parent.relative_to(meta_root)
                key_prefix = "" if rel_dir == Path(".") else rel_dir.as_posix() + "/"
                keys_to_remove = [
                    key for key in index_data
                    if not (bucket_path / (key_prefix + key)).exists()
                ]
                if keys_to_remove:
                    if not self.dry_run:
                        for k in keys_to_remove:
                            index_data.pop(k, None)
                        if index_data:
                            try:
                                with open(index_file, "w", encoding="utf-8") as f:
                                    json.dump(index_data, f)
                            except OSError as e:
                                result.errors.append(f"write index {bucket_name}: {e}")
                                continue
                        else:
                            try:
                                index_file.unlink(missing_ok=True)
                            except OSError:
                                pass
                    result.orphaned_metadata_deleted += len(keys_to_remove)
        except OSError as e:
            result.errors.append(f"scan index meta {bucket_name}: {e}")

    def _clean_orphaned_versions(self, result: GCResult) -> None:
        """Scan both version locations of every bucket for versions of missing objects."""
        bucket_names = self._list_bucket_names()
        for bucket_name in bucket_names:
            if self._shutdown:
                return
            bucket_path = self.storage_root / bucket_name
            for versions_root in (
                self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_VERSIONS_DIR,
                self.storage_root / bucket_name / ".versions",
            ):
                if not versions_root.exists():
                    continue
                try:
                    for key_dir in versions_root.iterdir():
                        if self._throttle():
                            return
                        if not key_dir.is_dir():
                            continue
                        self._clean_versions_for_key(bucket_path, versions_root, key_dir, result)
                except OSError as e:
                    result.errors.append(f"scan versions {bucket_name}: {e}")

    def _clean_versions_for_key(
        self, bucket_path: Path, versions_root: Path, key_dir: Path, result: GCResult
    ) -> None:
        """Delete the ``*.bin``/``*.json`` files in ``key_dir`` when the object path is gone.

        Only ``.bin`` sizes are added to the freed-bytes counter; every
        deleted file increments the deleted-versions counter.
        """
        try:
            rel = key_dir.relative_to(versions_root)
        except ValueError:
            return
        object_path = bucket_path / rel
        if object_path.exists():
            return
        version_files = list(key_dir.glob("*.bin")) + list(key_dir.glob("*.json"))
        if not version_files:
            return
        for vf in version_files:
            try:
                size = vf.stat().st_size if vf.suffix == ".bin" else 0
                if not self.dry_run:
                    vf.unlink(missing_ok=True)
                if vf.suffix == ".bin":
                    result.orphaned_version_bytes_freed += size
                result.orphaned_versions_deleted += 1
            except OSError as e:
                result.errors.append(f"version file {vf.name}: {e}")

    def _clean_empty_dirs(self, result: GCResult) -> None:
        """Prune empty directories below the system dirs and each bucket's internal folders."""
        targets = [
            self._system_path() / self.SYSTEM_TMP_DIR,
            self._system_path() / self.SYSTEM_MULTIPART_DIR,
            self._system_path() / self.SYSTEM_BUCKETS_DIR,
        ]
        for bucket_name in self._list_bucket_names():
            targets.append(self.storage_root / bucket_name / ".meta")
            targets.append(self.storage_root / bucket_name / ".versions")
            targets.append(self.storage_root / bucket_name / ".multipart")
        for root in targets:
            if not root.exists():
                continue
            self._remove_empty_dirs_recursive(root, root, result)

    def _remove_empty_dirs_recursive(self, path: Path, stop_at: Path, result: GCResult) -> bool:
        """Remove empty directories bottom-up; ``stop_at`` itself is never removed.

        Returns True when ``path`` is (or, in dry-run mode, would be) empty.
        """
        if self._shutdown:
            return False
        if not path.is_dir():
            return False
        try:
            children = list(path.iterdir())
        except OSError:
            return False
        all_empty = True
        for child in children:
            if self._throttle():
                return False
            if child.is_dir():
                if not self._remove_empty_dirs_recursive(child, stop_at, result):
                    all_empty = False
            else:
                all_empty = False
        if all_empty and path != stop_at:
            try:
                if not self.dry_run:
                    path.rmdir()
                result.empty_dirs_removed += 1
                return True
            except OSError:
                return False
        return all_empty

    def get_history(self, limit: int = 50, offset: int = 0) -> List[dict]:
        """Return stored execution records as dicts (newest first)."""
        records = self.history_store.get_history(limit, offset)
        return [r.to_dict() for r in records]

    def get_status(self) -> dict:
        """Return the current configuration and scheduler/scan state as a dict."""
        status: Dict[str, Any] = {
            "enabled": not self._shutdown or self._timer is not None,
            "running": self._timer is not None and not self._shutdown,
            "scanning": self._scanning,
            "interval_hours": self.interval_seconds / 3600.0,
            "temp_file_max_age_hours": self.temp_file_max_age_hours,
            "multipart_max_age_days": self.multipart_max_age_days,
            "lock_file_max_age_hours": self.lock_file_max_age_hours,
            "dry_run": self.dry_run,
            "io_throttle_ms": round(self._io_throttle * 1000),
        }
        if self._scanning and self._scan_start_time:
            status["scan_elapsed_seconds"] = time.time() - self._scan_start_time
        return status

File diff suppressed because it is too large Load Diff

995
app/integrity.py Normal file
View File

@@ -0,0 +1,995 @@
from __future__ import annotations
import hashlib
import json
import logging
import os
import threading
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
try:
import myfsio_core as _rc
if not hasattr(_rc, "md5_file"):
raise ImportError("myfsio_core is outdated, rebuild with: cd myfsio_core && maturin develop --release")
_HAS_RUST = True
except ImportError:
_HAS_RUST = False
logger = logging.getLogger(__name__)
def _compute_etag(path: Path) -> str:
    """Return the hex MD5 digest of the file at ``path``.

    Delegates to ``myfsio_core.md5_file`` when the Rust extension was
    imported successfully; otherwise hashes the file in 8192-byte chunks.
    """
    if not _HAS_RUST:
        digest = hashlib.md5()
        with path.open("rb") as stream:
            while True:
                block = stream.read(8192)
                if not block:
                    break
                digest.update(block)
        return digest.hexdigest()
    return _rc.md5_file(str(path))
@dataclass
class IntegrityIssue:
    """A single finding of an integrity scan, plus whether and how it was healed."""
    issue_type: str
    bucket: str
    key: str
    detail: str
    healed: bool = False
    heal_action: str = ""

    def to_dict(self) -> dict:
        """Serialise all fields into a plain dict, in declaration order."""
        exported = ("issue_type", "bucket", "key", "detail", "healed", "heal_action")
        return {name: getattr(self, name) for name in exported}
@dataclass
class IntegrityResult:
    """Counters, issues and errors accumulated during one integrity scan."""
    corrupted_objects: int = 0
    orphaned_objects: int = 0
    phantom_metadata: int = 0
    stale_versions: int = 0
    etag_cache_inconsistencies: int = 0
    legacy_metadata_drifts: int = 0
    issues_healed: int = 0
    issues: List[IntegrityIssue] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)
    objects_scanned: int = 0
    buckets_scanned: int = 0
    execution_time_seconds: float = 0.0

    def to_dict(self) -> dict:
        """Serialise the result; each issue is converted through its own ``to_dict``."""
        payload = {
            name: getattr(self, name)
            for name in (
                "corrupted_objects",
                "orphaned_objects",
                "phantom_metadata",
                "stale_versions",
                "etag_cache_inconsistencies",
                "legacy_metadata_drifts",
                "issues_healed",
            )
        }
        payload["issues"] = [found.to_dict() for found in self.issues]
        for name in ("errors", "objects_scanned", "buckets_scanned", "execution_time_seconds"):
            payload[name] = getattr(self, name)
        return payload

    @property
    def total_issues(self) -> int:
        """Sum of the six per-category issue counters (``issues_healed`` excluded)."""
        per_category = (
            self.corrupted_objects,
            self.orphaned_objects,
            self.phantom_metadata,
            self.stale_versions,
            self.etag_cache_inconsistencies,
            self.legacy_metadata_drifts,
        )
        return sum(per_category)

    @property
    def has_issues(self) -> bool:
        """True when any issue counter is positive."""
        return self.total_issues > 0
@dataclass
class IntegrityExecutionRecord:
    """One persisted integrity scan: timestamp, result dict and the mode flags used."""
    timestamp: float
    result: dict
    dry_run: bool
    auto_heal: bool

    def to_dict(self) -> dict:
        """Serialise the record into a plain dict."""
        return dict(
            timestamp=self.timestamp,
            result=self.result,
            dry_run=self.dry_run,
            auto_heal=self.auto_heal,
        )

    @classmethod
    def from_dict(cls, data: dict) -> IntegrityExecutionRecord:
        """Rebuild a record from ``to_dict`` output; both flags default to False.

        Raises ``KeyError`` when ``timestamp`` or ``result`` is missing.
        """
        when = data["timestamp"]
        outcome = data["result"]
        return cls(
            when,
            outcome,
            data.get("dry_run", False),
            data.get("auto_heal", False),
        )
class IntegrityHistoryStore:
    """JSON-file backed, newest-first list of integrity scan records.

    The file lives at ``<storage_root>/.myfsio.sys/config/integrity_history.json``
    and holds at most ``max_records`` entries.
    """

    def __init__(self, storage_root: Path, max_records: int = 50) -> None:
        # Coerce to Path: callers may forward a plain string, and the path
        # joins in _get_path() would otherwise fail with a TypeError.
        self.storage_root = Path(storage_root)
        self.max_records = max_records
        self._lock = threading.Lock()

    def _get_path(self) -> Path:
        """Return the location of the history file."""
        return self.storage_root / ".myfsio.sys" / "config" / "integrity_history.json"

    def load(self) -> List[IntegrityExecutionRecord]:
        """Read all stored records; returns ``[]`` when the file is missing or unusable."""
        path = self._get_path()
        if not path.exists():
            return []
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            return [IntegrityExecutionRecord.from_dict(d) for d in data.get("executions", [])]
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            # TypeError/AttributeError cover valid JSON with the wrong shape
            # (e.g. a top-level list, or non-dict entries).
            logger.error("Failed to load integrity history: %s", e)
            return []

    def save(self, records: List[IntegrityExecutionRecord]) -> None:
        """Persist the first ``max_records`` records, replacing the file atomically."""
        path = self._get_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        data = {"executions": [r.to_dict() for r in records[: self.max_records]]}
        # Write to a sibling temp file first so an interrupted write cannot
        # leave a truncated history file behind.
        tmp_path = path.with_name(path.name + ".tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            os.replace(tmp_path, path)
        except OSError as e:
            logger.error("Failed to save integrity history: %s", e)
            try:
                tmp_path.unlink()
            except OSError:
                pass

    def add(self, record: IntegrityExecutionRecord) -> None:
        """Insert ``record`` at the front of the history and save it."""
        with self._lock:
            records = self.load()
            records.insert(0, record)
            self.save(records)

    def get_history(self, limit: int = 50, offset: int = 0) -> List[IntegrityExecutionRecord]:
        """Return up to ``limit`` records starting at ``offset`` (newest first)."""
        return self.load()[offset : offset + limit]
class IntegrityCursorStore:
    """Persists per-bucket scan progress for the integrity checker.

    Data is kept in ``<storage_root>/.myfsio.sys/config/integrity_cursor.json``
    as ``{"buckets": {name: {"last_scanned", "last_key", "completed"}}}``.
    """

    def __init__(self, storage_root: Path) -> None:
        # Coerce to Path so a plain string root works with the joins below.
        self.storage_root = Path(storage_root)
        self._lock = threading.Lock()

    def _get_path(self) -> Path:
        """Return the location of the cursor file."""
        return self.storage_root / ".myfsio.sys" / "config" / "integrity_cursor.json"

    def load(self) -> Dict[str, Any]:
        """Read the cursor file; returns ``{"buckets": {}}`` when missing or malformed.

        Bucket entries that are not JSON objects are dropped so that readers
        can rely on every entry being a dict.
        """
        path = self._get_path()
        if not path.exists():
            return {"buckets": {}}
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            return {"buckets": {}}
        # A non-dict root (e.g. a JSON list) has no .get(); treat it as empty.
        if not isinstance(data, dict) or not isinstance(data.get("buckets"), dict):
            return {"buckets": {}}
        data["buckets"] = {
            name: info for name, info in data["buckets"].items() if isinstance(info, dict)
        }
        return data

    def save(self, data: Dict[str, Any]) -> None:
        """Write ``data`` to the cursor file, replacing it atomically."""
        path = self._get_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        # Temp file + replace: an interrupted write must not leave a
        # truncated cursor file, which would reset all scan progress.
        tmp_path = path.with_name(path.name + ".tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            os.replace(tmp_path, path)
        except OSError as e:
            logger.error("Failed to save integrity cursor: %s", e)
            try:
                tmp_path.unlink()
            except OSError:
                pass

    def update_bucket(
        self,
        bucket_name: str,
        timestamp: float,
        last_key: Optional[str] = None,
        completed: bool = False,
    ) -> None:
        """Record scan progress for ``bucket_name``.

        With ``completed=True`` the resume key is cleared; otherwise
        ``last_key`` (when given) becomes the new resume key.
        """
        with self._lock:
            data = self.load()
            entry = data["buckets"].get(bucket_name, {})
            entry["last_scanned"] = timestamp
            if completed:
                entry.pop("last_key", None)
                entry["completed"] = True
            else:
                if last_key is not None:
                    entry["last_key"] = last_key
                entry["completed"] = False
            data["buckets"][bucket_name] = entry
            self.save(data)

    def clean_stale(self, existing_buckets: List[str]) -> None:
        """Drop cursor entries for buckets not listed in ``existing_buckets``."""
        with self._lock:
            data = self.load()
            existing_set = set(existing_buckets)
            stale_keys = [k for k in data["buckets"] if k not in existing_set]
            if stale_keys:
                for k in stale_keys:
                    del data["buckets"][k]
                self.save(data)

    def get_last_key(self, bucket_name: str) -> Optional[str]:
        """Return the stored resume key for ``bucket_name``, or None."""
        data = self.load()
        entry = data.get("buckets", {}).get(bucket_name)
        if entry is None:
            return None
        return entry.get("last_key")

    def get_bucket_order(self, bucket_names: List[str]) -> List[str]:
        """Order buckets for scanning.

        Untracked buckets and buckets with a resume key come first, then
        buckets without one; each group is sorted by oldest ``last_scanned``.
        """
        data = self.load()
        buckets_info = data.get("buckets", {})
        incomplete = []
        complete = []
        for name in bucket_names:
            entry = buckets_info.get(name)
            if entry is None:
                incomplete.append((name, 0.0))
            elif entry.get("last_key") is not None:
                incomplete.append((name, entry.get("last_scanned", 0.0)))
            else:
                complete.append((name, entry.get("last_scanned", 0.0)))
        incomplete.sort(key=lambda x: x[1])
        complete.sort(key=lambda x: x[1])
        return [n for n, _ in incomplete] + [n for n, _ in complete]

    def get_info(self) -> Dict[str, Any]:
        """Return a summary of all tracked buckets and their cursor state."""
        data = self.load()
        buckets = data.get("buckets", {})
        return {
            "tracked_buckets": len(buckets),
            "buckets": {
                name: {
                    "last_scanned": info.get("last_scanned"),
                    "last_key": info.get("last_key"),
                    "completed": info.get("completed", False),
                }
                for name, info in buckets.items()
            },
        }
MAX_ISSUES = 500
class IntegrityChecker:
SYSTEM_ROOT = ".myfsio.sys"
SYSTEM_BUCKETS_DIR = "buckets"
BUCKET_META_DIR = "meta"
BUCKET_VERSIONS_DIR = "versions"
INTERNAL_FOLDERS = {".meta", ".versions", ".multipart"}
def __init__(
    self,
    storage_root: Path,
    interval_hours: float = 24.0,
    batch_size: int = 1000,
    auto_heal: bool = False,
    dry_run: bool = False,
    max_history: int = 50,
    io_throttle_ms: int = 10,
) -> None:
    """Store the configuration; no timer is started until ``start()``.

    ``batch_size`` caps the number of objects scanned per run and
    ``io_throttle_ms`` is the pause inserted between scanned entries
    (negative values are clamped to 0).
    """
    self.storage_root = Path(storage_root)
    self.interval_seconds = interval_hours * 3600.0
    self.batch_size = batch_size
    self.auto_heal = auto_heal
    self.dry_run = dry_run
    self._timer: Optional[threading.Timer] = None
    self._shutdown = False
    self._lock = threading.Lock()
    self._scanning = False
    self._scan_start_time: Optional[float] = None
    self._io_throttle = max(0, io_throttle_ms) / 1000.0
    # Both stores get the normalised Path. Previously the history store
    # received the raw argument, which breaks when a str is passed.
    self.history_store = IntegrityHistoryStore(self.storage_root, max_records=max_history)
    self.cursor_store = IntegrityCursorStore(self.storage_root)
def start(self) -> None:
    """Schedule the first integrity cycle; a no-op when a timer already exists."""
    already_scheduled = self._timer is not None
    if already_scheduled:
        return
    self._shutdown = False
    self._schedule_next()
    interval_hours = self.interval_seconds / 3600.0
    logger.info(
        "Integrity checker started: interval=%.1fh, batch_size=%d, auto_heal=%s, dry_run=%s",
        interval_hours,
        self.batch_size,
        self.auto_heal,
        self.dry_run,
    )
def stop(self) -> None:
    """Flag shutdown (checked by the scan loops) and cancel the pending timer."""
    self._shutdown = True
    pending = self._timer
    if pending:
        pending.cancel()
        self._timer = None
    logger.info("Integrity checker stopped")
def _schedule_next(self) -> None:
if self._shutdown:
return
self._timer = threading.Timer(self.interval_seconds, self._run_cycle)
self._timer.daemon = True
self._timer.start()
def _run_cycle(self) -> None:
if self._shutdown:
return
try:
self.run_now()
except Exception as e:
logger.error("Integrity check cycle failed: %s", e)
finally:
self._schedule_next()
def run_now(self, auto_heal: Optional[bool] = None, dry_run: Optional[bool] = None) -> IntegrityResult:
    """Run one integrity scan synchronously and record it in the history store.

    Buckets are visited in the order given by the cursor store until the
    batch is exhausted. ``auto_heal`` and ``dry_run`` override the configured
    values for this run only.

    Raises ``RuntimeError`` if another scan currently holds the lock.
    """
    if not self._lock.acquire(blocking=False):
        raise RuntimeError("Integrity scan is already in progress")
    try:
        self._scanning = True
        self._scan_start_time = time.time()
        effective_auto_heal = auto_heal if auto_heal is not None else self.auto_heal
        effective_dry_run = dry_run if dry_run is not None else self.dry_run
        start = self._scan_start_time
        result = IntegrityResult()
        bucket_names = self._list_bucket_names()
        self.cursor_store.clean_stale(bucket_names)
        ordered_buckets = self.cursor_store.get_bucket_order(bucket_names)
        for bucket_name in ordered_buckets:
            if self._batch_exhausted(result):
                break
            result.buckets_scanned += 1
            cursor_key = self.cursor_store.get_last_key(bucket_name)
            key_corrupted = self._check_corrupted_objects(bucket_name, result, effective_auto_heal, effective_dry_run, cursor_key)
            key_orphaned = self._check_orphaned_objects(bucket_name, result, effective_auto_heal, effective_dry_run, cursor_key)
            key_phantom = self._check_phantom_metadata(bucket_name, result, effective_auto_heal, effective_dry_run, cursor_key)
            self._check_stale_versions(bucket_name, result, effective_auto_heal, effective_dry_run)
            self._check_etag_cache(bucket_name, result, effective_auto_heal, effective_dry_run)
            self._check_legacy_metadata(bucket_name, result, effective_auto_heal, effective_dry_run)
            returned_keys = [k for k in (key_corrupted, key_orphaned, key_phantom) if k is not None]
            bucket_exhausted = self._batch_exhausted(result)
            if bucket_exhausted and returned_keys:
                # Resume from the smallest key so no check skips ahead of
                # what the slowest one has covered.
                self.cursor_store.update_bucket(bucket_name, time.time(), last_key=min(returned_keys))
            elif self._shutdown:
                # Interrupted by stop() before any resumable key was
                # produced: the bucket was NOT fully scanned, so keep its
                # existing cursor instead of marking it completed.
                break
            else:
                self.cursor_store.update_bucket(bucket_name, time.time(), completed=True)
        result.execution_time_seconds = time.time() - start
        if result.has_issues or result.errors:
            logger.info(
                "Integrity check completed in %.2fs: corrupted=%d, orphaned=%d, phantom=%d, "
                "stale_versions=%d, etag_cache=%d, legacy_drift=%d, healed=%d, errors=%d%s",
                result.execution_time_seconds,
                result.corrupted_objects,
                result.orphaned_objects,
                result.phantom_metadata,
                result.stale_versions,
                result.etag_cache_inconsistencies,
                result.legacy_metadata_drifts,
                result.issues_healed,
                len(result.errors),
                " (dry run)" if effective_dry_run else "",
            )
        record = IntegrityExecutionRecord(
            timestamp=time.time(),
            result=result.to_dict(),
            dry_run=effective_dry_run,
            auto_heal=effective_auto_heal,
        )
        self.history_store.add(record)
        return result
    finally:
        self._scanning = False
        self._scan_start_time = None
        self._lock.release()
def run_async(self, auto_heal: Optional[bool] = None, dry_run: Optional[bool] = None) -> bool:
    """Start ``run_now`` on a daemon thread.

    Returns False without starting anything when a scan is flagged as
    running, True otherwise.
    """
    if not self._scanning:
        worker = threading.Thread(
            target=self.run_now,
            args=(auto_heal, dry_run),
            daemon=True,
        )
        worker.start()
        return True
    return False
def _system_path(self) -> Path:
return self.storage_root / self.SYSTEM_ROOT
def _list_bucket_names(self) -> List[str]:
names = []
try:
for entry in self.storage_root.iterdir():
if entry.is_dir() and entry.name != self.SYSTEM_ROOT:
names.append(entry.name)
except OSError:
pass
return names
def _throttle(self) -> bool:
if self._shutdown:
return True
if self._io_throttle > 0:
time.sleep(self._io_throttle)
return self._shutdown
def _batch_exhausted(self, result: IntegrityResult) -> bool:
return self._shutdown or result.objects_scanned >= self.batch_size
def _add_issue(self, result: IntegrityResult, issue: IntegrityIssue) -> None:
    """Append ``issue`` to ``result.issues`` unless the list already holds ``MAX_ISSUES`` entries."""
    if len(result.issues) >= MAX_ISSUES:
        return
    result.issues.append(issue)
def _collect_index_keys(
self, meta_root: Path, cursor_key: Optional[str] = None,
) -> Dict[str, Dict[str, Any]]:
all_keys: Dict[str, Dict[str, Any]] = {}
if not meta_root.exists():
return all_keys
try:
for index_file in meta_root.rglob("_index.json"):
if not index_file.is_file():
continue
rel_dir = index_file.parent.relative_to(meta_root)
dir_prefix = "" if rel_dir == Path(".") else rel_dir.as_posix()
if cursor_key is not None and dir_prefix:
full_prefix = dir_prefix + "/"
if not cursor_key.startswith(full_prefix) and cursor_key > full_prefix:
continue
try:
index_data = json.loads(index_file.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
continue
for key_name, entry in index_data.items():
full_key = (dir_prefix + "/" + key_name) if dir_prefix else key_name
if cursor_key is not None and full_key <= cursor_key:
continue
all_keys[full_key] = {
"entry": entry,
"index_file": index_file,
"key_name": key_name,
}
except OSError:
pass
return all_keys
def _walk_bucket_files_sorted(
self, bucket_path: Path, cursor_key: Optional[str] = None,
):
def _walk(dir_path: Path, prefix: str):
try:
entries = list(os.scandir(dir_path))
except OSError:
return
def _sort_key(e):
if e.is_dir(follow_symlinks=False):
return e.name + "/"
return e.name
entries.sort(key=_sort_key)
for entry in entries:
if entry.is_dir(follow_symlinks=False):
if not prefix and entry.name in self.INTERNAL_FOLDERS:
continue
new_prefix = (prefix + "/" + entry.name) if prefix else entry.name
if cursor_key is not None:
full_prefix = new_prefix + "/"
if not cursor_key.startswith(full_prefix) and cursor_key > full_prefix:
continue
yield from _walk(Path(entry.path), new_prefix)
elif entry.is_file(follow_symlinks=False):
full_key = (prefix + "/" + entry.name) if prefix else entry.name
if cursor_key is not None and full_key <= cursor_key:
continue
yield full_key
yield from _walk(bucket_path, "")
# Compare the ETag stored in the metadata index with the ETag computed from the
# file on disk, for every indexed key after cursor_key (visited in sorted order).
# A mismatch increments result.corrupted_objects and is reported via _add_issue.
# With auto_heal and not dry_run, the index entry is rewritten with the freshly
# computed ETag plus the file's current size and mtime.
# Returns the last key whose file existed and was counted, or None when the batch
# was already exhausted, the metadata directory is missing, or nothing was counted.
def _check_corrupted_objects(
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool,
cursor_key: Optional[str] = None,
) -> Optional[str]:
if self._batch_exhausted(result):
return None
bucket_path = self.storage_root / bucket_name
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
if not meta_root.exists():
return None
last_key = None
try:
all_keys = self._collect_index_keys(meta_root, cursor_key)
sorted_keys = sorted(all_keys.keys())
for full_key in sorted_keys:
# Stop early (keeping the cursor at the last counted key) when _throttle()
# returns true or the batch budget is used up.
if self._throttle():
return last_key
if self._batch_exhausted(result):
return last_key
info = all_keys[full_key]
entry = info["entry"]
index_file = info["index_file"]
key_name = info["key_name"]
object_path = bucket_path / full_key
# Index entries whose file is missing are skipped here without being counted
# and without advancing last_key.
if not object_path.exists():
continue
result.objects_scanned += 1
last_key = full_key
meta = entry.get("metadata", {}) if isinstance(entry, dict) else {}
stored_etag = meta.get("__etag__")
# Entries without a stored ETag cannot be verified and are skipped.
if not stored_etag:
continue
try:
actual_etag = _compute_etag(object_path)
except OSError:
continue
if actual_etag != stored_etag:
result.corrupted_objects += 1
issue = IntegrityIssue(
issue_type="corrupted_object",
bucket=bucket_name,
key=full_key,
detail=f"stored_etag={stored_etag} actual_etag={actual_etag}",
)
if auto_heal and not dry_run:
try:
stat = object_path.stat()
meta["__etag__"] = actual_etag
meta["__size__"] = str(stat.st_size)
meta["__last_modified__"] = str(stat.st_mtime)
# The index is re-read just before writing so the update is applied to its
# current contents.
# NOTE(review): if this re-read fails, index_data becomes {} and the file is
# rewritten containing only this key - confirm that dropping the other
# entries is intended.
try:
index_data = json.loads(index_file.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
index_data = {}
index_data[key_name] = {"metadata": meta}
self._atomic_write_index(index_file, index_data)
issue.healed = True
issue.heal_action = "updated etag in index"
result.issues_healed += 1
except OSError as e:
result.errors.append(f"heal corrupted {bucket_name}/{full_key}: {e}")
self._add_issue(result, issue)
except OSError as e:
result.errors.append(f"check corrupted {bucket_name}: {e}")
return last_key
# Find files in the bucket directory that have no entry in the metadata index.
# Files are visited through _walk_bucket_files_sorted, starting after cursor_key.
# Each file without an entry increments result.orphaned_objects and is reported
# via _add_issue. With auto_heal and not dry_run, a metadata entry holding the
# computed ETag, size and mtime is written into the matching _index.json.
# Returns the last key that was counted, or None if nothing was counted.
def _check_orphaned_objects(
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool,
cursor_key: Optional[str] = None,
) -> Optional[str]:
if self._batch_exhausted(result):
return None
bucket_path = self.storage_root / bucket_name
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
last_key = None
try:
for full_key in self._walk_bucket_files_sorted(bucket_path, cursor_key):
if self._throttle():
return last_key
if self._batch_exhausted(result):
return last_key
result.objects_scanned += 1
last_key = full_key
# The index for a key lives in the meta directory that mirrors the key's parent
# folder; keys at the bucket root use meta_root/_index.json.
key_path = Path(full_key)
key_name = key_path.name
parent = key_path.parent
if parent == Path("."):
index_path = meta_root / "_index.json"
else:
index_path = meta_root / parent / "_index.json"
has_entry = False
if index_path.exists():
try:
index_data = json.loads(index_path.read_text(encoding="utf-8"))
has_entry = key_name in index_data
except (OSError, json.JSONDecodeError):
# An unreadable or unparsable index is treated the same as a missing entry.
pass
if not has_entry:
result.orphaned_objects += 1
issue = IntegrityIssue(
issue_type="orphaned_object",
bucket=bucket_name,
key=full_key,
detail="file exists without metadata entry",
)
if auto_heal and not dry_run:
try:
object_path = bucket_path / full_key
etag = _compute_etag(object_path)
stat = object_path.stat()
meta = {
"__etag__": etag,
"__size__": str(stat.st_size),
"__last_modified__": str(stat.st_mtime),
}
# NOTE(review): when the existing index cannot be read, healing starts from an
# empty dict and the rewritten file holds only this key - confirm intended.
index_data = {}
if index_path.exists():
try:
index_data = json.loads(index_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
pass
index_data[key_name] = {"metadata": meta}
self._atomic_write_index(index_path, index_data)
issue.healed = True
issue.heal_action = "created metadata entry"
result.issues_healed += 1
except OSError as e:
result.errors.append(f"heal orphaned {bucket_name}/{full_key}: {e}")
self._add_issue(result, issue)
except OSError as e:
result.errors.append(f"check orphaned {bucket_name}: {e}")
return last_key
def _check_phantom_metadata(
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool,
cursor_key: Optional[str] = None,
) -> Optional[str]:
if self._batch_exhausted(result):
return None
bucket_path = self.storage_root / bucket_name
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
if not meta_root.exists():
return None
last_key = None
try:
all_keys = self._collect_index_keys(meta_root, cursor_key)
sorted_keys = sorted(all_keys.keys())
heal_by_index: Dict[Path, List[str]] = {}
for full_key in sorted_keys:
if self._batch_exhausted(result):
break
result.objects_scanned += 1
last_key = full_key
object_path = bucket_path / full_key
if not object_path.exists():
result.phantom_metadata += 1
info = all_keys[full_key]
issue = IntegrityIssue(
issue_type="phantom_metadata",
bucket=bucket_name,
key=full_key,
detail="metadata entry without file on disk",
)
if auto_heal and not dry_run:
index_file = info["index_file"]
heal_by_index.setdefault(index_file, []).append(info["key_name"])
issue.healed = True
issue.heal_action = "removed stale index entry"
result.issues_healed += 1
self._add_issue(result, issue)
if heal_by_index and auto_heal and not dry_run:
for index_file, keys_to_remove in heal_by_index.items():
try:
index_data = json.loads(index_file.read_text(encoding="utf-8"))
for k in keys_to_remove:
index_data.pop(k, None)
if index_data:
self._atomic_write_index(index_file, index_data)
else:
index_file.unlink(missing_ok=True)
except OSError as e:
result.errors.append(f"heal phantom {bucket_name}: {e}")
except OSError as e:
result.errors.append(f"check phantom {bucket_name}: {e}")
return last_key
# Detect version artifacts that have lost their counterpart.
# In every directory below the bucket's versions root, *.bin and *.json files are
# paired by stem: a .bin without a .json ("version data without manifest") and a
# .json without a .bin ("version manifest without data") each increment
# result.stale_versions and are reported via _add_issue.
# With auto_heal and not dry_run, the unpaired file is deleted.
def _check_stale_versions(
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
) -> None:
if self._batch_exhausted(result):
return
versions_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_VERSIONS_DIR
if not versions_root.exists():
return
try:
# rglob("*") yields entries below versions_root only, so files sitting directly
# in versions_root itself are not paired here.
for key_dir in versions_root.rglob("*"):
if self._throttle():
return
if self._batch_exhausted(result):
return
if not key_dir.is_dir():
continue
bin_files = {f.stem: f for f in key_dir.glob("*.bin")}
json_files = {f.stem: f for f in key_dir.glob("*.json")}
# First pass: data files that have no manifest.
for stem, bin_file in bin_files.items():
if self._batch_exhausted(result):
return
result.objects_scanned += 1
if stem not in json_files:
result.stale_versions += 1
issue = IntegrityIssue(
issue_type="stale_version",
bucket=bucket_name,
key=f"{key_dir.relative_to(versions_root).as_posix()}/{bin_file.name}",
detail="version data without manifest",
)
if auto_heal and not dry_run:
try:
bin_file.unlink(missing_ok=True)
issue.healed = True
issue.heal_action = "removed orphaned version data"
result.issues_healed += 1
except OSError as e:
result.errors.append(f"heal stale version {bin_file}: {e}")
self._add_issue(result, issue)
# Second pass: manifests that have no data file. The lookup uses the bin_files
# mapping built before any healing in the first pass.
for stem, json_file in json_files.items():
if self._batch_exhausted(result):
return
result.objects_scanned += 1
if stem not in bin_files:
result.stale_versions += 1
issue = IntegrityIssue(
issue_type="stale_version",
bucket=bucket_name,
key=f"{key_dir.relative_to(versions_root).as_posix()}/{json_file.name}",
detail="version manifest without data",
)
if auto_heal and not dry_run:
try:
json_file.unlink(missing_ok=True)
issue.healed = True
issue.heal_action = "removed orphaned version manifest"
result.issues_healed += 1
except OSError as e:
result.errors.append(f"heal stale version {json_file}: {e}")
self._add_issue(result, issue)
except OSError as e:
result.errors.append(f"check stale versions {bucket_name}: {e}")
# Cross-check the bucket's etag_index.json cache against the per-directory
# _index.json metadata files.
# For each cached key whose index entry carries an "__etag__" that differs from
# the cached value, result.etag_cache_inconsistencies is incremented and an issue
# is reported via _add_issue. Cached keys with no index file, no entry or no
# stored ETag are skipped.
# With auto_heal and not dry_run, any mismatch causes etag_index.json to be
# deleted as a whole.
def _check_etag_cache(
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
) -> None:
if self._batch_exhausted(result):
return
etag_index_path = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / "etag_index.json"
if not etag_index_path.exists():
return
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
if not meta_root.exists():
return
try:
etag_cache = json.loads(etag_index_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
return
found_mismatch = False
for full_key, cached_etag in etag_cache.items():
if self._batch_exhausted(result):
break
result.objects_scanned += 1
# Locate the _index.json that mirrors the key's parent directory.
key_path = Path(full_key)
key_name = key_path.name
parent = key_path.parent
if parent == Path("."):
index_path = meta_root / "_index.json"
else:
index_path = meta_root / parent / "_index.json"
if not index_path.exists():
continue
try:
index_data = json.loads(index_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
continue
entry = index_data.get(key_name)
if not entry:
continue
meta = entry.get("metadata", {}) if isinstance(entry, dict) else {}
stored_etag = meta.get("__etag__")
if stored_etag and cached_etag != stored_etag:
result.etag_cache_inconsistencies += 1
found_mismatch = True
issue = IntegrityIssue(
issue_type="etag_cache_inconsistency",
bucket=bucket_name,
key=full_key,
detail=f"cached_etag={cached_etag} index_etag={stored_etag}",
)
self._add_issue(result, issue)
if found_mismatch and auto_heal and not dry_run:
try:
etag_index_path.unlink(missing_ok=True)
# Only issues present in result.issues are marked healed and counted, so
# mismatches that _add_issue did not store do not add to issues_healed.
for issue in result.issues:
if issue.issue_type == "etag_cache_inconsistency" and issue.bucket == bucket_name and not issue.healed:
issue.healed = True
issue.heal_action = "deleted etag_index.json"
result.issues_healed += 1
except OSError as e:
result.errors.append(f"heal etag cache {bucket_name}: {e}")
# Reconcile legacy per-object "<key>.meta.json" files stored under
# <bucket>/.meta with the _index.json metadata files.
# Two kinds of drift increment result.legacy_metadata_drifts:
#   * no index entry exists for the key ("unmigrated legacy .meta.json");
#   * the legacy content differs from the entry's "metadata" value.
# With auto_heal and not dry_run, an unmigrated file is copied into the index and
# then deleted; a differing file is deleted without touching the index.
def _check_legacy_metadata(
self, bucket_name: str, result: IntegrityResult, auto_heal: bool, dry_run: bool
) -> None:
if self._batch_exhausted(result):
return
legacy_meta_root = self.storage_root / bucket_name / ".meta"
if not legacy_meta_root.exists():
return
meta_root = self._system_path() / self.SYSTEM_BUCKETS_DIR / bucket_name / self.BUCKET_META_DIR
try:
for meta_file in legacy_meta_root.rglob("*.meta.json"):
if self._throttle():
return
if self._batch_exhausted(result):
return
if not meta_file.is_file():
continue
result.objects_scanned += 1
try:
rel = meta_file.relative_to(legacy_meta_root)
except ValueError:
continue
# The object key is the legacy file's relative path without ".meta.json".
full_key = rel.as_posix().removesuffix(".meta.json")
key_path = Path(full_key)
key_name = key_path.name
parent = key_path.parent
if parent == Path("."):
index_path = meta_root / "_index.json"
else:
index_path = meta_root / parent / "_index.json"
try:
legacy_data = json.loads(meta_file.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
continue
index_entry = None
if index_path.exists():
try:
index_data = json.loads(index_path.read_text(encoding="utf-8"))
index_entry = index_data.get(key_name)
except (OSError, json.JSONDecodeError):
# An unreadable index leaves index_entry as None, i.e. "not migrated".
pass
if index_entry is None:
result.legacy_metadata_drifts += 1
issue = IntegrityIssue(
issue_type="legacy_metadata_drift",
bucket=bucket_name,
key=full_key,
detail="unmigrated legacy .meta.json",
)
if auto_heal and not dry_run:
try:
# NOTE(review): when the existing index cannot be read, migration starts from
# an empty dict and the rewritten file holds only this key - confirm intended.
index_data = {}
if index_path.exists():
try:
index_data = json.loads(index_path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
pass
index_data[key_name] = {"metadata": legacy_data}
self._atomic_write_index(index_path, index_data)
# The legacy file is removed only after the index write returned.
meta_file.unlink(missing_ok=True)
issue.healed = True
issue.heal_action = "migrated to index and deleted legacy file"
result.issues_healed += 1
except OSError as e:
result.errors.append(f"heal legacy {bucket_name}/{full_key}: {e}")
self._add_issue(result, issue)
else:
index_meta = index_entry.get("metadata", {}) if isinstance(index_entry, dict) else {}
if legacy_data != index_meta:
result.legacy_metadata_drifts += 1
issue = IntegrityIssue(
issue_type="legacy_metadata_drift",
bucket=bucket_name,
key=full_key,
detail="legacy .meta.json differs from index entry",
)
if auto_heal and not dry_run:
try:
meta_file.unlink(missing_ok=True)
issue.healed = True
issue.heal_action = "deleted legacy file (index is authoritative)"
result.issues_healed += 1
except OSError as e:
result.errors.append(f"heal legacy drift {bucket_name}/{full_key}: {e}")
self._add_issue(result, issue)
except OSError as e:
result.errors.append(f"check legacy meta {bucket_name}: {e}")
@staticmethod
def _atomic_write_index(index_path: Path, data: Dict[str, Any]) -> None:
index_path.parent.mkdir(parents=True, exist_ok=True)
tmp_path = index_path.with_suffix(".tmp")
try:
with open(tmp_path, "w", encoding="utf-8") as f:
json.dump(data, f)
os.replace(str(tmp_path), str(index_path))
except BaseException:
try:
tmp_path.unlink(missing_ok=True)
except OSError:
pass
raise
def get_history(self, limit: int = 50, offset: int = 0) -> List[dict]:
    """Return the records from ``history_store.get_history(limit, offset)`` as dicts."""
    return [record.to_dict() for record in self.history_store.get_history(limit, offset)]
def get_status(self) -> dict:
    """Build a snapshot of the scanner's configuration and runtime state.

    ``scan_elapsed_seconds`` is included only while a scan is in progress and
    its start time is known; ``cursor`` is whatever ``cursor_store.get_info()``
    returns.
    """
    has_timer = self._timer is not None
    stopped = self._shutdown
    status: Dict[str, Any] = {}
    status["enabled"] = (not stopped) or has_timer
    status["running"] = has_timer and not stopped
    status["scanning"] = self._scanning
    status["interval_hours"] = self.interval_seconds / 3600.0
    status["batch_size"] = self.batch_size
    status["auto_heal"] = self.auto_heal
    status["dry_run"] = self.dry_run
    # The throttle is multiplied by 1000 and reported as a whole number.
    status["io_throttle_ms"] = round(self._io_throttle * 1000)
    if self._scanning and self._scan_start_time is not None:
        elapsed = time.time() - self._scan_start_time
        status["scan_elapsed_seconds"] = round(elapsed, 1)
    status["cursor"] = self.cursor_store.get_info()
    return status

422
app/kms.py Normal file
View File

@@ -0,0 +1,422 @@
from __future__ import annotations
import base64
import json
import logging
import os
import secrets
import subprocess
import sys
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from .encryption import EncryptionError, EncryptionProvider, EncryptionResult
if sys.platform != "win32":
import fcntl
logger = logging.getLogger(__name__)
def _set_secure_file_permissions(file_path: Path) -> None:
"""Set restrictive file permissions (owner read/write only)."""
if sys.platform == "win32":
try:
username = os.environ.get("USERNAME", "")
if username:
subprocess.run(
["icacls", str(file_path), "/inheritance:r",
"/grant:r", f"{username}:F"],
check=True, capture_output=True
)
else:
logger.warning("Could not set secure permissions on %s: USERNAME not set", file_path)
except (subprocess.SubprocessError, OSError) as exc:
logger.warning("Failed to set secure permissions on %s: %s", file_path, exc)
else:
os.chmod(file_path, 0o600)
@dataclass
class KMSKey:
    """Represents a KMS encryption key."""

    key_id: str
    description: str
    created_at: str
    enabled: bool = True
    # Raw key bytes; excluded from repr() so they do not appear in logs.
    key_material: bytes = field(default_factory=lambda: b"", repr=False)

    @property
    def arn(self) -> str:
        """ARN-style identifier derived from ``key_id``."""
        return f"arn:aws:kms:local:000000000000:key/{self.key_id}"

    def to_dict(self, include_key: bool = False) -> Dict[str, Any]:
        """Serialize the key's metadata.

        ``KeyMaterial`` (base64 of the raw bytes) is added only when
        ``include_key`` is true.
        """
        state = "Enabled" if self.enabled else "Disabled"
        payload: Dict[str, Any] = dict(
            KeyId=self.key_id,
            Arn=self.arn,
            Description=self.description,
            CreationDate=self.created_at,
            Enabled=self.enabled,
            KeyState=state,
            KeyUsage="ENCRYPT_DECRYPT",
            KeySpec="SYMMETRIC_DEFAULT",
        )
        if not include_key:
            return payload
        payload["KeyMaterial"] = base64.b64encode(self.key_material).decode()
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "KMSKey":
        """Build a key from a ``to_dict``-style mapping.

        ``KeyId`` is required. A missing ``CreationDate`` defaults to the
        current UTC time; a missing ``KeyMaterial`` leaves the material empty.
        """
        material = base64.b64decode(data["KeyMaterial"]) if "KeyMaterial" in data else b""
        created = data.get("CreationDate", datetime.now(timezone.utc).isoformat())
        return cls(
            key_id=data["KeyId"],
            description=data.get("Description", ""),
            created_at=created,
            enabled=data.get("Enabled", True),
            key_material=material,
        )
class KMSEncryptionProvider(EncryptionProvider):
    """Encryption provider using a specific KMS key."""

    def __init__(self, kms: "KMSManager", key_id: str):
        self.kms = kms
        self.key_id = key_id

    @property
    def KEY_ID(self) -> str:
        """The KMS key id this provider was created for."""
        return self.key_id

    def generate_data_key(self) -> tuple[bytes, bytes]:
        """Generate a data key encrypted with the KMS key."""
        return self.kms.generate_data_key(self.key_id)

    def encrypt(self, plaintext: bytes, context: Dict[str, str] | None = None) -> EncryptionResult:
        """Encrypt data using envelope encryption with KMS.

        A fresh data key encrypts ``plaintext`` with AES-GCM under a random
        12-byte nonce. A non-empty ``context`` is bound as associated data
        (its sorted-key JSON encoding).
        """
        data_key, wrapped_key = self.generate_data_key()
        cipher = AESGCM(data_key)
        nonce = secrets.token_bytes(12)
        aad = json.dumps(context, sort_keys=True).encode() if context else None
        sealed = cipher.encrypt(nonce, plaintext, aad)
        return EncryptionResult(
            ciphertext=sealed,
            nonce=nonce,
            key_id=self.key_id,
            encrypted_data_key=wrapped_key,
        )

    def decrypt(self, ciphertext: bytes, nonce: bytes, encrypted_data_key: bytes,
                key_id: str, context: Dict[str, str] | None = None) -> bytes:
        """Decrypt data using envelope encryption with KMS.

        Raises:
            EncryptionError: if the unwrapped data key is not 32 bytes long or
                the AES-GCM decryption fails.
        """
        # The data key itself is unwrapped without an encryption context.
        data_key = self.kms.decrypt_data_key(key_id, encrypted_data_key, context=None)
        if len(data_key) != 32:
            raise EncryptionError("Invalid data key size")
        cipher = AESGCM(data_key)
        try:
            aad = json.dumps(context, sort_keys=True).encode() if context else None
            return cipher.decrypt(nonce, ciphertext, aad)
        except Exception as exc:
            logger.debug("KMS decryption failed: %s", exc)
            raise EncryptionError("Failed to decrypt data") from exc

    def decrypt_data_key(self, encrypted_data_key: bytes, key_id: str | None = None) -> bytes:
        """Decrypt an encrypted data key using KMS.

        ``key_id`` defaults to this provider's key.

        Raises:
            EncryptionError: if the unwrapped key is not 32 bytes long.
        """
        effective_key_id = self.key_id if key_id is None else key_id
        data_key = self.kms.decrypt_data_key(effective_key_id, encrypted_data_key, context=None)
        if len(data_key) != 32:
            raise EncryptionError("Invalid data key size")
        return data_key
class KMSManager:
    """Manages KMS keys and operations.

    This is a local implementation that mimics AWS KMS functionality.
    Keys are stored encrypted on disk.

    Blobs produced by ``encrypt`` are laid out as: 2-byte big-endian key-id
    length, UTF-8 key id, 12-byte nonce, AES-GCM ciphertext (which ends with a
    16-byte tag).
    """

    _KEY_ID_LEN_BYTES = 2
    _NONCE_BYTES = 12
    _GCM_TAG_BYTES = 16
    _MAX_KEY_ID_BYTES = 0xFFFF
    _DATA_KEY_SIZES = {"AES_256": 32, "AES_128": 16}

    def __init__(
        self,
        keys_path: Path,
        master_key_path: Path,
        generate_data_key_min_bytes: int = 1,
        generate_data_key_max_bytes: int = 1024,
    ):
        self.keys_path = keys_path
        self.master_key_path = master_key_path
        self.generate_data_key_min_bytes = generate_data_key_min_bytes
        self.generate_data_key_max_bytes = generate_data_key_max_bytes
        self._keys: Dict[str, KMSKey] = {}
        self._master_key: bytes | None = None
        self._master_aesgcm: AESGCM | None = None
        self._loaded = False

    @property
    def master_key(self) -> bytes:
        """Load or create the master key for encrypting KMS keys (with file locking)."""
        if self._master_key is None:
            lock_path = self.master_key_path.with_suffix(".lock")
            lock_path.parent.mkdir(parents=True, exist_ok=True)
            with open(lock_path, "w") as lock_file:
                if sys.platform == "win32":
                    import msvcrt
                    msvcrt.locking(lock_file.fileno(), msvcrt.LK_LOCK, 1)
                else:
                    fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
                try:
                    if self.master_key_path.exists():
                        self._master_key = base64.b64decode(
                            self.master_key_path.read_text().strip()
                        )
                    else:
                        self._master_key = secrets.token_bytes(32)
                        self.master_key_path.write_text(
                            base64.b64encode(self._master_key).decode()
                        )
                        _set_secure_file_permissions(self.master_key_path)
                finally:
                    if sys.platform == "win32":
                        import msvcrt
                        msvcrt.locking(lock_file.fileno(), msvcrt.LK_UNLCK, 1)
                    else:
                        fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
            self._master_aesgcm = AESGCM(self._master_key)
        return self._master_key

    def _load_keys(self) -> None:
        """Load keys from disk (once; later calls return immediately)."""
        if self._loaded:
            return
        if self.keys_path.exists():
            try:
                data = json.loads(self.keys_path.read_text(encoding="utf-8"))
                for key_data in data.get("keys", []):
                    key = KMSKey.from_dict(key_data)
                    if key_data.get("EncryptedKeyMaterial"):
                        encrypted = base64.b64decode(key_data["EncryptedKeyMaterial"])
                        key.key_material = self._decrypt_key_material(encrypted)
                    self._keys[key.key_id] = key
            except json.JSONDecodeError as exc:
                logger.error("Failed to parse KMS keys file: %s", exc)
            except (ValueError, KeyError) as exc:
                logger.error("Invalid KMS key data: %s", exc)
        self._loaded = True

    def _save_keys(self) -> None:
        """Save keys to disk (with encrypted key material).

        The file is written to a sibling temp file, secured, and then moved
        into place with ``os.replace`` so an interrupted write cannot leave a
        truncated key store behind.
        """
        keys_data = []
        for key in self._keys.values():
            data = key.to_dict(include_key=False)
            encrypted = self._encrypt_key_material(key.key_material)
            data["EncryptedKeyMaterial"] = base64.b64encode(encrypted).decode()
            keys_data.append(data)
        self.keys_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = self.keys_path.with_name(self.keys_path.name + ".tmp")
        try:
            tmp_path.write_text(
                json.dumps({"keys": keys_data}, indent=2),
                encoding="utf-8"
            )
            _set_secure_file_permissions(tmp_path)
            os.replace(str(tmp_path), str(self.keys_path))
        except BaseException:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass
            raise

    def _encrypt_key_material(self, key_material: bytes) -> bytes:
        """Encrypt key material under the master key; returns nonce + ciphertext."""
        _ = self.master_key  # accessing the property initializes _master_aesgcm
        nonce = secrets.token_bytes(12)
        ciphertext = self._master_aesgcm.encrypt(nonce, key_material, None)
        return nonce + ciphertext

    def _decrypt_key_material(self, encrypted: bytes) -> bytes:
        """Reverse ``_encrypt_key_material``."""
        _ = self.master_key  # accessing the property initializes _master_aesgcm
        nonce = encrypted[:12]
        ciphertext = encrypted[12:]
        return self._master_aesgcm.decrypt(nonce, ciphertext, None)

    def create_key(self, description: str = "", key_id: str | None = None) -> KMSKey:
        """Create a new KMS key.

        Raises:
            EncryptionError: if ``key_id`` is already in use.
        """
        self._load_keys()
        if key_id is None:
            key_id = str(uuid.uuid4())
        if key_id in self._keys:
            raise EncryptionError(f"Key already exists: {key_id}")
        key = KMSKey(
            key_id=key_id,
            description=description,
            created_at=datetime.now(timezone.utc).isoformat(),
            enabled=True,
            key_material=secrets.token_bytes(32),
        )
        self._keys[key_id] = key
        self._save_keys()
        return key

    def get_key(self, key_id: str) -> KMSKey | None:
        """Get a key by ID."""
        self._load_keys()
        return self._keys.get(key_id)

    def list_keys(self) -> List[KMSKey]:
        """List all keys."""
        self._load_keys()
        return list(self._keys.values())

    def get_default_key_id(self) -> str:
        """Get the default KMS key ID, creating one if none exist.

        The default is the first enabled key in load order.
        """
        self._load_keys()
        for key in self._keys.values():
            if key.enabled:
                return key.key_id
        default_key = self.create_key(description="Default KMS Key")
        return default_key.key_id

    def get_provider(self, key_id: str | None = None) -> "KMSEncryptionProvider":
        """Get a KMS encryption provider for the specified key.

        Raises:
            EncryptionError: if the key does not exist or is disabled.
        """
        if key_id is None:
            key_id = self.get_default_key_id()
        key = self.get_key(key_id)
        if not key:
            raise EncryptionError(f"Key not found: {key_id}")
        if not key.enabled:
            raise EncryptionError(f"Key is disabled: {key_id}")
        return KMSEncryptionProvider(self, key_id)

    def enable_key(self, key_id: str) -> None:
        """Enable a key."""
        self._load_keys()
        key = self._keys.get(key_id)
        if not key:
            raise EncryptionError(f"Key not found: {key_id}")
        key.enabled = True
        self._save_keys()

    def disable_key(self, key_id: str) -> None:
        """Disable a key."""
        self._load_keys()
        key = self._keys.get(key_id)
        if not key:
            raise EncryptionError(f"Key not found: {key_id}")
        key.enabled = False
        self._save_keys()

    def delete_key(self, key_id: str) -> None:
        """Delete a key (schedule for deletion in real KMS)."""
        self._load_keys()
        if key_id not in self._keys:
            raise EncryptionError(f"Key not found: {key_id}")
        del self._keys[key_id]
        self._save_keys()

    def encrypt(self, key_id: str, plaintext: bytes,
                context: Dict[str, str] | None = None) -> bytes:
        """Encrypt data directly with a KMS key.

        A non-empty ``context`` is bound as AES-GCM associated data (its
        sorted-key JSON encoding) and must be supplied again to decrypt.

        Raises:
            EncryptionError: if the key is missing or disabled, or its id does
                not fit the 2-byte length prefix of the blob format.
        """
        self._load_keys()
        key = self._keys.get(key_id)
        if not key:
            raise EncryptionError(f"Key not found: {key_id}")
        if not key.enabled:
            raise EncryptionError(f"Key is disabled: {key_id}")
        key_id_bytes = key_id.encode("utf-8")
        if len(key_id_bytes) > self._MAX_KEY_ID_BYTES:
            # to_bytes(2, ...) below would otherwise raise OverflowError.
            raise EncryptionError("Key ID is too long")
        aesgcm = AESGCM(key.key_material)
        nonce = secrets.token_bytes(12)
        aad = json.dumps(context, sort_keys=True).encode() if context else None
        ciphertext = aesgcm.encrypt(nonce, plaintext, aad)
        return len(key_id_bytes).to_bytes(2, "big") + key_id_bytes + nonce + ciphertext

    def decrypt(self, ciphertext: bytes,
                context: Dict[str, str] | None = None) -> tuple[bytes, str]:
        """Decrypt data directly with a KMS key.

        The key is selected by the id embedded in ``ciphertext``.

        Returns:
            Tuple of (plaintext, key_id)

        Raises:
            EncryptionError: if the blob is malformed, the key is missing or
                disabled, or authentication of the ciphertext fails.
        """
        self._load_keys()
        header = self._KEY_ID_LEN_BYTES
        if len(ciphertext) < header:
            raise EncryptionError("Malformed ciphertext")
        key_id_len = int.from_bytes(ciphertext[:header], "big")
        body_start = header + key_id_len
        # The blob must at least hold the key id, a nonce and a GCM tag.
        if len(ciphertext) < body_start + self._NONCE_BYTES + self._GCM_TAG_BYTES:
            raise EncryptionError("Malformed ciphertext")
        try:
            key_id = ciphertext[header:body_start].decode("utf-8")
        except UnicodeDecodeError as exc:
            # The blob is caller-supplied; garbage must surface as an
            # EncryptionError, not as an unhandled decoding error.
            raise EncryptionError("Malformed ciphertext") from exc
        rest = ciphertext[body_start:]
        key = self._keys.get(key_id)
        if not key:
            raise EncryptionError(f"Key not found: {key_id}")
        if not key.enabled:
            raise EncryptionError(f"Key is disabled: {key_id}")
        nonce = rest[:12]
        encrypted = rest[12:]
        aesgcm = AESGCM(key.key_material)
        aad = json.dumps(context, sort_keys=True).encode() if context else None
        try:
            plaintext = aesgcm.decrypt(nonce, encrypted, aad)
            return plaintext, key_id
        except Exception as exc:
            logger.debug("KMS decrypt operation failed: %s", exc)
            raise EncryptionError("Decryption failed") from exc

    def generate_data_key(self, key_id: str,
                          context: Dict[str, str] | None = None,
                          key_spec: str = "AES_256") -> tuple[bytes, bytes]:
        """Generate a data key and return both plaintext and encrypted versions.

        Args:
            key_id: The KMS key ID to use for encryption
            context: Optional encryption context
            key_spec: Key specification - AES_128 or AES_256 (default)

        Returns:
            Tuple of (plaintext_key, encrypted_key)

        Raises:
            EncryptionError: if the key is missing or disabled, or ``key_spec``
                is not one of the supported values.
        """
        self._load_keys()
        key = self._keys.get(key_id)
        if not key:
            raise EncryptionError(f"Key not found: {key_id}")
        if not key.enabled:
            raise EncryptionError(f"Key is disabled: {key_id}")
        key_bytes = self._DATA_KEY_SIZES.get(key_spec)
        if key_bytes is None:
            # An unknown spec must not silently fall back to a 128-bit key.
            raise EncryptionError(f"Unsupported key spec: {key_spec}")
        plaintext_key = secrets.token_bytes(key_bytes)
        encrypted_key = self.encrypt(key_id, plaintext_key, context)
        return plaintext_key, encrypted_key

    def decrypt_data_key(self, key_id: str, encrypted_key: bytes,
                         context: Dict[str, str] | None = None) -> bytes:
        """Decrypt a data key.

        NOTE(review): ``key_id`` is accepted for interface compatibility but is
        not consulted; the key embedded in ``encrypted_key`` is the one used.
        Confirm whether a mismatch should be rejected.
        """
        plaintext, _ = self.decrypt(encrypted_key, context)
        return plaintext

    def re_encrypt(self, ciphertext: bytes, destination_key_id: str,
                   source_context: Dict[str, str] | None = None,
                   destination_context: Dict[str, str] | None = None) -> bytes:
        """Re-encrypt data with a different key."""
        plaintext, _ = self.decrypt(ciphertext, source_context)
        return self.encrypt(destination_key_id, plaintext, destination_context)

    def generate_random(self, num_bytes: int = 32) -> bytes:
        """Generate cryptographically secure random bytes.

        Raises:
            EncryptionError: if ``num_bytes`` is outside the configured
                minimum/maximum.
        """
        if num_bytes < self.generate_data_key_min_bytes or num_bytes > self.generate_data_key_max_bytes:
            raise EncryptionError(
                f"Number of bytes must be between {self.generate_data_key_min_bytes} and {self.generate_data_key_max_bytes}"
            )
        return secrets.token_bytes(num_bytes)

444
app/kms_api.py Normal file
View File

@@ -0,0 +1,444 @@
from __future__ import annotations
import base64
import uuid
from typing import Any, Dict
from flask import Blueprint, Response, current_app, jsonify, request
from .encryption import ClientEncryptionHelper, EncryptionError
from .extensions import limiter
from .iam import IamError
kms_api_bp = Blueprint("kms_api", __name__, url_prefix="/kms")
def _require_principal():
    """Require authentication for KMS operations.

    Delegates to the S3 API's ``_require_principal``; the import is done at
    call time.
    """
    from . import s3_api
    return s3_api._require_principal()
def _kms():
    """Get KMS manager from app extensions (``None`` when not registered)."""
    extensions = current_app.extensions
    return extensions.get("kms")
def _encryption():
    """Get encryption manager from app extensions (``None`` when not registered)."""
    extensions = current_app.extensions
    return extensions.get("encryption")
def _error_response(code: str, message: str, status: int) -> tuple[Dict[str, Any], int]:
return {"__type": code, "message": message}, status
@kms_api_bp.route("/keys", methods=["GET", "POST"])
@limiter.limit("30 per minute")
def list_or_create_keys():
    """List all KMS keys or create a new key.

    POST accepts an optional ``KeyId``/``key_id`` and
    ``Description``/``description`` in the JSON body and returns the new key's
    metadata; any other method returns the id and ARN of every key.
    """
    principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    if request.method != "POST":
        summaries = [{"KeyId": k.key_id, "KeyArn": k.arn} for k in kms.list_keys()]
        return jsonify({
            "Keys": summaries,
            "Truncated": False,
        })
    body = request.get_json(silent=True) or {}
    requested_id = body.get("KeyId") or body.get("key_id")
    description = body.get("Description") or body.get("description", "")
    try:
        key = kms.create_key(description=description, key_id=requested_id)
    except EncryptionError as exc:
        return _error_response("KMSInternalException", str(exc), 400)
    current_app.logger.info(
        "KMS key created",
        extra={"key_id": key.key_id, "principal": principal.access_key},
    )
    return jsonify({
        "KeyMetadata": key.to_dict(),
    })
@kms_api_bp.route("/keys/<key_id>", methods=["GET", "DELETE"])
@limiter.limit("30 per minute")
def get_or_delete_key(key_id: str):
    """Get or delete a specific KMS key.

    DELETE answers 204 on success; both methods answer 404 when the key does
    not exist.
    """
    principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    if request.method != "DELETE":
        key = kms.get_key(key_id)
        if key:
            return jsonify({"KeyMetadata": key.to_dict()})
        return _error_response("NotFoundException", f"Key not found: {key_id}", 404)
    try:
        kms.delete_key(key_id)
    except EncryptionError as exc:
        return _error_response("NotFoundException", str(exc), 404)
    current_app.logger.info(
        "KMS key deleted",
        extra={"key_id": key_id, "principal": principal.access_key},
    )
    return Response(status=204)
@kms_api_bp.route("/keys/<key_id>/enable", methods=["POST"])
@limiter.limit("30 per minute")
def enable_key(key_id: str):
    """Enable a KMS key; answers 200 on success and 404 for an unknown key."""
    principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    try:
        kms.enable_key(key_id)
    except EncryptionError as exc:
        return _error_response("NotFoundException", str(exc), 404)
    audit = {"key_id": key_id, "principal": principal.access_key}
    current_app.logger.info("KMS key enabled", extra=audit)
    return Response(status=200)
@kms_api_bp.route("/keys/<key_id>/disable", methods=["POST"])
@limiter.limit("30 per minute")
def disable_key(key_id: str):
    """Disable a KMS key; answers 200 on success and 404 for an unknown key."""
    principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    try:
        kms.disable_key(key_id)
    except EncryptionError as exc:
        return _error_response("NotFoundException", str(exc), 404)
    audit = {"key_id": key_id, "principal": principal.access_key}
    current_app.logger.info("KMS key disabled", extra=audit)
    return Response(status=200)
@kms_api_bp.route("/encrypt", methods=["POST"])
@limiter.limit("60 per minute")
def encrypt_data():
    """Encrypt data using a KMS key.

    Expects JSON with ``KeyId``, base64 ``Plaintext`` and an optional
    ``EncryptionContext``; responds with the base64 ``CiphertextBlob``.
    """
    principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    body = request.get_json(silent=True) or {}
    key_id = body.get("KeyId")
    encoded_plaintext = body.get("Plaintext")
    context = body.get("EncryptionContext")
    # Required fields are checked in this order so the first missing one is reported.
    for value, problem in ((key_id, "KeyId is required"), (encoded_plaintext, "Plaintext is required")):
        if not value:
            return _error_response("ValidationException", problem, 400)
    try:
        plaintext = base64.b64decode(encoded_plaintext)
    except Exception:
        return _error_response("ValidationException", "Plaintext must be base64 encoded", 400)
    try:
        blob = kms.encrypt(key_id, plaintext, context)
    except EncryptionError as exc:
        return _error_response("KMSInternalException", str(exc), 400)
    return jsonify({
        "CiphertextBlob": base64.b64encode(blob).decode(),
        "KeyId": key_id,
        "EncryptionAlgorithm": "SYMMETRIC_DEFAULT",
    })
@kms_api_bp.route("/decrypt", methods=["POST"])
@limiter.limit("60 per minute")
def decrypt_data():
    """Decrypt data using a KMS key.

    Expects JSON with a base64 ``CiphertextBlob`` and an optional
    ``EncryptionContext``; responds with the base64 ``Plaintext`` and the id
    of the key that decrypted it.
    """
    principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    body = request.get_json(silent=True) or {}
    encoded_blob = body.get("CiphertextBlob")
    context = body.get("EncryptionContext")
    if not encoded_blob:
        return _error_response("ValidationException", "CiphertextBlob is required", 400)
    try:
        blob = base64.b64decode(encoded_blob)
    except Exception:
        return _error_response("ValidationException", "CiphertextBlob must be base64 encoded", 400)
    try:
        plaintext, key_id = kms.decrypt(blob, context)
    except EncryptionError as exc:
        return _error_response("InvalidCiphertextException", str(exc), 400)
    return jsonify({
        "Plaintext": base64.b64encode(plaintext).decode(),
        "KeyId": key_id,
        "EncryptionAlgorithm": "SYMMETRIC_DEFAULT",
    })
@kms_api_bp.route("/generate-data-key", methods=["POST"])
@limiter.limit("60 per minute")
def generate_data_key():
    """Generate a data encryption key.

    Expects JSON with ``KeyId``, an optional ``EncryptionContext`` and an
    optional ``KeySpec`` (``AES_256`` by default, or ``AES_128``). Responds
    with the base64 plaintext key and the base64 ``CiphertextBlob`` that wraps
    that same key.
    """
    principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    payload = request.get_json(silent=True) or {}
    key_id = payload.get("KeyId")
    context = payload.get("EncryptionContext")
    key_spec = payload.get("KeySpec", "AES_256")
    if not key_id:
        return _error_response("ValidationException", "KeyId is required", 400)
    if key_spec not in {"AES_256", "AES_128"}:
        return _error_response("ValidationException", "KeySpec must be AES_256 or AES_128", 400)
    try:
        # The requested size is passed down so the wrapped key and the returned
        # plaintext are the same key. Truncating a 256-bit plaintext afterwards
        # would leave CiphertextBlob decrypting to 32 bytes the client never saw.
        plaintext_key, encrypted_key = kms.generate_data_key(key_id, context, key_spec=key_spec)
        return jsonify({
            "Plaintext": base64.b64encode(plaintext_key).decode(),
            "CiphertextBlob": base64.b64encode(encrypted_key).decode(),
            "KeyId": key_id,
        })
    except EncryptionError as exc:
        return _error_response("KMSInternalException", str(exc), 400)
@kms_api_bp.route("/generate-data-key-without-plaintext", methods=["POST"])
@limiter.limit("60 per minute")
def generate_data_key_without_plaintext():
    """Generate a data encryption key and return only its encrypted form.

    Request body (JSON object):
        KeyId: required identifier of the KMS key to wrap the data key with.
        EncryptionContext: optional value forwarded to ``kms.generate_data_key``.

    Returns:
        A JSON response with the base64 ``CiphertextBlob`` and the ``KeyId``;
        or the error produced by ``_require_principal`` / ``_error_response``
        when authentication, configuration or validation fails.
    """
    _principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    payload = request.get_json(silent=True) or {}
    if not isinstance(payload, dict):
        # Non-object JSON bodies would otherwise crash on payload.get().
        return _error_response("ValidationException", "Request body must be a JSON object", 400)
    key_id = payload.get("KeyId")
    context = payload.get("EncryptionContext")
    if not key_id:
        return _error_response("ValidationException", "KeyId is required", 400)
    try:
        # The plaintext half of the pair is deliberately discarded.
        _, encrypted_key = kms.generate_data_key(key_id, context)
        return jsonify({
            "CiphertextBlob": base64.b64encode(encrypted_key).decode(),
            "KeyId": key_id,
        })
    except EncryptionError as exc:
        return _error_response("KMSInternalException", str(exc), 400)
@kms_api_bp.route("/re-encrypt", methods=["POST"])
@limiter.limit("30 per minute")
def re_encrypt():
    """Decrypt a ciphertext and encrypt the result under another key.

    Request body (JSON object):
        CiphertextBlob: required base64 ciphertext to decrypt.
        DestinationKeyId: required key id used for the new encryption.
        SourceEncryptionContext: optional context passed to ``kms.decrypt``.
        DestinationEncryptionContext: optional context passed to ``kms.encrypt``.

    Returns:
        A JSON response with the new base64 ``CiphertextBlob``, the
        ``SourceKeyId`` reported by ``kms.decrypt`` and the destination
        ``KeyId``; or an error response on authentication, configuration,
        validation or ``EncryptionError`` failures.
    """
    _principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    payload = request.get_json(silent=True) or {}
    if not isinstance(payload, dict):
        # Non-object JSON bodies would otherwise crash on payload.get().
        return _error_response("ValidationException", "Request body must be a JSON object", 400)
    ciphertext_b64 = payload.get("CiphertextBlob")
    destination_key_id = payload.get("DestinationKeyId")
    source_context = payload.get("SourceEncryptionContext")
    destination_context = payload.get("DestinationEncryptionContext")
    if not ciphertext_b64:
        return _error_response("ValidationException", "CiphertextBlob is required", 400)
    if not destination_key_id:
        return _error_response("ValidationException", "DestinationKeyId is required", 400)
    try:
        ciphertext = base64.b64decode(ciphertext_b64)
    except Exception:
        return _error_response("ValidationException", "CiphertextBlob must be base64 encoded", 400)
    try:
        plaintext, source_key_id = kms.decrypt(ciphertext, source_context)
        new_ciphertext = kms.encrypt(destination_key_id, plaintext, destination_context)
        return jsonify({
            "CiphertextBlob": base64.b64encode(new_ciphertext).decode(),
            "SourceKeyId": source_key_id,
            "KeyId": destination_key_id,
        })
    except EncryptionError as exc:
        return _error_response("KMSInternalException", str(exc), 400)
@kms_api_bp.route("/generate-random", methods=["POST"])
@limiter.limit("60 per minute")
def generate_random():
    """Return random bytes produced by the KMS backend.

    Request body (JSON object):
        NumberOfBytes: optional count, default 32; must be convertible by ``int()``.

    Returns:
        A JSON response with the base64 ``Plaintext`` bytes; or an error
        response on authentication, configuration, validation or
        ``EncryptionError`` failures.
    """
    _principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    payload = request.get_json(silent=True) or {}
    if not isinstance(payload, dict):
        # Non-object JSON bodies would otherwise crash on payload.get().
        return _error_response("ValidationException", "Request body must be a JSON object", 400)
    num_bytes = payload.get("NumberOfBytes", 32)
    try:
        num_bytes = int(num_bytes)
    except (TypeError, ValueError):
        return _error_response("ValidationException", "NumberOfBytes must be an integer", 400)
    try:
        random_bytes = kms.generate_random(num_bytes)
        return jsonify({
            "Plaintext": base64.b64encode(random_bytes).decode(),
        })
    except EncryptionError as exc:
        return _error_response("ValidationException", str(exc), 400)
@kms_api_bp.route("/client/generate-key", methods=["POST"])
@limiter.limit("30 per minute")
def generate_client_key():
    """Create a client-side encryption key for an authenticated caller.

    Returns the error from ``_require_principal`` when there is one, otherwise
    the JSON form of ``ClientEncryptionHelper.generate_client_key()``.
    """
    _principal, auth_failure = _require_principal()
    if auth_failure:
        return auth_failure
    return jsonify(ClientEncryptionHelper.generate_client_key())
@kms_api_bp.route("/client/encrypt", methods=["POST"])
@limiter.limit("60 per minute")
def client_encrypt():
    """Encrypt caller-supplied data with a caller-supplied key.

    Request body (JSON object):
        Plaintext: required base64 data to encrypt.
        Key: required key, passed unchanged to ``ClientEncryptionHelper.encrypt_with_key``.

    Returns:
        The helper's result as JSON; or an error response when authentication
        or validation fails, or when decoding/encryption raises.
    """
    _principal, error = _require_principal()
    if error:
        return error
    payload = request.get_json(silent=True) or {}
    if not isinstance(payload, dict):
        # Non-object JSON bodies would otherwise crash on payload.get().
        return _error_response("ValidationException", "Request body must be a JSON object", 400)
    plaintext_b64 = payload.get("Plaintext")
    key_b64 = payload.get("Key")
    if not plaintext_b64 or not key_b64:
        return _error_response("ValidationException", "Plaintext and Key are required", 400)
    try:
        plaintext = base64.b64decode(plaintext_b64)
        result = ClientEncryptionHelper.encrypt_with_key(plaintext, key_b64)
        return jsonify(result)
    except Exception as exc:
        # Any decode or helper failure is reported to the caller as a 400.
        return _error_response("EncryptionError", str(exc), 400)
@kms_api_bp.route("/client/decrypt", methods=["POST"])
@limiter.limit("60 per minute")
def client_decrypt():
    """Decrypt caller-supplied data with a caller-supplied key.

    Request body (JSON object); each field is accepted capitalised or lowercase:
        Ciphertext / ciphertext: required.
        Nonce / nonce: required.
        Key / key: required.

    Returns:
        A JSON response with the base64 ``Plaintext``; or an error response
        when authentication or validation fails, or when decryption raises.
    """
    _principal, error = _require_principal()
    if error:
        return error
    payload = request.get_json(silent=True) or {}
    if not isinstance(payload, dict):
        # Non-object JSON bodies would otherwise crash on payload.get().
        return _error_response("ValidationException", "Request body must be a JSON object", 400)
    ciphertext_b64 = payload.get("Ciphertext") or payload.get("ciphertext")
    nonce_b64 = payload.get("Nonce") or payload.get("nonce")
    key_b64 = payload.get("Key") or payload.get("key")
    if not ciphertext_b64 or not nonce_b64 or not key_b64:
        return _error_response("ValidationException", "Ciphertext, Nonce, and Key are required", 400)
    try:
        plaintext = ClientEncryptionHelper.decrypt_with_key(ciphertext_b64, nonce_b64, key_b64)
        return jsonify({
            "Plaintext": base64.b64encode(plaintext).decode(),
        })
    except Exception as exc:
        # Any helper failure is reported to the caller as a 400.
        return _error_response("DecryptionError", str(exc), 400)
@kms_api_bp.route("/materials/<key_id>", methods=["POST"])
@limiter.limit("60 per minute")
def get_encryption_materials(key_id: str):
    """Get encryption materials for client-side S3 encryption.

    This is used by S3 encryption clients that want to use KMS for
    key management but perform encryption client-side.

    Args:
        key_id: KMS key identifier taken from the URL path.

    Request body (JSON object, optional):
        EncryptionContext: optional value forwarded to ``kms.generate_data_key``.

    Returns:
        A JSON response with the base64 ``PlaintextKey`` and ``EncryptedKey``,
        the ``KeyId`` and fixed ``Algorithm`` / ``KeyWrapAlgorithm`` labels; or
        an error response on authentication, configuration, validation or
        ``EncryptionError`` failures.
    """
    _principal, error = _require_principal()
    if error:
        return error
    kms = _kms()
    if not kms:
        return _error_response("KMSNotEnabled", "KMS is not configured", 400)
    payload = request.get_json(silent=True) or {}
    if not isinstance(payload, dict):
        # Non-object JSON bodies would otherwise crash on payload.get().
        return _error_response("ValidationException", "Request body must be a JSON object", 400)
    context = payload.get("EncryptionContext")
    try:
        plaintext_key, encrypted_key = kms.generate_data_key(key_id, context)
        return jsonify({
            "PlaintextKey": base64.b64encode(plaintext_key).decode(),
            "EncryptedKey": base64.b64encode(encrypted_key).decode(),
            "KeyId": key_id,
            "Algorithm": "AES-256-GCM",
            "KeyWrapAlgorithm": "kms",
        })
    except EncryptionError as exc:
        return _error_response("KMSInternalException", str(exc), 400)

340
app/lifecycle.py Normal file
View File

@@ -0,0 +1,340 @@
from __future__ import annotations
import json
import logging
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from .storage import ObjectStorage, StorageError
# Module-level logger used by the lifecycle history store and manager below.
logger = logging.getLogger(__name__)
@dataclass
class LifecycleResult:
    """Counters and errors collected while enforcing lifecycle rules on one bucket."""

    # Bucket the enforcement pass ran against.
    bucket_name: str
    # Number of current objects removed by expiration rules.
    objects_deleted: int = 0
    # Number of archived (noncurrent) versions removed.
    versions_deleted: int = 0
    # Number of incomplete multipart uploads aborted.
    uploads_aborted: int = 0
    # Human-readable error messages gathered during the pass.
    errors: List[str] = field(default_factory=list)
    # Wall-clock duration of the pass, in seconds.
    execution_time_seconds: float = 0.0
@dataclass
class LifecycleExecutionRecord:
    """Timestamped snapshot of one lifecycle pass, convertible to and from a plain dict."""

    timestamp: float
    bucket_name: str
    objects_deleted: int
    versions_deleted: int
    uploads_aborted: int
    errors: List[str]
    execution_time_seconds: float

    def to_dict(self) -> dict:
        """Return the record as a dict keyed by field name, in declaration order."""
        field_names = (
            "timestamp",
            "bucket_name",
            "objects_deleted",
            "versions_deleted",
            "uploads_aborted",
            "errors",
            "execution_time_seconds",
        )
        return {name: getattr(self, name) for name in field_names}

    @classmethod
    def from_dict(cls, data: dict) -> "LifecycleExecutionRecord":
        """Build a record from a dict; every key except ``errors`` is mandatory.

        Raises:
            KeyError: when a mandatory key is missing.
        """
        mandatory = (
            "timestamp",
            "bucket_name",
            "objects_deleted",
            "versions_deleted",
            "uploads_aborted",
            "execution_time_seconds",
        )
        values = {name: data[name] for name in mandatory}
        # ``errors`` is the only optional key and defaults to an empty list.
        values["errors"] = data.get("errors", [])
        return cls(**values)

    @classmethod
    def from_result(cls, result: LifecycleResult) -> "LifecycleExecutionRecord":
        """Capture ``result`` as a record stamped with the current time."""
        captured_at = time.time()
        # Copy the error list so later changes to the result do not leak in.
        error_snapshot = result.errors.copy()
        return cls(
            captured_at,
            result.bucket_name,
            result.objects_deleted,
            result.versions_deleted,
            result.uploads_aborted,
            error_snapshot,
            result.execution_time_seconds,
        )
class LifecycleHistoryStore:
    """JSON-file store of recent lifecycle execution records, one file per bucket.

    Records live under
    ``<storage_root>/.myfsio.sys/buckets/<bucket>/lifecycle_history.json`` with
    the newest record first.
    """

    def __init__(self, storage_root: Path, max_history_per_bucket: int = 50) -> None:
        """Remember the storage root and the per-bucket record cap."""
        self.storage_root = storage_root
        self.max_history_per_bucket = max_history_per_bucket
        # Serialises the read-modify-write cycle in add_record().
        self._lock = threading.Lock()

    def _get_history_path(self, bucket_name: str) -> Path:
        """Return the history file path for ``bucket_name``."""
        return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "lifecycle_history.json"

    def load_history(self, bucket_name: str) -> List[LifecycleExecutionRecord]:
        """Load every stored record for a bucket.

        Returns an empty list when the file is missing, unreadable, or does
        not have the expected shape; failures are logged, never raised.
        """
        path = self._get_history_path(bucket_name)
        if not path.exists():
            return []
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            return [LifecycleExecutionRecord.from_dict(d) for d in data.get("executions", [])]
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            # TypeError/AttributeError cover valid JSON of the wrong shape
            # (e.g. a top-level list or null entries); a damaged history file
            # must not abort lifecycle enforcement.
            logger.error(f"Failed to load lifecycle history for {bucket_name}: {e}")
            return []

    def save_history(self, bucket_name: str, records: List[LifecycleExecutionRecord]) -> None:
        """Write ``records`` to disk, keeping at most ``max_history_per_bucket`` of them.

        Write failures are logged, not raised.
        """
        path = self._get_history_path(bucket_name)
        path.parent.mkdir(parents=True, exist_ok=True)
        data = {"executions": [r.to_dict() for r in records[:self.max_history_per_bucket]]}
        try:
            with open(path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
        except OSError as e:
            logger.error(f"Failed to save lifecycle history for {bucket_name}: {e}")

    def add_record(self, bucket_name: str, record: LifecycleExecutionRecord) -> None:
        """Prepend ``record`` to the bucket's history and persist it."""
        with self._lock:
            records = self.load_history(bucket_name)
            records.insert(0, record)
            self.save_history(bucket_name, records)

    def get_history(self, bucket_name: str, limit: int = 50, offset: int = 0) -> List[LifecycleExecutionRecord]:
        """Return up to ``limit`` records starting at ``offset`` (newest first)."""
        records = self.load_history(bucket_name)
        return records[offset:offset + limit]
class LifecycleManager:
    """Applies bucket lifecycle rules on a timer or on demand.

    For each enabled rule it can expire current objects, expire archived
    versions, and abort stale multipart uploads. Optionally records each
    non-trivial run in a ``LifecycleHistoryStore``.
    """

    def __init__(
        self,
        storage: ObjectStorage,
        interval_seconds: int = 3600,
        storage_root: Optional[Path] = None,
        max_history_per_bucket: int = 50,
    ):
        """Create a manager.

        Args:
            storage: Backend that lists and deletes objects, versions and uploads.
            interval_seconds: Delay between scheduled enforcement runs.
            storage_root: When given, execution history is persisted under it.
            max_history_per_bucket: Record cap handed to the history store.
        """
        self.storage = storage
        self.interval_seconds = interval_seconds
        self.storage_root = storage_root
        self._timer: Optional[threading.Timer] = None
        self._shutdown = False
        self._lock = threading.Lock()
        self.history_store = LifecycleHistoryStore(storage_root, max_history_per_bucket) if storage_root else None

    def start(self) -> None:
        """Schedule the first enforcement run; a no-op if a timer already exists."""
        if self._timer is not None:
            return
        self._shutdown = False
        self._schedule_next()
        logger.info(f"Lifecycle manager started with interval {self.interval_seconds}s")

    def stop(self) -> None:
        """Cancel the pending timer and prevent further scheduling."""
        self._shutdown = True
        if self._timer:
            self._timer.cancel()
            self._timer = None
        logger.info("Lifecycle manager stopped")

    def _schedule_next(self) -> None:
        """Arm a daemon timer for the next run unless shutdown was requested."""
        if self._shutdown:
            return
        self._timer = threading.Timer(self.interval_seconds, self._run_enforcement)
        self._timer.daemon = True
        self._timer.start()

    def _run_enforcement(self) -> None:
        """Timer callback: enforce every bucket, then re-arm the timer."""
        if self._shutdown:
            return
        try:
            self.enforce_all_buckets()
        except Exception as e:
            logger.error(f"Lifecycle enforcement failed: {e}")
        finally:
            self._schedule_next()

    def enforce_all_buckets(self) -> Dict[str, LifecycleResult]:
        """Enforce rules on every bucket.

        Returns:
            Results keyed by bucket name, only for buckets where something
            was deleted or aborted.
        """
        results = {}
        try:
            buckets = self.storage.list_buckets()
            for bucket in buckets:
                result = self.enforce_rules(bucket.name)
                if result.objects_deleted > 0 or result.versions_deleted > 0 or result.uploads_aborted > 0:
                    results[bucket.name] = result
        except StorageError as e:
            logger.error(f"Failed to list buckets for lifecycle: {e}")
        return results

    def enforce_rules(self, bucket_name: str) -> LifecycleResult:
        """Apply every enabled lifecycle rule of one bucket.

        Storage errors are collected in ``result.errors`` rather than raised.
        A run is logged and recorded in history only when it changed
        something or produced errors.
        """
        start_time = time.time()
        result = LifecycleResult(bucket_name=bucket_name)
        try:
            lifecycle = self.storage.get_bucket_lifecycle(bucket_name)
            if not lifecycle:
                return result
            for rule in lifecycle:
                if rule.get("Status") != "Enabled":
                    continue
                # A rule may carry "Filter": null; treat anything that is not
                # a dict as "no filter" instead of failing on .get().
                filter_cfg = rule.get("Filter")
                if not isinstance(filter_cfg, dict):
                    filter_cfg = {}
                prefix = rule.get("Prefix", filter_cfg.get("Prefix", ""))
                self._enforce_expiration(bucket_name, rule, prefix, result)
                self._enforce_noncurrent_expiration(bucket_name, rule, prefix, result)
                self._enforce_abort_multipart(bucket_name, rule, result)
        except StorageError as e:
            result.errors.append(str(e))
            logger.error(f"Lifecycle enforcement error for {bucket_name}: {e}")
        result.execution_time_seconds = time.time() - start_time
        if result.objects_deleted > 0 or result.versions_deleted > 0 or result.uploads_aborted > 0 or result.errors:
            logger.info(
                f"Lifecycle enforcement for {bucket_name}: "
                f"deleted={result.objects_deleted}, versions={result.versions_deleted}, "
                f"aborted={result.uploads_aborted}, time={result.execution_time_seconds:.2f}s"
            )
            if self.history_store:
                record = LifecycleExecutionRecord.from_result(result)
                self.history_store.add_record(bucket_name, record)
        return result

    @staticmethod
    def _ensure_utc(moment: datetime) -> datetime:
        """Return ``moment`` unchanged if timezone-aware, else tagged as UTC.

        Cutoffs are timezone-aware; comparing them with a naive datetime
        raises TypeError, so naive values are assumed to be UTC.
        """
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment

    @classmethod
    def _parse_utc(cls, value: str) -> datetime:
        """Parse an ISO-8601 string (``Z`` suffix allowed) into an aware datetime.

        Raises:
            ValueError: when ``value`` is not a valid ISO-8601 timestamp.
        """
        return cls._ensure_utc(datetime.fromisoformat(value.replace("Z", "+00:00")))

    def _enforce_expiration(
        self, bucket_name: str, rule: Dict[str, Any], prefix: str, result: LifecycleResult
    ) -> None:
        """Delete current objects last modified before the rule's expiration cutoff.

        The cutoff is ``now - Days`` when ``Days`` is set, otherwise the parsed
        ``Date``; a rule with neither, or with an unparseable date, is skipped.
        """
        expiration = rule.get("Expiration", {})
        if not expiration:
            return
        days = expiration.get("Days")
        date_str = expiration.get("Date")
        if days:
            cutoff = datetime.now(timezone.utc) - timedelta(days=days)
        elif date_str:
            try:
                cutoff = self._parse_utc(date_str)
            except ValueError:
                return
        else:
            return
        try:
            objects = self.storage.list_objects_all(bucket_name)
            for obj in objects:
                if prefix and not obj.key.startswith(prefix):
                    continue
                if self._ensure_utc(obj.last_modified) < cutoff:
                    try:
                        self.storage.delete_object(bucket_name, obj.key)
                        result.objects_deleted += 1
                    except StorageError as e:
                        result.errors.append(f"Failed to delete {obj.key}: {e}")
        except StorageError as e:
            result.errors.append(f"Failed to list objects: {e}")

    def _expire_versions(
        self,
        bucket_name: str,
        object_key: str,
        cutoff: datetime,
        result: LifecycleResult,
        error_label: str,
    ) -> None:
        """Delete the archived versions of one key that were archived before ``cutoff``.

        Per-version failures are appended to ``result.errors`` prefixed with
        ``error_label``. A failure to list the versions is only logged at
        debug level, keeping it out of ``result.errors`` as before.
        """
        try:
            versions = self.storage.list_object_versions(bucket_name, object_key)
            for version in versions:
                archived_at_str = version.get("archived_at", "")
                if not archived_at_str:
                    continue
                try:
                    if self._parse_utc(archived_at_str) < cutoff:
                        version_id = version.get("version_id")
                        if version_id:
                            self.storage.delete_object_version(bucket_name, object_key, version_id)
                            result.versions_deleted += 1
                except (ValueError, StorageError) as e:
                    result.errors.append(f"{error_label}: {e}")
        except StorageError as e:
            logger.debug(f"Skipping versions of {bucket_name}/{object_key}: {e}")

    def _enforce_noncurrent_expiration(
        self, bucket_name: str, rule: Dict[str, Any], prefix: str, result: LifecycleResult
    ) -> None:
        """Delete archived versions older than ``NoncurrentDays``.

        Covers versions of objects that still exist and versions of keys
        reported by ``storage.list_orphaned_objects``.
        """
        noncurrent = rule.get("NoncurrentVersionExpiration", {})
        noncurrent_days = noncurrent.get("NoncurrentDays")
        if not noncurrent_days:
            return
        cutoff = datetime.now(timezone.utc) - timedelta(days=noncurrent_days)
        try:
            objects = self.storage.list_objects_all(bucket_name)
            for obj in objects:
                if prefix and not obj.key.startswith(prefix):
                    continue
                self._expire_versions(bucket_name, obj.key, cutoff, result, "Failed to process version")
        except StorageError as e:
            result.errors.append(f"Failed to list objects: {e}")
        try:
            orphaned = self.storage.list_orphaned_objects(bucket_name)
            for item in orphaned:
                obj_key = item.get("key", "")
                if prefix and not obj_key.startswith(prefix):
                    continue
                self._expire_versions(bucket_name, obj_key, cutoff, result, "Failed to process orphaned version")
        except StorageError as e:
            result.errors.append(f"Failed to list orphaned objects: {e}")

    def _enforce_abort_multipart(
        self, bucket_name: str, rule: Dict[str, Any], result: LifecycleResult
    ) -> None:
        """Abort multipart uploads created more than ``DaysAfterInitiation`` days ago."""
        abort_config = rule.get("AbortIncompleteMultipartUpload", {})
        days_after = abort_config.get("DaysAfterInitiation")
        if not days_after:
            return
        cutoff = datetime.now(timezone.utc) - timedelta(days=days_after)
        try:
            uploads = self.storage.list_multipart_uploads(bucket_name)
            for upload in uploads:
                created_at_str = upload.get("created_at", "")
                if not created_at_str:
                    continue
                try:
                    if self._parse_utc(created_at_str) < cutoff:
                        upload_id = upload.get("upload_id")
                        if upload_id:
                            self.storage.abort_multipart_upload(bucket_name, upload_id)
                            result.uploads_aborted += 1
                except (ValueError, StorageError) as e:
                    result.errors.append(f"Failed to abort upload: {e}")
        except StorageError as e:
            result.errors.append(f"Failed to list multipart uploads: {e}")

    def run_now(self, bucket_name: Optional[str] = None) -> Dict[str, LifecycleResult]:
        """Run enforcement immediately for one bucket, or for all when none is named."""
        if bucket_name:
            return {bucket_name: self.enforce_rules(bucket_name)}
        return self.enforce_all_buckets()

    def get_execution_history(self, bucket_name: str, limit: int = 50, offset: int = 0) -> List[LifecycleExecutionRecord]:
        """Return stored execution records, or an empty list when history is disabled."""
        if not self.history_store:
            return []
        return self.history_store.get_history(bucket_name, limit, offset)

406
app/notifications.py Normal file
View File

@@ -0,0 +1,406 @@
from __future__ import annotations
import ipaddress
import json
import logging
import queue
import socket
import threading
import time
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
import requests
from urllib3.util.connection import create_connection as _urllib3_create_connection
def _resolve_and_check_url(url: str, allow_internal: bool = False) -> Optional[str]:
try:
parsed = urlparse(url)
hostname = parsed.hostname
if not hostname:
return None
cloud_metadata_hosts = {
"metadata.google.internal",
"169.254.169.254",
}
if hostname.lower() in cloud_metadata_hosts:
return None
if allow_internal:
return hostname
blocked_hosts = {
"localhost",
"127.0.0.1",
"0.0.0.0",
"::1",
"[::1]",
}
if hostname.lower() in blocked_hosts:
return None
try:
resolved_ip = socket.gethostbyname(hostname)
ip = ipaddress.ip_address(resolved_ip)
if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
return None
return resolved_ip
except (socket.gaierror, ValueError):
return None
except Exception:
return None
def _is_safe_url(url: str, allow_internal: bool = False) -> bool:
return _resolve_and_check_url(url, allow_internal) is not None
# Serialises the temporary replacement of urllib3's connection factory below.
_dns_pin_lock = threading.Lock()


def _pinned_post(url: str, pinned_ip: str, **kwargs: Any) -> requests.Response:
    """POST to ``url`` while forcing connections to its host onto ``pinned_ip``.

    The caller resolves and validates the address first; pinning ensures the
    request goes to that same address instead of resolving the name again.

    Args:
        url: Destination URL.
        pinned_ip: Address to connect to whenever the connection targets the
            URL's hostname.
        **kwargs: Passed to ``requests.Session.post``. ``allow_redirects``
            defaults to ``False`` here because a redirect would send the
            request to a host that was never validated; pass it explicitly
            to override. Callers therefore receive a 3xx response as-is.

    Returns:
        The ``requests.Response`` of the POST.
    """
    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    original_create = _urllib3_create_connection

    def _create_pinned(address: Any, *args: Any, **kw: Any) -> Any:
        # Swap in the pinned address only for the validated hostname; every
        # other connection is passed through untouched.
        host, req_port = address
        if host == hostname:
            return original_create((pinned_ip, req_port), *args, **kw)
        return original_create(address, *args, **kw)

    import urllib3.util.connection as _conn_mod

    kwargs.setdefault("allow_redirects", False)
    # The context manager closes the session's connection pool, which was
    # previously left open on every call.
    with requests.Session() as session:
        with _dns_pin_lock:
            _conn_mod.create_connection = _create_pinned
            try:
                return session.post(url, **kwargs)
            finally:
                _conn_mod.create_connection = original_create
# Module-level logger used by NotificationService below.
logger = logging.getLogger(__name__)
@dataclass
class NotificationEvent:
    """One bucket event (object created/removed) to be delivered to webhooks."""

    event_name: str
    bucket_name: str
    object_key: str
    object_size: int = 0
    etag: str = ""
    version_id: Optional[str] = None
    # Defaults to the creation time of the event, in UTC.
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    request_id: str = field(default_factory=lambda: uuid.uuid4().hex)
    source_ip: str = ""
    user_identity: str = ""

    def to_s3_event(self) -> Dict[str, Any]:
        """Render the event as an S3-style notification payload with one record.

        ``eventTime`` is always expressed in UTC: timezone-aware timestamps are
        converted first, naive timestamps are formatted as they are. Empty
        identity, source IP and version id fall back to ``"ANONYMOUS"``,
        ``"127.0.0.1"`` and ``"null"``.
        """
        event_time = self.timestamp
        if event_time.tzinfo is not None and event_time.utcoffset() is not None:
            # The format string below appends a literal "Z", so an aware
            # timestamp in another zone must be converted to UTC first.
            event_time = event_time.astimezone(timezone.utc)
        return {
            "Records": [
                {
                    "eventVersion": "2.1",
                    "eventSource": "myfsio:s3",
                    "awsRegion": "local",
                    "eventTime": event_time.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
                    "eventName": self.event_name,
                    "userIdentity": {
                        "principalId": self.user_identity or "ANONYMOUS",
                    },
                    "requestParameters": {
                        "sourceIPAddress": self.source_ip or "127.0.0.1",
                    },
                    "responseElements": {
                        "x-amz-request-id": self.request_id,
                        "x-amz-id-2": self.request_id,
                    },
                    "s3": {
                        "s3SchemaVersion": "1.0",
                        "configurationId": "notification",
                        "bucket": {
                            "name": self.bucket_name,
                            "ownerIdentity": {"principalId": "local"},
                            "arn": f"arn:aws:s3:::{self.bucket_name}",
                        },
                        "object": {
                            "key": self.object_key,
                            "size": self.object_size,
                            "eTag": self.etag,
                            "versionId": self.version_id or "null",
                            # Hex millisecond clock reading taken at render time.
                            "sequencer": f"{int(time.time() * 1000):016X}",
                        },
                    },
                }
            ]
        }
@dataclass
class WebhookDestination:
    """Where and how a notification is delivered over HTTP."""

    url: str
    headers: Dict[str, str] = field(default_factory=dict)
    timeout_seconds: int = 30
    retry_count: int = 3
    retry_delay_seconds: int = 1

    def to_dict(self) -> Dict[str, Any]:
        """Return the destination settings as a plain dict."""
        return dict(
            url=self.url,
            headers=self.headers,
            timeout_seconds=self.timeout_seconds,
            retry_count=self.retry_count,
            retry_delay_seconds=self.retry_delay_seconds,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "WebhookDestination":
        """Build a destination from a dict, filling in defaults for missing keys."""
        fallbacks = (
            ("url", ""),
            ("headers", {}),
            ("timeout_seconds", 30),
            ("retry_count", 3),
            ("retry_delay_seconds", 1),
        )
        settings = {key: data.get(key, default) for key, default in fallbacks}
        return cls(**settings)
@dataclass
class NotificationConfiguration:
    """A notification rule: which events, for which keys, go to which destination."""

    id: str
    events: List[str]
    destination: WebhookDestination
    prefix_filter: str = ""
    suffix_filter: str = ""

    def matches_event(self, event_name: str, object_key: str) -> bool:
        """Tell whether this rule applies to ``event_name`` on ``object_key``.

        An event pattern ending in ``*`` matches by prefix; any other pattern
        must equal the event name. Non-empty prefix/suffix filters must also
        match the object key.
        """
        def _pattern_accepts(pattern: str) -> bool:
            if pattern.endswith("*"):
                return event_name.startswith(pattern[:-1])
            return pattern == event_name

        if not any(_pattern_accepts(candidate) for candidate in self.events):
            return False
        prefix_rejects = bool(self.prefix_filter) and not object_key.startswith(self.prefix_filter)
        if prefix_rejects:
            return False
        suffix_rejects = bool(self.suffix_filter) and not object_key.endswith(self.suffix_filter)
        return not suffix_rejects

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the rule, always emitting both prefix and suffix filter rules."""
        filter_rules = [
            {"Name": "prefix", "Value": self.prefix_filter},
            {"Name": "suffix", "Value": self.suffix_filter},
        ]
        serialized: Dict[str, Any] = {"Id": self.id}
        serialized["Events"] = self.events
        serialized["Destination"] = self.destination.to_dict()
        serialized["Filter"] = {"Key": {"FilterRules": filter_rules}}
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NotificationConfiguration":
        """Build a rule from its dict form.

        A missing ``Id`` is replaced by a random hex id. When a filter name
        appears more than once, the last occurrence wins.
        """
        filters = {"prefix": "", "suffix": ""}
        rules = data.get("Filter", {}).get("Key", {}).get("FilterRules", [])
        for rule in rules:
            rule_name = rule.get("Name")
            if rule_name in ("prefix", "suffix"):
                filters[rule_name] = rule.get("Value", "")
        return cls(
            id=data.get("Id", uuid.uuid4().hex),
            events=data.get("Events", []),
            destination=WebhookDestination.from_dict(data.get("Destination", {})),
            prefix_filter=filters["prefix"],
            suffix_filter=filters["suffix"],
        )
class NotificationService:
    """Stores per-bucket notification rules and delivers matching events to webhooks.

    Events are queued by ``emit_event`` and sent by background worker threads.
    """

    def __init__(self, storage_root: Path, worker_count: int = 2, allow_internal_endpoints: bool = False):
        """Create the service and start its worker threads.

        Args:
            storage_root: Root directory under which bucket configs are stored.
            worker_count: Number of daemon threads that drain the queue.
            allow_internal_endpoints: Passed to the URL safety check at send time.
        """
        self.storage_root = storage_root
        self._allow_internal_endpoints = allow_internal_endpoints
        self._configs: Dict[str, List[NotificationConfiguration]] = {}
        self._queue: queue.Queue[tuple[NotificationEvent, WebhookDestination]] = queue.Queue()
        self._workers: List[threading.Thread] = []
        self._shutdown = threading.Event()
        self._stats = {
            "events_queued": 0,
            "events_sent": 0,
            "events_failed": 0,
        }
        # Counters are updated from several worker threads and from callers
        # of emit_event(); the lock keeps increments from being lost. It must
        # exist before the workers start.
        self._stats_lock = threading.Lock()
        for i in range(worker_count):
            worker = threading.Thread(target=self._worker_loop, name=f"notification-worker-{i}", daemon=True)
            worker.start()
            self._workers.append(worker)

    def _bump_stat(self, name: str) -> None:
        """Increment one statistics counter under the stats lock."""
        with self._stats_lock:
            self._stats[name] += 1

    def _config_path(self, bucket_name: str) -> Path:
        """Return the notification config file path for ``bucket_name``."""
        return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "notifications.json"

    def get_bucket_notifications(self, bucket_name: str) -> List[NotificationConfiguration]:
        """Return the bucket's rules, from the in-memory cache or from disk.

        A missing, unreadable or malformed config file yields an empty list;
        load failures are logged as warnings.
        """
        if bucket_name in self._configs:
            return self._configs[bucket_name]
        config_path = self._config_path(bucket_name)
        if not config_path.exists():
            return []
        try:
            data = json.loads(config_path.read_text(encoding="utf-8"))
            configs = [NotificationConfiguration.from_dict(c) for c in data.get("configurations", [])]
            self._configs[bucket_name] = configs
            return configs
        except (ValueError, OSError, AttributeError, TypeError) as e:
            # ValueError covers JSONDecodeError; AttributeError/TypeError
            # cover valid JSON of the wrong shape. A damaged config must not
            # make emit_event() raise into the request that triggered it.
            logger.warning(f"Failed to load notification config for {bucket_name}: {e}")
            return []

    def set_bucket_notifications(
        self, bucket_name: str, configurations: List[NotificationConfiguration]
    ) -> None:
        """Persist the bucket's rules and refresh the cache."""
        config_path = self._config_path(bucket_name)
        config_path.parent.mkdir(parents=True, exist_ok=True)
        data = {"configurations": [c.to_dict() for c in configurations]}
        config_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        self._configs[bucket_name] = configurations

    def delete_bucket_notifications(self, bucket_name: str) -> None:
        """Remove the bucket's config file (best effort) and its cache entry."""
        config_path = self._config_path(bucket_name)
        try:
            if config_path.exists():
                config_path.unlink()
        except OSError:
            pass
        self._configs.pop(bucket_name, None)

    def emit_event(self, event: NotificationEvent) -> None:
        """Queue ``event`` once for every rule of its bucket that matches it."""
        configurations = self.get_bucket_notifications(event.bucket_name)
        if not configurations:
            return
        for config in configurations:
            if config.matches_event(event.event_name, event.object_key):
                self._queue.put((event, config.destination))
                self._bump_stat("events_queued")
                logger.debug(
                    f"Queued notification for {event.event_name} on {event.bucket_name}/{event.object_key}"
                )

    def emit_object_created(
        self,
        bucket_name: str,
        object_key: str,
        *,
        size: int = 0,
        etag: str = "",
        version_id: Optional[str] = None,
        request_id: str = "",
        source_ip: str = "",
        user_identity: str = "",
        operation: str = "Put",
    ) -> None:
        """Emit an ``s3:ObjectCreated:<operation>`` event; a blank request id is replaced by a random one."""
        event = NotificationEvent(
            event_name=f"s3:ObjectCreated:{operation}",
            bucket_name=bucket_name,
            object_key=object_key,
            object_size=size,
            etag=etag,
            version_id=version_id,
            request_id=request_id or uuid.uuid4().hex,
            source_ip=source_ip,
            user_identity=user_identity,
        )
        self.emit_event(event)

    def emit_object_removed(
        self,
        bucket_name: str,
        object_key: str,
        *,
        version_id: Optional[str] = None,
        request_id: str = "",
        source_ip: str = "",
        user_identity: str = "",
        operation: str = "Delete",
    ) -> None:
        """Emit an ``s3:ObjectRemoved:<operation>`` event; a blank request id is replaced by a random one."""
        event = NotificationEvent(
            event_name=f"s3:ObjectRemoved:{operation}",
            bucket_name=bucket_name,
            object_key=object_key,
            version_id=version_id,
            request_id=request_id or uuid.uuid4().hex,
            source_ip=source_ip,
            user_identity=user_identity,
        )
        self.emit_event(event)

    def _worker_loop(self) -> None:
        """Worker thread body: send queued notifications until shutdown is signalled."""
        while not self._shutdown.is_set():
            try:
                # The timeout lets the loop re-check the shutdown flag.
                event, destination = self._queue.get(timeout=1.0)
            except queue.Empty:
                continue
            try:
                self._send_notification(event, destination)
                self._bump_stat("events_sent")
            except Exception as e:
                self._bump_stat("events_failed")
                logger.error(f"Failed to send notification: {e}")
            finally:
                self._queue.task_done()

    def _send_notification(self, event: NotificationEvent, destination: WebhookDestination) -> None:
        """Deliver one event to one destination, retrying with a growing delay.

        Raises:
            RuntimeError: when the URL fails the SSRF check, or when every
                attempt fails (HTTP status >= 400 or a request exception).
        """
        resolved_ip = _resolve_and_check_url(destination.url, allow_internal=self._allow_internal_endpoints)
        if not resolved_ip:
            raise RuntimeError(f"Blocked request (SSRF protection): {destination.url}")
        payload = event.to_s3_event()
        headers = {"Content-Type": "application/json", **destination.headers}
        last_error = None
        # A retry_count of zero or less used to mean "never send, then report
        # failure"; always make at least one attempt.
        attempts = max(1, destination.retry_count)
        for attempt in range(attempts):
            try:
                response = _pinned_post(
                    destination.url,
                    resolved_ip,
                    json=payload,
                    headers=headers,
                    timeout=destination.timeout_seconds,
                )
                if response.status_code < 400:
                    logger.info(
                        f"Notification sent: {event.event_name} -> {destination.url} (status={response.status_code})"
                    )
                    return
                last_error = f"HTTP {response.status_code}: {response.text[:200]}"
            except requests.RequestException as e:
                last_error = str(e)
            if attempt < attempts - 1:
                # Linear back-off: delay, 2*delay, 3*delay, ...
                time.sleep(destination.retry_delay_seconds * (attempt + 1))
        raise RuntimeError(f"Failed after {attempts} attempts: {last_error}")

    def get_stats(self) -> Dict[str, int]:
        """Return a copy of the queued/sent/failed counters."""
        with self._stats_lock:
            return dict(self._stats)

    def shutdown(self) -> None:
        """Signal the workers to stop and wait up to five seconds for each."""
        self._shutdown.set()
        for worker in self._workers:
            worker.join(timeout=5.0)

234
app/object_lock.py Normal file
View File

@@ -0,0 +1,234 @@
from __future__ import annotations
import json
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Optional
class RetentionMode(Enum):
    """Retention modes an object lock can carry; values are the serialised mode strings."""

    # Unexpired GOVERNANCE retention can be changed or bypassed when the caller
    # passes bypass_governance (see ObjectLockService).
    GOVERNANCE = "GOVERNANCE"
    # Unexpired COMPLIANCE retention cannot be changed or bypassed.
    COMPLIANCE = "COMPLIANCE"
class ObjectLockError(Exception):
    """Raised when an object-lock rule forbids the requested change."""
    pass
@dataclass
class ObjectLockRetention:
    """Retention setting of a single object: a mode and the date it holds until."""

    mode: RetentionMode
    retain_until_date: datetime

    def to_dict(self) -> Dict[str, str]:
        """Serialise to ``{"Mode": ..., "RetainUntilDate": <ISO-8601>}``."""
        return {
            "Mode": self.mode.value,
            "RetainUntilDate": self.retain_until_date.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Optional["ObjectLockRetention"]:
        """Parse a serialised retention.

        Returns:
            The retention, or ``None`` when ``data`` is empty, a field is
            missing, the mode is unknown, or the date cannot be parsed.
        """
        if not data:
            return None
        mode_str = data.get("Mode")
        date_str = data.get("RetainUntilDate")
        if not mode_str or not date_str:
            return None
        try:
            mode = RetentionMode(mode_str)
            retain_until = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
            return cls(mode=mode, retain_until_date=retain_until)
        except (ValueError, KeyError, AttributeError, TypeError):
            # AttributeError/TypeError: the date field is present but is not
            # a string; treat it like any other unparseable value.
            return None

    def is_expired(self) -> bool:
        """Return True once the retain-until date has passed.

        A timezone-less date is interpreted as UTC; comparing it directly with
        the timezone-aware current time would raise TypeError.
        """
        deadline = self.retain_until_date
        if deadline.tzinfo is None:
            deadline = deadline.replace(tzinfo=timezone.utc)
        return datetime.now(timezone.utc) > deadline
@dataclass
class ObjectLockConfig:
    """Bucket-level object-lock settings: an enabled flag and an optional default retention."""

    enabled: bool = False
    default_retention: Optional[ObjectLockRetention] = None
    # Period the default retention was configured with. Kept so to_dict() can
    # write it back; without it the rule is lost on the next load.
    default_retention_days: Optional[int] = None
    default_retention_years: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the configuration.

        The ``Rule`` entry is present only when a default retention is set.
        Its ``Days`` / ``Years`` carry the stored period, or ``None`` when the
        configuration was built without one.
        """
        result: Dict[str, Any] = {"ObjectLockEnabled": "Enabled" if self.enabled else "Disabled"}
        if self.default_retention:
            result["Rule"] = {
                "DefaultRetention": {
                    "Mode": self.default_retention.mode.value,
                    "Days": self.default_retention_days,
                    "Years": self.default_retention_years,
                }
            }
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ObjectLockConfig":
        """Parse a serialised configuration.

        A default retention is created only when ``Days`` or ``Years`` is
        truthy; ``Years`` takes precedence and counts as 365 days each. The
        retain-until date is computed from the current time.

        Raises:
            ValueError: for an unknown ``Mode`` or a non-numeric period.
        """
        enabled = data.get("ObjectLockEnabled") == "Enabled"
        default_retention = None
        period_days: Optional[int] = None
        period_years: Optional[int] = None
        rule = data.get("Rule")
        if rule and "DefaultRetention" in rule:
            dr = rule["DefaultRetention"]
            mode_str = dr.get("Mode", "GOVERNANCE")
            days = dr.get("Days")
            years = dr.get("Years")
            if days or years:
                from datetime import timedelta
                now = datetime.now(timezone.utc)
                if years:
                    period_years = int(years)
                    delta = timedelta(days=period_years * 365)
                else:
                    period_days = int(days)
                    delta = timedelta(days=period_days)
                default_retention = ObjectLockRetention(
                    mode=RetentionMode(mode_str),
                    retain_until_date=now + delta,
                )
        return cls(
            enabled=enabled,
            default_retention=default_retention,
            default_retention_days=period_days,
            default_retention_years=period_years,
        )
class ObjectLockService:
def __init__(self, storage_root: Path):
self.storage_root = storage_root
self._config_cache: Dict[str, ObjectLockConfig] = {}
def _bucket_lock_config_path(self, bucket_name: str) -> Path:
return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "object_lock.json"
def _object_lock_meta_path(self, bucket_name: str, object_key: str) -> Path:
safe_key = object_key.replace("/", "_").replace("\\", "_")
return (
self.storage_root / ".myfsio.sys" / "buckets" / bucket_name /
"locks" / f"{safe_key}.lock.json"
)
def get_bucket_lock_config(self, bucket_name: str) -> ObjectLockConfig:
if bucket_name in self._config_cache:
return self._config_cache[bucket_name]
config_path = self._bucket_lock_config_path(bucket_name)
if not config_path.exists():
return ObjectLockConfig(enabled=False)
try:
data = json.loads(config_path.read_text(encoding="utf-8"))
config = ObjectLockConfig.from_dict(data)
self._config_cache[bucket_name] = config
return config
except (json.JSONDecodeError, OSError):
return ObjectLockConfig(enabled=False)
def set_bucket_lock_config(self, bucket_name: str, config: ObjectLockConfig) -> None:
config_path = self._bucket_lock_config_path(bucket_name)
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(json.dumps(config.to_dict()), encoding="utf-8")
self._config_cache[bucket_name] = config
def enable_bucket_lock(self, bucket_name: str) -> None:
config = self.get_bucket_lock_config(bucket_name)
config.enabled = True
self.set_bucket_lock_config(bucket_name, config)
def is_bucket_lock_enabled(self, bucket_name: str) -> bool:
return self.get_bucket_lock_config(bucket_name).enabled
def get_object_retention(self, bucket_name: str, object_key: str) -> Optional[ObjectLockRetention]:
meta_path = self._object_lock_meta_path(bucket_name, object_key)
if not meta_path.exists():
return None
try:
data = json.loads(meta_path.read_text(encoding="utf-8"))
return ObjectLockRetention.from_dict(data.get("retention", {}))
except (json.JSONDecodeError, OSError):
return None
def set_object_retention(
self,
bucket_name: str,
object_key: str,
retention: ObjectLockRetention,
bypass_governance: bool = False,
) -> None:
existing = self.get_object_retention(bucket_name, object_key)
if existing and not existing.is_expired():
if existing.mode == RetentionMode.COMPLIANCE:
raise ObjectLockError(
"Cannot modify retention on object with COMPLIANCE mode until retention expires"
)
if existing.mode == RetentionMode.GOVERNANCE and not bypass_governance:
raise ObjectLockError(
"Cannot modify GOVERNANCE retention without bypass-governance permission"
)
meta_path = self._object_lock_meta_path(bucket_name, object_key)
meta_path.parent.mkdir(parents=True, exist_ok=True)
existing_data: Dict[str, Any] = {}
if meta_path.exists():
try:
existing_data = json.loads(meta_path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
pass
existing_data["retention"] = retention.to_dict()
meta_path.write_text(json.dumps(existing_data), encoding="utf-8")
def get_legal_hold(self, bucket_name: str, object_key: str) -> bool:
meta_path = self._object_lock_meta_path(bucket_name, object_key)
if not meta_path.exists():
return False
try:
data = json.loads(meta_path.read_text(encoding="utf-8"))
return data.get("legal_hold", False)
except (json.JSONDecodeError, OSError):
return False
def set_legal_hold(self, bucket_name: str, object_key: str, enabled: bool) -> None:
    """Write the ``"legal_hold"`` flag into the object's lock metadata file.

    Other entries of an existing metadata object are kept. A file that cannot
    be read, is not valid UTF-8 JSON, or does not contain a JSON object is
    replaced by a fresh object holding only the flag.
    """
    meta_path = self._object_lock_meta_path(bucket_name, object_key)
    meta_path.parent.mkdir(parents=True, exist_ok=True)
    existing_data: Dict[str, Any] = {}
    if meta_path.exists():
        try:
            loaded = json.loads(meta_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError, OSError):
            loaded = None
        # Item assignment below needs a dict; non-object JSON would raise
        # TypeError (list) or silently misbehave, so start over instead.
        if isinstance(loaded, dict):
            existing_data = loaded
    existing_data["legal_hold"] = enabled
    meta_path.write_text(json.dumps(existing_data), encoding="utf-8")
def can_delete_object(
    self,
    bucket_name: str,
    object_key: str,
    bypass_governance: bool = False,
) -> tuple[bool, str]:
    """Decide whether an object may be deleted under its lock state.

    Returns:
        ``(True, "")`` when deletion is allowed, otherwise ``(False, reason)``.
        A legal hold always blocks; an unexpired COMPLIANCE retention always
        blocks; an unexpired GOVERNANCE retention blocks unless
        ``bypass_governance`` is true.
    """
    if self.get_legal_hold(bucket_name, object_key):
        return False, "Object is under legal hold"

    retention = self.get_object_retention(bucket_name, object_key)
    if not retention or retention.is_expired():
        return True, ""

    if retention.mode == RetentionMode.COMPLIANCE:
        return False, f"Object is locked in COMPLIANCE mode until {retention.retain_until_date.isoformat()}"
    if retention.mode == RetentionMode.GOVERNANCE and not bypass_governance:
        return False, f"Object is locked in GOVERNANCE mode until {retention.retain_until_date.isoformat()}"
    return True, ""
def can_overwrite_object(
    self,
    bucket_name: str,
    object_key: str,
    bypass_governance: bool = False,
) -> tuple[bool, str]:
    """Decide whether an object may be overwritten.

    Overwrites are judged by exactly the same rules as deletions, so this
    simply forwards to ``can_delete_object``.
    """
    verdict = self.can_delete_object(bucket_name, object_key, bypass_governance)
    return verdict
def delete_object_lock_metadata(self, bucket_name: str, object_key: str) -> None:
    """Remove the object's lock metadata file if it exists.

    Best-effort: any ``OSError`` raised while checking or unlinking is ignored.
    """
    meta_path = self._object_lock_meta_path(bucket_name, object_key)
    try:
        if not meta_path.exists():
            return
        meta_path.unlink()
    except OSError:
        return

296
app/operation_metrics.py Normal file
View File

@@ -0,0 +1,296 @@
from __future__ import annotations
import json
import logging
import random
import threading
import time
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
MAX_LATENCY_SAMPLES = 5000
logger = logging.getLogger(__name__)
@dataclass
class OperationStats:
    """Running totals for a group of requests.

    Tracks request counts, latency sum/min/max, byte counters and a bounded
    list of latency samples (at most ``MAX_LATENCY_SAMPLES``) from which
    percentiles are derived.
    """

    count: int = 0
    success_count: int = 0
    error_count: int = 0
    latency_sum_ms: float = 0.0
    latency_min_ms: float = float("inf")
    latency_max_ms: float = 0.0
    bytes_in: int = 0
    bytes_out: int = 0
    latency_samples: List[float] = field(default_factory=list)

    @staticmethod
    def _compute_percentile(sorted_data: List[float], p: float) -> float:
        """Linearly interpolated percentile ``p`` (0-100) of an ascending list.

        Returns ``0.0`` for an empty list.
        """
        if not sorted_data:
            return 0.0
        last_index = len(sorted_data) - 1
        rank = last_index * (p / 100.0)
        lower = int(rank)
        upper = min(lower + 1, last_index)
        fraction = rank - lower
        low_value = sorted_data[lower]
        return low_value + fraction * (sorted_data[upper] - low_value)

    def record(self, latency_ms: float, success: bool, bytes_in: int = 0, bytes_out: int = 0) -> None:
        """Fold one request into the totals and the latency sample list."""
        self.count += 1
        if success:
            self.success_count += 1
        else:
            self.error_count += 1

        self.latency_sum_ms += latency_ms
        self.latency_min_ms = min(self.latency_min_ms, latency_ms)
        self.latency_max_ms = max(self.latency_max_ms, latency_ms)
        self.bytes_in += bytes_in
        self.bytes_out += bytes_out

        if len(self.latency_samples) < MAX_LATENCY_SAMPLES:
            self.latency_samples.append(latency_ms)
            return
        # Sample list is full: pick a random slot among all requests seen so
        # far and overwrite only if it lands inside the stored samples.
        slot = random.randint(0, self.count - 1)
        if slot < MAX_LATENCY_SAMPLES:
            self.latency_samples[slot] = latency_ms

    def to_dict(self) -> Dict[str, Any]:
        """Summarise the stats as a dict with latencies rounded to 2 decimals."""
        average = self.latency_sum_ms / self.count if self.count > 0 else 0.0
        # ``inf`` is the "nothing recorded yet" marker; report it as 0.0.
        minimum = 0.0 if self.latency_min_ms == float("inf") else self.latency_min_ms
        ordered = sorted(self.latency_samples)

        summary: Dict[str, Any] = {
            "count": self.count,
            "success_count": self.success_count,
            "error_count": self.error_count,
            "latency_avg_ms": round(average, 2),
            "latency_min_ms": round(minimum, 2),
            "latency_max_ms": round(self.latency_max_ms, 2),
        }
        for pct in (50, 95, 99):
            summary[f"latency_p{pct}_ms"] = round(self._compute_percentile(ordered, pct), 2)
        summary["bytes_in"] = self.bytes_in
        summary["bytes_out"] = self.bytes_out
        return summary

    def merge(self, other: "OperationStats") -> None:
        """Add ``other``'s totals to this instance and combine the samples.

        When the combined sample list exceeds ``MAX_LATENCY_SAMPLES`` it is
        shuffled and truncated to that size.
        """
        self.count += other.count
        self.success_count += other.success_count
        self.error_count += other.error_count
        self.latency_sum_ms += other.latency_sum_ms
        self.latency_min_ms = min(self.latency_min_ms, other.latency_min_ms)
        self.latency_max_ms = max(self.latency_max_ms, other.latency_max_ms)
        self.bytes_in += other.bytes_in
        self.bytes_out += other.bytes_out

        pooled = self.latency_samples + other.latency_samples
        if len(pooled) > MAX_LATENCY_SAMPLES:
            random.shuffle(pooled)
            pooled = pooled[:MAX_LATENCY_SAMPLES]
        self.latency_samples = pooled
@dataclass
class MetricsSnapshot:
    """Serializable record of the metrics collected during one window."""

    timestamp: datetime
    window_seconds: int
    by_method: Dict[str, Dict[str, Any]]
    by_endpoint: Dict[str, Dict[str, Any]]
    by_status_class: Dict[str, int]
    error_codes: Dict[str, int]
    totals: Dict[str, Any]

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly dict; the timestamp becomes an ISO-8601 string."""
        payload: Dict[str, Any] = {"timestamp": self.timestamp.isoformat()}
        for name in (
            "window_seconds",
            "by_method",
            "by_endpoint",
            "by_status_class",
            "error_codes",
            "totals",
        ):
            payload[name] = getattr(self, name)
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MetricsSnapshot":
        """Rebuild a snapshot from ``to_dict`` output.

        ``"timestamp"`` is required (``KeyError`` if absent); every other
        entry falls back to a default (300 seconds / empty dict).
        """
        parsed_timestamp = datetime.fromisoformat(data["timestamp"])
        lookup = data.get
        return cls(
            timestamp=parsed_timestamp,
            window_seconds=lookup("window_seconds", 300),
            by_method=lookup("by_method", {}),
            by_endpoint=lookup("by_endpoint", {}),
            by_status_class=lookup("by_status_class", {}),
            error_codes=lookup("error_codes", {}),
            totals=lookup("totals", {}),
        )
class OperationMetricsCollector:
    """Collects per-request metrics and periodically snapshots them to disk.

    Requests are aggregated per HTTP method, per endpoint type, per status
    class and per error code. A daemon thread closes the current window every
    ``interval_minutes`` minutes, appends a ``MetricsSnapshot`` to the
    in-memory history, drops snapshots older than ``retention_hours`` and
    rewrites the history file under ``storage_root``.
    """

    def __init__(
        self,
        storage_root: Path,
        interval_minutes: int = 5,
        retention_hours: int = 24,
    ):
        """Load any saved history and start the snapshot thread.

        Args:
            storage_root: Root directory under which the history file lives.
            interval_minutes: Length of one aggregation window.
            retention_hours: How long snapshots are kept.
        """
        self.storage_root = storage_root
        self.interval_seconds = interval_minutes * 60
        self.retention_hours = retention_hours
        self._lock = threading.Lock()
        self._by_method: Dict[str, OperationStats] = defaultdict(OperationStats)
        self._by_endpoint: Dict[str, OperationStats] = defaultdict(OperationStats)
        self._by_status_class: Dict[str, int] = {}
        self._error_codes: Dict[str, int] = {}
        self._totals = OperationStats()
        self._window_start = time.time()
        self._shutdown = threading.Event()
        self._snapshots: List[MetricsSnapshot] = []
        self._load_history()
        self._snapshot_thread = threading.Thread(
            target=self._snapshot_loop, name="operation-metrics-snapshot", daemon=True
        )
        self._snapshot_thread.start()

    def _config_path(self) -> Path:
        """Location of the persisted snapshot history."""
        return self.storage_root / ".myfsio.sys" / "config" / "operation_metrics.json"

    def _load_history(self) -> None:
        """Populate ``_snapshots`` from disk; a damaged file is logged and ignored."""
        config_path = self._config_path()
        if not config_path.exists():
            return
        try:
            data = json.loads(config_path.read_text(encoding="utf-8"))
            snapshots_data = data.get("snapshots", [])
            self._snapshots = [MetricsSnapshot.from_dict(s) for s in snapshots_data]
            self._prune_old_snapshots()
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            # ValueError covers both json.JSONDecodeError and a malformed
            # timestamp rejected by datetime.fromisoformat; TypeError and
            # AttributeError cover JSON of the wrong shape. History is
            # rebuildable, so none of these should abort construction.
            logger.warning(f"Failed to load operation metrics history: {e}")

    def _save_history(self) -> None:
        """Write all retained snapshots to the history file (errors are logged)."""
        config_path = self._config_path()
        try:
            # mkdir is inside the try so that a failure here is logged like a
            # failed write instead of propagating into the snapshot thread.
            config_path.parent.mkdir(parents=True, exist_ok=True)
            data = {"snapshots": [s.to_dict() for s in self._snapshots]}
            config_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        except OSError as e:
            logger.warning(f"Failed to save operation metrics history: {e}")

    def _prune_old_snapshots(self) -> None:
        """Drop snapshots whose timestamp is older than the retention period."""
        if not self._snapshots:
            return
        cutoff = datetime.now(timezone.utc).timestamp() - (self.retention_hours * 3600)
        self._snapshots = [
            s for s in self._snapshots if s.timestamp.timestamp() > cutoff
        ]

    def _snapshot_loop(self) -> None:
        """Thread body: take a snapshot every interval until shutdown is set."""
        while not self._shutdown.is_set():
            self._shutdown.wait(timeout=self.interval_seconds)
            if not self._shutdown.is_set():
                self._take_snapshot()

    def _take_snapshot(self) -> None:
        """Close the current window: record it, persist history, reset counters."""
        with self._lock:
            now = datetime.now(timezone.utc)
            window_seconds = int(time.time() - self._window_start)
            snapshot = MetricsSnapshot(
                timestamp=now,
                window_seconds=window_seconds,
                by_method={k: v.to_dict() for k, v in self._by_method.items()},
                by_endpoint={k: v.to_dict() for k, v in self._by_endpoint.items()},
                by_status_class=dict(self._by_status_class),
                error_codes=dict(self._error_codes),
                totals=self._totals.to_dict(),
            )
            self._snapshots.append(snapshot)
            self._prune_old_snapshots()
            self._save_history()
            self._by_method = defaultdict(OperationStats)
            self._by_endpoint = defaultdict(OperationStats)
            self._by_status_class.clear()
            self._error_codes.clear()
            self._totals = OperationStats()
            self._window_start = time.time()

    def record_request(
        self,
        method: str,
        endpoint_type: str,
        status_code: int,
        latency_ms: float,
        bytes_in: int = 0,
        bytes_out: int = 0,
        error_code: Optional[str] = None,
    ) -> None:
        """Add one finished request to the current window.

        A request counts as successful when ``200 <= status_code < 400``.
        """
        success = 200 <= status_code < 400
        status_class = f"{status_code // 100}xx"
        with self._lock:
            self._by_method[method].record(latency_ms, success, bytes_in, bytes_out)
            self._by_endpoint[endpoint_type].record(latency_ms, success, bytes_in, bytes_out)
            self._by_status_class[status_class] = self._by_status_class.get(status_class, 0) + 1
            if error_code:
                self._error_codes[error_code] = self._error_codes.get(error_code, 0) + 1
            self._totals.record(latency_ms, success, bytes_in, bytes_out)

    def get_current_stats(self) -> Dict[str, Any]:
        """Return the not-yet-snapshotted stats of the current window."""
        with self._lock:
            window_seconds = int(time.time() - self._window_start)
            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "window_seconds": window_seconds,
                "by_method": {k: v.to_dict() for k, v in self._by_method.items()},
                "by_endpoint": {k: v.to_dict() for k, v in self._by_endpoint.items()},
                "by_status_class": dict(self._by_status_class),
                "error_codes": dict(self._error_codes),
                "totals": self._totals.to_dict(),
            }

    def get_history(self, hours: Optional[int] = None) -> List[Dict[str, Any]]:
        """Return retained snapshots as dicts, optionally only the last ``hours``."""
        with self._lock:
            snapshots = list(self._snapshots)
        if hours:
            cutoff = datetime.now(timezone.utc).timestamp() - (hours * 3600)
            snapshots = [s for s in snapshots if s.timestamp.timestamp() > cutoff]
        return [s.to_dict() for s in snapshots]

    def shutdown(self) -> None:
        """Stop the snapshot thread after taking one final snapshot."""
        self._shutdown.set()
        self._take_snapshot()
        self._snapshot_thread.join(timeout=5.0)
def classify_endpoint(path: str) -> str:
    """Map a request path to a coarse endpoint category.

    Args:
        path: URL path of the request, e.g. ``"/bucket/key"``.

    Returns:
        ``"service"`` for the root path and anything under ``/myfsio``,
        ``"ui"`` / ``"kms"`` for paths whose first segment is ``ui`` / ``kms``,
        ``"bucket"`` for a single-segment path and ``"object"`` for anything
        deeper.
    """
    # Paths made only of slashes ("", "/", "//") all address the service root.
    trimmed = path.strip("/") if path else ""
    if not trimmed:
        return "service"

    first_segment, separator, _ = trimmed.partition("/")
    # Compare the whole first segment rather than a string prefix so that a
    # bucket named e.g. "ui-assets" or "myfsio-data" is not mistaken for a
    # reserved route.
    reserved = {"ui": "ui", "kms": "kms", "myfsio": "service"}
    if first_segment in reserved:
        return reserved[first_segment]
    return "object" if separator else "bucket"

View File

@@ -1,21 +1,130 @@
"""Background replication worker."""
from __future__ import annotations
import json
import logging
import mimetypes
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Optional
from typing import Any, Dict, List, Optional
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
from boto3.exceptions import S3UploadFailedError
from .connections import ConnectionStore, RemoteConnection
from .storage import ObjectStorage
from .storage import ObjectStorage, StorageError
logger = logging.getLogger(__name__)
REPLICATION_USER_AGENT = "S3ReplicationAgent/1.0"
REPLICATION_MODE_NEW_ONLY = "new_only"
REPLICATION_MODE_ALL = "all"
REPLICATION_MODE_BIDIRECTIONAL = "bidirectional"
def _create_s3_client(
    connection: RemoteConnection,
    *,
    health_check: bool = False,
    connect_timeout: int = 5,
    read_timeout: int = 30,
    max_retries: int = 2,
) -> Any:
    """Build a boto3 S3 client for a remote connection.

    Args:
        connection: Remote S3 connection configuration (endpoint, keys, region).
        health_check: When true, ``max_attempts`` is forced to 1 so the call
            returns quickly instead of retrying.
        connect_timeout: Connect timeout passed to botocore.
        read_timeout: Read timeout passed to botocore.
        max_retries: ``max_attempts`` used when ``health_check`` is false.

    Returns:
        The client returned by ``boto3.client("s3", ...)``.
    """
    attempts = 1 if health_check else max_retries
    client_config = Config(
        user_agent_extra=REPLICATION_USER_AGENT,
        connect_timeout=connect_timeout,
        read_timeout=read_timeout,
        retries={'max_attempts': attempts},
        signature_version='s3v4',
        s3={'addressing_style': 'path'},
        request_checksum_calculation='when_required',
        response_checksum_validation='when_required',
    )
    # Connections without a region fall back to us-east-1.
    region = connection.region or 'us-east-1'
    return boto3.client(
        "s3",
        endpoint_url=connection.endpoint_url,
        aws_access_key_id=connection.access_key,
        aws_secret_access_key=connection.secret_key,
        region_name=region,
        config=client_config,
    )
@dataclass
class ReplicationStats:
    """Counters describing the replication state of one rule."""

    objects_synced: int = 0
    objects_pending: int = 0
    objects_orphaned: int = 0
    bytes_synced: int = 0
    last_sync_at: Optional[float] = None
    last_sync_key: Optional[str] = None

    def to_dict(self) -> dict:
        """Return all fields as a plain dict, in declaration order."""
        field_names = (
            "objects_synced",
            "objects_pending",
            "objects_orphaned",
            "bytes_synced",
            "last_sync_at",
            "last_sync_key",
        )
        return {name: getattr(self, name) for name in field_names}

    @classmethod
    def from_dict(cls, data: dict) -> "ReplicationStats":
        """Build stats from a dict; missing counters are 0, missing sync info ``None``."""
        lookup = data.get
        return cls(
            objects_synced=lookup("objects_synced", 0),
            objects_pending=lookup("objects_pending", 0),
            objects_orphaned=lookup("objects_orphaned", 0),
            bytes_synced=lookup("bytes_synced", 0),
            last_sync_at=lookup("last_sync_at"),
            last_sync_key=lookup("last_sync_key"),
        )
@dataclass
class ReplicationFailure:
    """One failed replication attempt for an object."""

    object_key: str
    error_message: str
    timestamp: float
    failure_count: int
    bucket_name: str
    action: str
    last_error_code: Optional[str] = None

    def to_dict(self) -> dict:
        """Return all fields as a plain dict, in declaration order."""
        field_names = (
            "object_key",
            "error_message",
            "timestamp",
            "failure_count",
            "bucket_name",
            "action",
            "last_error_code",
        )
        return {name: getattr(self, name) for name in field_names}

    @classmethod
    def from_dict(cls, data: dict) -> "ReplicationFailure":
        """Build a failure from a dict.

        Every key except ``"last_error_code"`` is required; a missing one
        raises ``KeyError``.
        """
        required = {
            name: data[name]
            for name in (
                "object_key",
                "error_message",
                "timestamp",
                "failure_count",
                "bucket_name",
                "action",
            )
        }
        return cls(last_error_code=data.get("last_error_code"), **required)
@dataclass
class ReplicationRule:
@@ -23,50 +132,325 @@ class ReplicationRule:
target_connection_id: str
target_bucket: str
enabled: bool = True
mode: str = REPLICATION_MODE_NEW_ONLY
created_at: Optional[float] = None
stats: ReplicationStats = field(default_factory=ReplicationStats)
sync_deletions: bool = True
last_pull_at: Optional[float] = None
filter_prefix: Optional[str] = None
def to_dict(self) -> dict:
    """Return the rule as a plain dict; ``stats`` is serialised via its own ``to_dict``."""
    payload = {
        name: getattr(self, name)
        for name in (
            "bucket_name",
            "target_connection_id",
            "target_bucket",
            "enabled",
            "mode",
            "created_at",
        )
    }
    payload["stats"] = self.stats.to_dict()
    payload["sync_deletions"] = self.sync_deletions
    payload["last_pull_at"] = self.last_pull_at
    payload["filter_prefix"] = self.filter_prefix
    return payload
@classmethod
def from_dict(cls, data: dict) -> "ReplicationRule":
    """Build a rule from a dict, filling in defaults for optional keys.

    Missing ``mode`` defaults to ``REPLICATION_MODE_NEW_ONLY``,
    ``sync_deletions`` to ``True`` and ``created_at`` / ``last_pull_at`` /
    ``filter_prefix`` to ``None``. ``stats`` is rebuilt through
    ``ReplicationStats.from_dict`` (or a fresh ``ReplicationStats`` when
    absent or empty).

    The input dict is left untouched: the method works on a shallow copy so
    callers can keep using (or re-serialising) what they passed in.
    """
    payload = dict(data)
    stats_data = payload.pop("stats", {})
    payload.setdefault("mode", REPLICATION_MODE_NEW_ONLY)
    payload.setdefault("created_at", None)
    payload.setdefault("sync_deletions", True)
    payload.setdefault("last_pull_at", None)
    payload.setdefault("filter_prefix", None)
    rule = cls(**payload)
    rule.stats = ReplicationStats.from_dict(stats_data) if stats_data else ReplicationStats()
    return rule
class ReplicationFailureStore:
    """Per-bucket list of replication failures, cached in memory and saved as JSON.

    New failures are inserted at the front of the list and the list is cut to
    ``max_failures_per_bucket`` entries whenever it is saved.
    """

    def __init__(self, storage_root: Path, max_failures_per_bucket: int = 50) -> None:
        """Create a store rooted at ``storage_root``."""
        self.storage_root = storage_root
        self.max_failures_per_bucket = max_failures_per_bucket
        self._lock = threading.Lock()
        self._cache: Dict[str, List[ReplicationFailure]] = {}

    def _get_failures_path(self, bucket_name: str) -> Path:
        """Path of the JSON file holding the failures of ``bucket_name``."""
        return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "replication_failures.json"

    def _load_from_disk(self, bucket_name: str) -> List[ReplicationFailure]:
        """Read the bucket's failure file; missing or unusable files yield ``[]``."""
        path = self._get_failures_path(bucket_name)
        if not path.exists():
            return []
        try:
            with open(path, "r") as f:
                data = json.load(f)
            return [ReplicationFailure.from_dict(d) for d in data.get("failures", [])]
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            # TypeError/AttributeError: valid JSON of the wrong shape (e.g. a
            # top-level list) is handled like any other corrupt file.
            logger.error(f"Failed to load replication failures for {bucket_name}: {e}")
            return []

    def _save_to_disk(self, bucket_name: str, failures: List[ReplicationFailure]) -> None:
        """Write at most ``max_failures_per_bucket`` failures; I/O errors are logged."""
        path = self._get_failures_path(bucket_name)
        data = {"failures": [f.to_dict() for f in failures[:self.max_failures_per_bucket]]}
        try:
            # Directory creation can fail just like the write; keep it inside
            # the try so recording a failure never raises from here.
            path.parent.mkdir(parents=True, exist_ok=True)
            with open(path, "w") as f:
                json.dump(data, f, indent=2)
        except OSError as e:
            logger.error(f"Failed to save replication failures for {bucket_name}: {e}")

    def load_failures(self, bucket_name: str) -> List[ReplicationFailure]:
        """Return a new list of the bucket's failures, loading from disk on first use."""
        if bucket_name in self._cache:
            return list(self._cache[bucket_name])
        failures = self._load_from_disk(bucket_name)
        self._cache[bucket_name] = failures
        return list(failures)

    def save_failures(self, bucket_name: str, failures: List[ReplicationFailure]) -> None:
        """Replace the bucket's failures in the cache and on disk (trimmed to the cap)."""
        trimmed = failures[:self.max_failures_per_bucket]
        self._cache[bucket_name] = trimmed
        self._save_to_disk(bucket_name, trimmed)

    def add_failure(self, bucket_name: str, failure: ReplicationFailure) -> None:
        """Record a failure.

        If the object key already has an entry, its count is incremented and
        its timestamp, message and error code are refreshed; otherwise the
        failure is inserted at the front of the list.
        """
        with self._lock:
            failures = self.load_failures(bucket_name)
            existing = next((f for f in failures if f.object_key == failure.object_key), None)
            if existing:
                existing.failure_count += 1
                existing.timestamp = failure.timestamp
                existing.error_message = failure.error_message
                existing.last_error_code = failure.last_error_code
            else:
                failures.insert(0, failure)
            self.save_failures(bucket_name, failures)

    def remove_failure(self, bucket_name: str, object_key: str) -> bool:
        """Drop the entry for ``object_key``; return whether one was removed."""
        with self._lock:
            failures = self.load_failures(bucket_name)
            original_len = len(failures)
            failures = [f for f in failures if f.object_key != object_key]
            if len(failures) < original_len:
                self.save_failures(bucket_name, failures)
                return True
            return False

    def clear_failures(self, bucket_name: str) -> None:
        """Forget all failures of a bucket, in the cache and on disk."""
        with self._lock:
            self._cache.pop(bucket_name, None)
            # missing_ok avoids the exists()/unlink() race when the file
            # disappears between the two calls.
            self._get_failures_path(bucket_name).unlink(missing_ok=True)

    def get_failure(self, bucket_name: str, object_key: str) -> Optional[ReplicationFailure]:
        """Return the failure recorded for ``object_key``, or ``None``."""
        failures = self.load_failures(bucket_name)
        return next((f for f in failures if f.object_key == object_key), None)

    def get_failure_count(self, bucket_name: str) -> int:
        """Number of failures currently recorded for the bucket."""
        return len(self.load_failures(bucket_name))
class ReplicationManager:
def __init__(self, storage: ObjectStorage, connections: ConnectionStore, rules_path: Path) -> None:
def __init__(
self,
storage: ObjectStorage,
connections: ConnectionStore,
rules_path: Path,
storage_root: Path,
connect_timeout: int = 5,
read_timeout: int = 30,
max_retries: int = 2,
streaming_threshold_bytes: int = 10 * 1024 * 1024,
max_failures_per_bucket: int = 50,
) -> None:
self.storage = storage
self.connections = connections
self.rules_path = rules_path
self.storage_root = storage_root
self.connect_timeout = connect_timeout
self.read_timeout = read_timeout
self.max_retries = max_retries
self.streaming_threshold_bytes = streaming_threshold_bytes
self._rules: Dict[str, ReplicationRule] = {}
self._stats_lock = threading.Lock()
self._executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="ReplicationWorker")
self._shutdown = False
self.failure_store = ReplicationFailureStore(storage_root, max_failures_per_bucket)
self.reload_rules()
def _create_client(self, connection: RemoteConnection, *, health_check: bool = False) -> Any:
    """Build an S3 client for ``connection`` using this manager's timeouts and retry count."""
    client_options = {
        "health_check": health_check,
        "connect_timeout": self.connect_timeout,
        "read_timeout": self.read_timeout,
        "max_retries": self.max_retries,
    }
    return _create_s3_client(connection, **client_options)
def shutdown(self, wait: bool = True) -> None:
    """Stop the replication executor.

    Sets the shutdown flag first, then shuts the executor down.

    Args:
        wait: Passed to the executor's ``shutdown``; if true, block until
            pending tasks have completed.
    """
    self._shutdown = True
    executor = self._executor
    executor.shutdown(wait=wait)
    logger.info("Replication manager shut down")
def reload_rules(self) -> None:
if not self.rules_path.exists():
self._rules = {}
return
try:
import json
with open(self.rules_path, "r") as f:
data = json.load(f)
for bucket, rule_data in data.items():
self._rules[bucket] = ReplicationRule(**rule_data)
self._rules[bucket] = ReplicationRule.from_dict(rule_data)
except (OSError, ValueError) as e:
logger.error(f"Failed to load replication rules: {e}")
def save_rules(self) -> None:
import json
data = {b: rule.__dict__ for b, rule in self._rules.items()}
data = {b: rule.to_dict() for b, rule in self._rules.items()}
self.rules_path.parent.mkdir(parents=True, exist_ok=True)
with open(self.rules_path, "w") as f:
json.dump(data, f, indent=2)
def check_endpoint_health(self, connection: RemoteConnection) -> bool:
    """Probe a remote endpoint by listing its buckets.

    The client is created with ``health_check=True``.

    Returns:
        ``True`` if ``list_buckets`` succeeds; ``False`` (after logging a
        warning) if creating the client or the call raises anything.
    """
    try:
        probe = self._create_client(connection, health_check=True)
        probe.list_buckets()
    except Exception as e:
        logger.warning(f"Endpoint health check failed for {connection.name} ({connection.endpoint_url}): {e}")
        return False
    return True
def get_rule(self, bucket_name: str) -> Optional[ReplicationRule]:
    """Return the replication rule for ``bucket_name``, or ``None`` if there is none."""
    rules = self._rules
    return rules.get(bucket_name)
def list_rules(self) -> List[ReplicationRule]:
    """Return all configured replication rules as a new list."""
    return [rule for rule in self._rules.values()]
def set_rule(self, rule: ReplicationRule) -> None:
    """Store (or replace) the rule for its bucket and persist all rules.

    If the new rule is enabled in ALL mode and the previous rule for that
    bucket was not in ALL mode (or did not exist), a background sync of the
    bucket's existing objects is submitted to the executor.
    """
    previous = self._rules.get(rule.bucket_name)
    was_all_mode = bool(previous) and previous.mode == REPLICATION_MODE_ALL

    self._rules[rule.bucket_name] = rule
    self.save_rules()

    needs_initial_sync = rule.mode == REPLICATION_MODE_ALL and rule.enabled and not was_all_mode
    if needs_initial_sync:
        logger.info(f"Replication mode ALL enabled for {rule.bucket_name}, triggering sync of existing objects")
        self._executor.submit(self.replicate_existing_objects, rule.bucket_name)
def delete_rule(self, bucket_name: str) -> None:
    """Remove the rule for ``bucket_name`` and persist; a no-op when none exists."""
    if bucket_name not in self._rules:
        return
    del self._rules[bucket_name]
    self.save_rules()
def trigger_replication(self, bucket_name: str, object_key: str) -> None:
def _update_last_sync(self, bucket_name: str, object_key: str = "") -> None:
"""Update last sync timestamp after a successful operation."""
with self._stats_lock:
rule = self._rules.get(bucket_name)
if not rule:
return
rule.stats.last_sync_at = time.time()
rule.stats.last_sync_key = object_key
self.save_rules()
def get_sync_status(self, bucket_name: str) -> Optional[ReplicationStats]:
    """Compute replication status by comparing source and destination listings.

    Returns:
        ``None`` when the bucket has no rule. Otherwise the rule's ``stats``
        object, refreshed with synced / pending / orphaned counts and
        ``bytes_synced`` when both listings succeed, or returned unchanged
        when the connection is unknown or the comparison fails.
    """
    rule = self.get_rule(bucket_name)
    if not rule:
        return None
    connection = self.connections.get(rule.target_connection_id)
    if not connection:
        return rule.stats
    # Connection-level failures (unreachable endpoint, timeouts) are raised
    # as BotoCoreError subclasses, which are not ClientError; without
    # catching them an offline target would make this method raise.
    from botocore.exceptions import BotoCoreError
    try:
        source_objects = self.storage.list_objects_all(bucket_name)
        source_keys = {obj.key: obj.size for obj in source_objects}
        s3 = self._create_client(connection)
        dest_keys = set()
        bytes_synced = 0
        paginator = s3.get_paginator('list_objects_v2')
        try:
            for page in paginator.paginate(Bucket=rule.target_bucket):
                for obj in page.get('Contents', []):
                    dest_keys.add(obj['Key'])
                    if obj['Key'] in source_keys:
                        bytes_synced += obj.get('Size', 0)
        except ClientError as e:
            if e.response['Error']['Code'] == 'NoSuchBucket':
                # A missing target bucket means nothing is replicated yet;
                # reset both accumulators so they stay consistent.
                dest_keys = set()
                bytes_synced = 0
            else:
                raise
        synced = source_keys.keys() & dest_keys
        orphaned = dest_keys - source_keys.keys()
        # Only ALL mode promises to copy pre-existing objects, so only then
        # do source-only keys count as pending.
        if rule.mode == REPLICATION_MODE_ALL:
            pending = source_keys.keys() - dest_keys
        else:
            pending = set()
        rule.stats.objects_synced = len(synced)
        rule.stats.objects_pending = len(pending)
        rule.stats.objects_orphaned = len(orphaned)
        rule.stats.bytes_synced = bytes_synced
        return rule.stats
    except (ClientError, BotoCoreError, StorageError) as e:
        logger.error(f"Failed to compute sync status for {bucket_name}: {e}")
        return rule.stats
def replicate_existing_objects(self, bucket_name: str) -> None:
    """Queue a replication task for every object currently in ``bucket_name``.

    Returns without doing anything when the bucket has no enabled rule, the
    rule's connection is unknown, or the endpoint fails its health check.
    A ``StorageError`` while listing or submitting is logged.
    """
    rule = self.get_rule(bucket_name)
    if not (rule and rule.enabled):
        return

    connection = self.connections.get(rule.target_connection_id)
    if not connection:
        logger.warning(f"Cannot replicate existing objects: Connection {rule.target_connection_id} not found")
        return

    endpoint_ok = self.check_endpoint_health(connection)
    if not endpoint_ok:
        logger.warning(f"Cannot replicate existing objects: Endpoint {connection.name} ({connection.endpoint_url}) is not reachable")
        return

    try:
        objects = self.storage.list_objects_all(bucket_name)
        logger.info(f"Starting replication of {len(objects)} existing objects from {bucket_name}")
        for entry in objects:
            self._executor.submit(self._replicate_task, bucket_name, entry.key, rule, connection, "write")
    except StorageError as e:
        logger.error(f"Failed to list objects for replication: {e}")
def create_remote_bucket(self, connection_id: str, bucket_name: str) -> None:
    """Create ``bucket_name`` on the remote identified by ``connection_id``.

    Raises:
        ValueError: If the connection id is unknown.
        ClientError: Re-raised (after logging) when the remote rejects the call.
    """
    connection = self.connections.get(connection_id)
    if not connection:
        raise ValueError(f"Connection {connection_id} not found")
    try:
        client = self._create_client(connection)
        client.create_bucket(Bucket=bucket_name)
    except ClientError as e:
        logger.error(f"Failed to create remote bucket {bucket_name}: {e}")
        raise
def trigger_replication(self, bucket_name: str, object_key: str, action: str = "write") -> None:
rule = self.get_rule(bucket_name)
if not rule or not rule.enabled:
return
@@ -76,46 +460,208 @@ class ReplicationManager:
logger.warning(f"Replication skipped for {bucket_name}/{object_key}: Connection {rule.target_connection_id} not found")
return
self._executor.submit(self._replicate_task, bucket_name, object_key, rule, connection)
def _replicate_task(self, bucket_name: str, object_key: str, rule: ReplicationRule, conn: RemoteConnection) -> None:
try:
# 1. Get local file path
# Note: We are accessing internal storage structure here.
# Ideally storage.py should expose a 'get_file_path' or we read the stream.
# For efficiency, we'll try to read the file directly if we can, or use storage.get_object
# Using boto3 to upload
s3 = boto3.client(
"s3",
endpoint_url=conn.endpoint_url,
aws_access_key_id=conn.access_key,
aws_secret_access_key=conn.secret_key,
region_name=conn.region,
)
# We need the file content.
# Since ObjectStorage is filesystem based, let's get the stream.
# We need to be careful about closing it.
meta = self.storage.get_object_meta(bucket_name, object_key)
if not meta:
if not self.check_endpoint_health(connection):
logger.warning(f"Replication skipped for {bucket_name}/{object_key}: Endpoint {connection.name} ({connection.endpoint_url}) is not reachable")
return
with self.storage.open_object(bucket_name, object_key) as f:
extra_args = {}
if meta.metadata:
extra_args["Metadata"] = meta.metadata
self._executor.submit(self._replicate_task, bucket_name, object_key, rule, connection, action)
s3.upload_fileobj(
f,
def _replicate_task(self, bucket_name: str, object_key: str, rule: ReplicationRule, conn: RemoteConnection, action: str) -> None:
if self._shutdown:
return
current_rule = self.get_rule(bucket_name)
if not current_rule or not current_rule.enabled:
logger.debug(f"Replication skipped for {bucket_name}/{object_key}: rule disabled or removed")
return
if ".." in object_key or object_key.startswith("/") or object_key.startswith("\\"):
logger.error(f"Invalid object key in replication (path traversal attempt): {object_key}")
return
try:
from .storage import ObjectStorage
ObjectStorage._sanitize_object_key(object_key)
except StorageError as e:
logger.error(f"Object key validation failed in replication: {e}")
return
try:
s3 = self._create_client(conn)
if action == "delete":
try:
s3.delete_object(Bucket=rule.target_bucket, Key=object_key)
logger.info(f"Replicated DELETE {bucket_name}/{object_key} to {conn.name} ({rule.target_bucket})")
self._update_last_sync(bucket_name, object_key)
self.failure_store.remove_failure(bucket_name, object_key)
except ClientError as e:
error_code = e.response.get('Error', {}).get('Code')
logger.error(f"Replication DELETE failed for {bucket_name}/{object_key}: {e}")
self.failure_store.add_failure(bucket_name, ReplicationFailure(
object_key=object_key,
error_message=str(e),
timestamp=time.time(),
failure_count=1,
bucket_name=bucket_name,
action="delete",
last_error_code=error_code,
))
return
try:
path = self.storage.get_object_path(bucket_name, object_key)
except StorageError:
logger.error(f"Source object not found: {bucket_name}/{object_key}")
return
content_type, _ = mimetypes.guess_type(path)
file_size = path.stat().st_size
logger.info(f"Replicating {bucket_name}/{object_key}: Size={file_size}, ContentType={content_type}")
def do_upload() -> None:
"""Upload object using appropriate method based on file size.
For small files (< 10 MiB): Read into memory for simpler handling
For large files: Use streaming upload to avoid memory issues
"""
extra_args = {}
if content_type:
extra_args["ContentType"] = content_type
if file_size >= self.streaming_threshold_bytes:
s3.upload_file(
str(path),
rule.target_bucket,
object_key,
ExtraArgs=extra_args
ExtraArgs=extra_args if extra_args else None,
)
else:
file_content = path.read_bytes()
put_kwargs = {
"Bucket": rule.target_bucket,
"Key": object_key,
"Body": file_content,
**extra_args,
}
s3.put_object(**put_kwargs)
try:
do_upload()
except (ClientError, S3UploadFailedError) as e:
error_code = None
if isinstance(e, ClientError):
error_code = e.response['Error']['Code']
elif isinstance(e, S3UploadFailedError):
if "NoSuchBucket" in str(e):
error_code = 'NoSuchBucket'
if error_code == 'NoSuchBucket':
logger.info(f"Target bucket {rule.target_bucket} not found. Attempting to create it.")
bucket_ready = False
try:
s3.create_bucket(Bucket=rule.target_bucket)
bucket_ready = True
logger.info(f"Created target bucket {rule.target_bucket}")
except ClientError as bucket_err:
if bucket_err.response['Error']['Code'] in ('BucketAlreadyExists', 'BucketAlreadyOwnedByYou'):
logger.debug(f"Bucket {rule.target_bucket} already exists (created by another thread)")
bucket_ready = True
else:
logger.error(f"Failed to create target bucket {rule.target_bucket}: {bucket_err}")
raise e
if bucket_ready:
do_upload()
else:
raise e
logger.info(f"Replicated {bucket_name}/{object_key} to {conn.name} ({rule.target_bucket})")
self._update_last_sync(bucket_name, object_key)
self.failure_store.remove_failure(bucket_name, object_key)
except (ClientError, OSError, ValueError) as e:
error_code = None
if isinstance(e, ClientError):
error_code = e.response.get('Error', {}).get('Code')
logger.error(f"Replication failed for {bucket_name}/{object_key}: {e}")
except Exception:
self.failure_store.add_failure(bucket_name, ReplicationFailure(
object_key=object_key,
error_message=str(e),
timestamp=time.time(),
failure_count=1,
bucket_name=bucket_name,
action=action,
last_error_code=error_code,
))
except Exception as e:
logger.exception(f"Unexpected error during replication for {bucket_name}/{object_key}")
self.failure_store.add_failure(bucket_name, ReplicationFailure(
object_key=object_key,
error_message=str(e),
timestamp=time.time(),
failure_count=1,
bucket_name=bucket_name,
action=action,
last_error_code=None,
))
def get_failed_items(self, bucket_name: str, limit: int = 50, offset: int = 0) -> List[ReplicationFailure]:
    """Return one page of the bucket's recorded failures.

    Args:
        bucket_name: Bucket whose failures are listed.
        limit: Maximum number of entries to return.
        offset: Index of the first entry to return.
    """
    page = slice(offset, offset + limit)
    return self.failure_store.load_failures(bucket_name)[page]
def get_failure_count(self, bucket_name: str) -> int:
    """Return how many failures the failure store holds for ``bucket_name``."""
    store = self.failure_store
    return store.get_failure_count(bucket_name)
def retry_failed_item(self, bucket_name: str, object_key: str) -> bool:
    """Re-submit the recorded failure for ``object_key`` to the executor.

    Returns:
        ``True`` if a task was submitted; ``False`` when there is no recorded
        failure, no enabled rule, no matching connection, or the endpoint
        fails its health check.
    """
    failure = self.failure_store.get_failure(bucket_name, object_key)
    if not failure:
        return False

    rule = self.get_rule(bucket_name)
    if not (rule and rule.enabled):
        return False

    connection = self.connections.get(rule.target_connection_id)
    if not connection:
        logger.warning(f"Cannot retry: Connection {rule.target_connection_id} not found")
        return False

    if not self.check_endpoint_health(connection):
        logger.warning(f"Cannot retry: Endpoint {connection.name} is not reachable")
        return False

    # The retry repeats the action ("write"/"delete") stored with the failure.
    task_args = (bucket_name, object_key, rule, connection, failure.action)
    self._executor.submit(self._replicate_task, *task_args)
    return True
def retry_all_failed(self, bucket_name: str) -> Dict[str, int]:
    """Re-submit every recorded failure of a bucket.

    Returns:
        ``{"submitted": n, "skipped": m}``. Either all failures are submitted,
        or (no enabled rule, unknown connection, unhealthy endpoint) all are
        reported as skipped. With no failures both counts are 0.
    """
    failures = self.failure_store.load_failures(bucket_name)
    total = len(failures)
    if total == 0:
        return {"submitted": 0, "skipped": 0}

    rule = self.get_rule(bucket_name)
    if not (rule and rule.enabled):
        return {"submitted": 0, "skipped": total}

    connection = self.connections.get(rule.target_connection_id)
    if not connection:
        logger.warning(f"Cannot retry: Connection {rule.target_connection_id} not found")
        return {"submitted": 0, "skipped": total}

    if not self.check_endpoint_health(connection):
        logger.warning(f"Cannot retry: Endpoint {connection.name} is not reachable")
        return {"submitted": 0, "skipped": total}

    for entry in failures:
        self._executor.submit(self._replicate_task, bucket_name, entry.object_key, rule, connection, entry.action)
    return {"submitted": total, "skipped": 0}
def dismiss_failure(self, bucket_name: str, object_key: str) -> bool:
    """Drop the failure record for one object; returns the store's result."""
    removed = self.failure_store.remove_failure(bucket_name, object_key)
    return removed
def clear_failures(self, bucket_name: str) -> None:
    """Ask the failure store to discard every failure recorded for the bucket."""
    store = self.failure_store
    store.clear_failures(bucket_name)

File diff suppressed because it is too large Load Diff

296
app/s3_client.py Normal file
View File

@@ -0,0 +1,296 @@
from __future__ import annotations
import json
import logging
import threading
import time
from typing import Any, Generator, Optional
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError, EndpointConnectionError, ConnectionClosedError
from flask import current_app, session
logger = logging.getLogger(__name__)
# Extra User-Agent token passed as ``user_agent_extra`` in the botocore Config
# that S3ProxyClient.get_client builds for UI-originated requests.
UI_PROXY_USER_AGENT = "MyFSIO-UIProxy/1.0"
# S3 error code -> HTTP status. handle_client_error looks a code up here first
# and only falls back to the response's own HTTPStatusCode for unlisted codes.
_BOTO_ERROR_MAP = {
"NoSuchBucket": 404,
"NoSuchKey": 404,
"NoSuchUpload": 404,
"BucketAlreadyExists": 409,
"BucketAlreadyOwnedByYou": 409,
"BucketNotEmpty": 409,
"AccessDenied": 403,
"InvalidAccessKeyId": 403,
"SignatureDoesNotMatch": 403,
"InvalidBucketName": 400,
"InvalidArgument": 400,
"MalformedXML": 400,
"EntityTooLarge": 400,
"QuotaExceeded": 403,
}
# Entry lifetime and minimum gap between sweeps, both in seconds; they are
# compared against differences of time.monotonic() readings.
_UPLOAD_REGISTRY_MAX_AGE = 86400
_UPLOAD_REGISTRY_CLEANUP_INTERVAL = 3600


class UploadRegistry:
    """Lock-guarded, in-memory map of upload id -> (bucket, object key, created-at).

    Entries older than ``_UPLOAD_REGISTRY_MAX_AGE`` are treated as expired:
    they are dropped when looked up and swept during ``register`` at most
    once per ``_UPLOAD_REGISTRY_CLEANUP_INTERVAL``.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._entries: dict[str, tuple[str, str, float]] = {}
        self._last_cleanup = time.monotonic()

    def register(self, upload_id: str, bucket_name: str, object_key: str) -> None:
        """Record (or overwrite) an upload and opportunistically sweep stale ones."""
        with self._lock:
            record = (bucket_name, object_key, time.monotonic())
            self._entries[upload_id] = record
            self._maybe_cleanup()

    def get_key(self, upload_id: str, bucket_name: str) -> Optional[str]:
        """Return the object key for ``upload_id``, or ``None``.

        ``None`` is returned for an unknown id, for an id registered under a
        different bucket, and for an expired entry (which is also removed).
        """
        with self._lock:
            record = self._entries.get(upload_id)
            if record is None:
                return None
            owner_bucket, object_key, registered_at = record
            if owner_bucket != bucket_name:
                return None
            age = time.monotonic() - registered_at
            if age > _UPLOAD_REGISTRY_MAX_AGE:
                del self._entries[upload_id]
                return None
            return object_key

    def remove(self, upload_id: str) -> None:
        """Forget ``upload_id``; unknown ids are ignored."""
        with self._lock:
            self._entries.pop(upload_id, None)

    def _maybe_cleanup(self) -> None:
        """Drop expired entries if the sweep interval has elapsed.

        Takes no lock itself; ``register`` calls it while holding ``_lock``.
        """
        current = time.monotonic()
        if current - self._last_cleanup < _UPLOAD_REGISTRY_CLEANUP_INTERVAL:
            return
        self._last_cleanup = current
        oldest_allowed = current - _UPLOAD_REGISTRY_MAX_AGE
        expired_ids = [
            uid for uid, record in self._entries.items() if record[2] < oldest_allowed
        ]
        for uid in expired_ids:
            del self._entries[uid]
class S3ProxyClient:
    """Factory for boto3 S3 clients that target the project's own S3 API.

    Also owns the ``UploadRegistry`` shared by callers of this proxy.
    """

    def __init__(self, api_base_url: str, region: str = "us-east-1") -> None:
        """Store the endpoint (trailing slashes removed) and region.

        Raises:
            ValueError: if ``api_base_url`` is empty.
        """
        if not api_base_url:
            raise ValueError("api_base_url is required for S3ProxyClient")
        self._region = region
        self._api_base_url = api_base_url.rstrip("/")
        self.upload_registry = UploadRegistry()

    @property
    def api_base_url(self) -> str:
        """Endpoint URL the generated clients talk to."""
        return self._api_base_url

    def get_client(self, access_key: str, secret_key: str) -> Any:
        """Build a new boto3 S3 client signed with the given credentials.

        The client uses SigV4, path-style addressing, no retries, and 5 s
        connect / 30 s read timeouts.

        Raises:
            ValueError: if either credential is empty.
        """
        if not (access_key and secret_key):
            raise ValueError("Both access_key and secret_key are required")
        client_options = {
            "user_agent_extra": UI_PROXY_USER_AGENT,
            "connect_timeout": 5,
            "read_timeout": 30,
            "retries": {"max_attempts": 0},
            "signature_version": "s3v4",
            "s3": {"addressing_style": "path"},
            "request_checksum_calculation": "when_required",
            "response_checksum_validation": "when_required",
        }
        return boto3.client(
            "s3",
            endpoint_url=self._api_base_url,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=self._region,
            config=Config(**client_options),
        )
def _get_proxy() -> S3ProxyClient:
    """Return the S3 proxy stored under ``current_app.extensions["s3_proxy"]``.

    Raises:
        RuntimeError: if no proxy has been registered on the current app.
    """
    configured = current_app.extensions.get("s3_proxy")
    if configured is not None:
        return configured
    raise RuntimeError(
        "S3 proxy not configured. Set API_BASE_URL or run both API and UI servers."
    )
def _get_session_creds() -> tuple[str, str]:
    """Resolve the current session's ``(access_key, secret_key)`` pair.

    The session holds only a token (``cred_token``); the credentials are
    read from the app's ``secret_store`` extension with ``peek``. Expired
    store entries are purged first.

    Raises:
        PermissionError: no token in the session, no credentials stored for
            the token, or a stored payload missing either key.
    """
    store = current_app.extensions["secret_store"]
    store.purge_expired()

    token = session.get("cred_token")
    if not token:
        raise PermissionError("Not authenticated")

    creds = store.peek(token)
    if not creds:
        raise PermissionError("Session expired")

    pair = (creds.get("access_key", ""), creds.get("secret_key", ""))
    if not all(pair):
        raise PermissionError("Invalid session credentials")
    return pair
def get_session_s3_client() -> Any:
    """Build an S3 client for the logged-in session.

    The proxy is resolved before the credentials, so a missing proxy
    (RuntimeError) is reported ahead of an authentication problem
    (PermissionError).
    """
    proxy = _get_proxy()
    return proxy.get_client(*_get_session_creds())
def get_upload_registry() -> UploadRegistry:
    """Return the upload registry owned by the app's configured S3 proxy."""
    proxy = _get_proxy()
    return proxy.upload_registry
def handle_client_error(exc: ClientError) -> tuple[dict[str, str], int]:
    """Convert a botocore ``ClientError`` into ``({"error": message}, status)``.

    The status comes from ``_BOTO_ERROR_MAP`` when the error code is listed
    there; otherwise from the response's ``HTTPStatusCode``, defaulting to 500.
    A missing or empty message becomes "S3 operation failed".
    """
    details = exc.response.get("Error", {})
    status = _BOTO_ERROR_MAP.get(details.get("Code", "InternalError"))
    if status is None:
        response_meta = exc.response.get("ResponseMetadata", {})
        status = response_meta.get("HTTPStatusCode", 500)
    body = {"error": details.get("Message") or "S3 operation failed"}
    return body, status
def handle_connection_error(exc: Exception) -> tuple[dict[str, str], int]:
    """Log a connection failure and return a fixed 502 error payload."""
    logger.error("S3 API connection failed: %s", exc)
    payload = {"error": "S3 API server is unreachable. Ensure the API server is running."}
    return payload, 502
def format_datetime_display(dt: Any, display_tz: str = "UTC") -> str:
    """Delegate to ``ui._format_datetime_display``.

    The helper is imported inside the function rather than at module level.
    """
    from .ui import _format_datetime_display as _render
    return _render(dt, display_tz)
def format_datetime_iso(dt: Any, display_tz: str = "UTC") -> str:
    """Delegate to ``ui._format_datetime_iso``.

    The helper is imported inside the function rather than at module level.
    """
    from .ui import _format_datetime_iso as _render
    return _render(dt, display_tz)
def build_url_templates(bucket_name: str) -> dict[str, str]:
    """Build per-object UI URL templates for a bucket.

    Every URL is generated with the literal object key ``KEY_PLACEHOLDER``
    (and, for "restore", version id ``VERSION_ID_PLACEHOLDER``).
    "download" is the preview URL with ``?download=1`` appended.
    """
    from flask import url_for

    def _object_url(endpoint: str, **extra: str) -> str:
        return url_for(
            endpoint, bucket_name=bucket_name, object_key="KEY_PLACEHOLDER", **extra
        )

    preview_url = _object_url("ui.object_preview")
    return {
        "preview": preview_url,
        "download": preview_url + "?download=1",
        "presign": _object_url("ui.object_presign"),
        "delete": _object_url("ui.delete_object"),
        "versions": _object_url("ui.object_versions"),
        "restore": _object_url(
            "ui.restore_object_version", version_id="VERSION_ID_PLACEHOLDER"
        ),
        "tags": _object_url("ui.object_tags"),
        "copy": _object_url("ui.copy_object"),
        "move": _object_url("ui.move_object"),
        "metadata": _object_url("ui.object_metadata"),
    }
def translate_list_objects(
    boto3_response: dict[str, Any],
    url_templates: dict[str, str],
    display_tz: str = "UTC",
    versioning_enabled: bool = False,
) -> dict[str, Any]:
    """Reshape a ``list_objects_v2`` response into the UI's listing payload.

    Each entry of ``Contents`` becomes a dict with the key, size, three
    renderings of ``LastModified`` and the ETag with surrounding double
    quotes stripped. ``total_count`` is the response's ``KeyCount`` when
    present, otherwise the number of objects translated.
    """

    def _as_row(obj: dict[str, Any]) -> dict[str, Any]:
        modified = obj["LastModified"]
        return {
            "key": obj["Key"],
            "size": obj["Size"],
            "last_modified": modified.isoformat(),
            "last_modified_display": format_datetime_display(modified, display_tz),
            "last_modified_iso": format_datetime_iso(modified, display_tz),
            "etag": obj.get("ETag", "").strip('"'),
        }

    rows = [_as_row(entry) for entry in boto3_response.get("Contents", [])]
    return {
        "objects": rows,
        "is_truncated": boto3_response.get("IsTruncated", False),
        "next_continuation_token": boto3_response.get("NextContinuationToken"),
        "total_count": boto3_response.get("KeyCount", len(rows)),
        "versioning_enabled": versioning_enabled,
        "url_templates": url_templates,
    }
def get_versioning_via_s3(client: Any, bucket_name: str) -> bool:
    """Return ``True`` only when the bucket's versioning status is "Enabled".

    A ``ClientError`` yields ``False``; it is logged as a warning unless the
    error code is ``NoSuchBucket``.
    """
    try:
        status = client.get_bucket_versioning(Bucket=bucket_name).get("Status")
        return status == "Enabled"
    except ClientError as exc:
        error_code = exc.response.get("Error", {}).get("Code", "")
        if error_code != "NoSuchBucket":
            logger.warning("Failed to check versioning for %s: %s", bucket_name, error_code)
        return False
def stream_objects_ndjson(
    client: Any,
    bucket_name: str,
    prefix: Optional[str],
    url_templates: dict[str, str],
    display_tz: str = "UTC",
    versioning_enabled: bool = False,
    delimiter: Optional[str] = None,
) -> Generator[str, None, None]:
    """Stream a bucket listing as newline-delimited JSON records.

    Record order: one "meta", an initial "count" of 0, then per listing page
    its "folder" records (CommonPrefixes), its "object" records and a
    running "count"; finally "done". A listing failure emits a single
    "error" record and ends the stream without "done".
    """

    def _line(record: dict[str, Any]) -> str:
        return json.dumps(record) + "\n"

    yield _line({
        "type": "meta",
        "versioning_enabled": versioning_enabled,
        "url_templates": url_templates,
    })
    yield _line({"type": "count", "total_count": 0})

    list_args: dict[str, Any] = {"Bucket": bucket_name, "MaxKeys": 1000}
    if prefix:
        list_args["Prefix"] = prefix
    if delimiter:
        list_args["Delimiter"] = delimiter

    seen = 0
    try:
        pages = client.get_paginator("list_objects_v2").paginate(**list_args)
        for page in pages:
            for common_prefix in page.get("CommonPrefixes", []):
                yield _line({"type": "folder", "prefix": common_prefix["Prefix"]})
            entries = page.get("Contents", [])
            for entry in entries:
                modified = entry["LastModified"]
                yield _line({
                    "type": "object",
                    "key": entry["Key"],
                    "size": entry["Size"],
                    "last_modified": modified.isoformat(),
                    "last_modified_display": format_datetime_display(modified, display_tz),
                    "last_modified_iso": format_datetime_iso(modified, display_tz),
                    "etag": entry.get("ETag", "").strip('"'),
                })
            seen += len(entries)
            yield _line({"type": "count", "total_count": seen})
    except ClientError as exc:
        error_msg = exc.response.get("Error", {}).get("Message", "S3 operation failed")
        yield _line({"type": "error", "error": error_msg})
        return
    except (EndpointConnectionError, ConnectionClosedError):
        yield _line({"type": "error", "error": "S3 API server is unreachable"})
        return
    yield _line({"type": "done"})

View File

@@ -1,4 +1,3 @@
"""Ephemeral store for one-time secrets communicated to the UI."""
from __future__ import annotations
import secrets
@@ -19,6 +18,18 @@ class EphemeralSecretStore:
self._store[token] = (payload, expires_at)
return token
def peek(self, token: str | None) -> Any | None:
    """Return the payload stored for ``token`` without consuming it.

    Returns ``None`` for an empty or unknown token. An entry whose expiry
    time has passed is removed and also reported as ``None``.
    """
    if not token:
        return None
    stored = self._store.get(token)
    if not stored:
        return None
    payload, deadline = stored
    if not (deadline < time.time()):
        return payload
    self._store.pop(token, None)
    return None
def pop(self, token: str | None) -> Any | None:
if not token:
return None

171
app/select_content.py Normal file
View File

@@ -0,0 +1,171 @@
"""S3 SelectObjectContent SQL query execution using DuckDB."""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Dict, Generator, Optional
try:
import duckdb
DUCKDB_AVAILABLE = True
except ImportError:
DUCKDB_AVAILABLE = False
class SelectError(Exception):
    """Raised when a SELECT query cannot be set up or executed."""
def execute_select_query(
    file_path: Path,
    expression: str,
    input_format: str,
    input_config: Dict[str, Any],
    output_format: str,
    output_config: Dict[str, Any],
    chunk_size: int = 65536,
) -> Generator[bytes, None, None]:
    """Run an S3 Select style SQL expression against one object file.

    The file is loaded into an in-memory DuckDB table named ``data``, the
    ``S3Object`` table reference in ``expression`` is rewritten to ``data``,
    and the result is streamed back as encoded chunks.

    Args:
        file_path: Object file to query.
        expression: SQL text from the request (untrusted).
        input_format: "CSV", "JSON" or "Parquet".
        input_config: Options forwarded to the CSV/JSON loader.
        output_format: "CSV" or "JSON".
        output_config: Options forwarded to the output serializer.
        chunk_size: Chunk size handed to the serializer.

    Yields:
        UTF-8 encoded pieces of the serialized result.

    Raises:
        SelectError: DuckDB missing, unsupported format, or SQL failure.
            Because this is a generator, errors surface on first iteration.
    """
    import re

    if not DUCKDB_AVAILABLE:
        raise SelectError("DuckDB is not installed. Install with: pip install duckdb")
    conn = duckdb.connect(":memory:")
    try:
        if input_format == "CSV":
            _load_csv(conn, file_path, input_config)
        elif input_format == "JSON":
            _load_json(conn, file_path, input_config)
        elif input_format == "Parquet":
            _load_parquet(conn, file_path)
        else:
            raise SelectError(f"Unsupported input format: {input_format}")
        # Match the alias in any letter case (e.g. "S3OBJECT"), not just the
        # two spellings the plain string replace covered.
        normalized_expression = re.sub(
            r"\bs3object\b", "data", expression, flags=re.IGNORECASE
        )
        try:
            # SECURITY: the expression is caller-supplied SQL. The object is
            # already materialized in ``data``, so switch off DuckDB's
            # external access before running it; otherwise the query could
            # read other files on the host (read_csv('/path'), COPY, ...).
            conn.execute("SET enable_external_access = false")
            result = conn.execute(normalized_expression)
        except duckdb.Error as exc:
            raise SelectError(f"SQL execution error: {exc}") from exc
        if output_format == "CSV":
            yield from _output_csv(result, output_config, chunk_size)
        elif output_format == "JSON":
            yield from _output_json(result, output_config, chunk_size)
        else:
            raise SelectError(f"Unsupported output format: {output_format}")
    finally:
        conn.close()
def _load_csv(conn, file_path: Path, config: Dict[str, Any]) -> None:
"""Load CSV file into DuckDB."""
file_header_info = config.get("file_header_info", "NONE")
delimiter = config.get("field_delimiter", ",")
quote = config.get("quote_character", '"')
header = file_header_info in ("USE", "IGNORE")
path_str = str(file_path).replace("\\", "/")
conn.execute(f"""
CREATE TABLE data AS
SELECT * FROM read_csv('{path_str}',
header={header},
delim='{delimiter}',
quote='{quote}'
)
""")
def _load_json(conn, file_path: Path, config: Dict[str, Any]) -> None:
"""Load JSON file into DuckDB."""
json_type = config.get("type", "DOCUMENT")
path_str = str(file_path).replace("\\", "/")
if json_type == "LINES":
conn.execute(f"""
CREATE TABLE data AS
SELECT * FROM read_json_auto('{path_str}', format='newline_delimited')
""")
else:
conn.execute(f"""
CREATE TABLE data AS
SELECT * FROM read_json_auto('{path_str}', format='array')
""")
def _load_parquet(conn, file_path: Path) -> None:
"""Load Parquet file into DuckDB."""
path_str = str(file_path).replace("\\", "/")
conn.execute(f"CREATE TABLE data AS SELECT * FROM read_parquet('{path_str}')")
def _output_csv(
result,
config: Dict[str, Any],
chunk_size: int,
) -> Generator[bytes, None, None]:
"""Output query results as CSV."""
delimiter = config.get("field_delimiter", ",")
record_delimiter = config.get("record_delimiter", "\n")
quote = config.get("quote_character", '"')
buffer = ""
while True:
rows = result.fetchmany(1000)
if not rows:
break
for row in rows:
fields = []
for value in row:
if value is None:
fields.append("")
elif isinstance(value, str):
if delimiter in value or quote in value or record_delimiter in value:
escaped = value.replace(quote, quote + quote)
fields.append(f'{quote}{escaped}{quote}')
else:
fields.append(value)
else:
fields.append(str(value))
buffer += delimiter.join(fields) + record_delimiter
while len(buffer) >= chunk_size:
yield buffer[:chunk_size].encode("utf-8")
buffer = buffer[chunk_size:]
if buffer:
yield buffer.encode("utf-8")
def _output_json(
result,
config: Dict[str, Any],
chunk_size: int,
) -> Generator[bytes, None, None]:
"""Output query results as JSON Lines."""
record_delimiter = config.get("record_delimiter", "\n")
columns = [desc[0] for desc in result.description]
buffer = ""
while True:
rows = result.fetchmany(1000)
if not rows:
break
for row in rows:
record = dict(zip(columns, row))
buffer += json.dumps(record, default=str) + record_delimiter
while len(buffer) >= chunk_size:
yield buffer[:chunk_size].encode("utf-8")
buffer = buffer[chunk_size:]
if buffer:
yield buffer.encode("utf-8")

177
app/site_registry.py Normal file
View File

@@ -0,0 +1,177 @@
from __future__ import annotations
import json
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
@dataclass
class SiteInfo:
    """Description of the local site as stored in the site registry file."""

    site_id: str
    endpoint: str
    region: str = "us-east-1"
    priority: int = 100
    display_name: str = ""
    created_at: Optional[float] = None
    updated_at: Optional[float] = None

    def __post_init__(self) -> None:
        # An empty display name falls back to the site id; a missing
        # creation time is stamped with the current time.
        self.display_name = self.display_name or self.site_id
        if self.created_at is None:
            self.created_at = time.time()

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict, in declaration order."""
        names = (
            "site_id",
            "endpoint",
            "region",
            "priority",
            "display_name",
            "created_at",
            "updated_at",
        )
        return {name: getattr(self, name) for name in names}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> SiteInfo:
        """Build an instance from a dict; only ``site_id`` is mandatory."""
        site_id = data["site_id"]
        fallbacks = {
            "endpoint": "",
            "region": "us-east-1",
            "priority": 100,
            "display_name": "",
            "created_at": None,
            "updated_at": None,
        }
        optional = {name: data.get(name, default) for name, default in fallbacks.items()}
        return cls(site_id=site_id, **optional)
@dataclass
class PeerSite:
    """Description of a peer site, including its last known health state."""

    site_id: str
    endpoint: str
    region: str = "us-east-1"
    priority: int = 100
    display_name: str = ""
    created_at: Optional[float] = None
    updated_at: Optional[float] = None
    connection_id: Optional[str] = None
    is_healthy: Optional[bool] = None
    last_health_check: Optional[float] = None

    def __post_init__(self) -> None:
        # An empty display name falls back to the site id; a missing
        # creation time is stamped with the current time.
        self.display_name = self.display_name or self.site_id
        if self.created_at is None:
            self.created_at = time.time()

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict, in declaration order."""
        names = (
            "site_id",
            "endpoint",
            "region",
            "priority",
            "display_name",
            "created_at",
            "updated_at",
            "connection_id",
            "is_healthy",
            "last_health_check",
        )
        return {name: getattr(self, name) for name in names}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> PeerSite:
        """Build an instance from a dict; only ``site_id`` is mandatory."""
        site_id = data["site_id"]
        fallbacks = {
            "endpoint": "",
            "region": "us-east-1",
            "priority": 100,
            "display_name": "",
            "created_at": None,
            "updated_at": None,
            "connection_id": None,
            "is_healthy": None,
            "last_health_check": None,
        }
        optional = {name: data.get(name, default) for name, default in fallbacks.items()}
        return cls(site_id=site_id, **optional)
class SiteRegistry:
    """JSON-file-backed registry of the local site and its peer sites.

    State is held in memory and written back to ``config_path`` by every
    mutating method.
    """

    def __init__(self, config_path: Path) -> None:
        self.config_path = config_path
        self._local_site: Optional[SiteInfo] = None
        self._peers: Dict[str, PeerSite] = {}
        self.reload()

    def reload(self) -> None:
        """Replace the in-memory state with the contents of the config file.

        A missing, unreadable or malformed file leaves the registry empty.
        "Malformed" includes valid JSON of the wrong shape (for example a
        top-level list, or ``"peers": null``), which used to escape the
        handler as AttributeError/TypeError and break construction.
        """
        local_site: Optional[SiteInfo] = None
        peers: Dict[str, PeerSite] = {}
        if self.config_path.exists():
            try:
                with open(self.config_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if data.get("local"):
                    local_site = SiteInfo.from_dict(data["local"])
                for peer_data in data.get("peers", []):
                    peer = PeerSite.from_dict(peer_data)
                    peers[peer.site_id] = peer
            except (OSError, ValueError, KeyError, TypeError, AttributeError):
                # ValueError covers json.JSONDecodeError and decode errors.
                # Discard anything parsed so far rather than keep half a file.
                local_site, peers = None, {}
        self._local_site = local_site
        self._peers = peers

    def save(self) -> None:
        """Write the current state to the config file.

        The JSON goes to a sibling temporary file that is then renamed over
        the target, so a failed write cannot truncate the existing config.
        """
        self.config_path.parent.mkdir(parents=True, exist_ok=True)
        data = {
            "local": self._local_site.to_dict() if self._local_site else None,
            "peers": [peer.to_dict() for peer in self._peers.values()],
        }
        tmp_path = self.config_path.with_name(self.config_path.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        tmp_path.replace(self.config_path)

    def get_local_site(self) -> Optional[SiteInfo]:
        """Return the local site, or ``None`` if none is configured."""
        return self._local_site

    def set_local_site(self, site: SiteInfo) -> None:
        """Store ``site`` as the local site, stamp ``updated_at`` and save."""
        site.updated_at = time.time()
        self._local_site = site
        self.save()

    def list_peers(self) -> List[PeerSite]:
        """Return the known peers as a new list."""
        return list(self._peers.values())

    def get_peer(self, site_id: str) -> Optional[PeerSite]:
        """Return the peer with ``site_id``, or ``None``."""
        return self._peers.get(site_id)

    def add_peer(self, peer: PeerSite) -> None:
        """Insert (or overwrite) a peer keyed by its ``site_id`` and save."""
        peer.created_at = peer.created_at or time.time()
        self._peers[peer.site_id] = peer
        self.save()

    def update_peer(self, peer: PeerSite) -> None:
        """Replace an existing peer, stamp ``updated_at`` and save.

        Raises:
            ValueError: if no peer with that ``site_id`` is registered.
        """
        if peer.site_id not in self._peers:
            raise ValueError(f"Peer {peer.site_id} not found")
        peer.updated_at = time.time()
        self._peers[peer.site_id] = peer
        self.save()

    def delete_peer(self, site_id: str) -> bool:
        """Remove a peer and save; returns ``False`` if it was not registered."""
        if site_id in self._peers:
            del self._peers[site_id]
            self.save()
            return True
        return False

    def update_health(self, site_id: str, is_healthy: bool) -> None:
        """Record a health-check result for a known peer and save.

        Unknown ``site_id`` values are ignored.
        """
        peer = self._peers.get(site_id)
        if peer:
            peer.is_healthy = is_healthy
            peer.last_health_check = time.time()
            self.save()

416
app/site_sync.py Normal file
View File

@@ -0,0 +1,416 @@
from __future__ import annotations
import json
import logging
import tempfile
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, TYPE_CHECKING
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
if TYPE_CHECKING:
from .connections import ConnectionStore, RemoteConnection
from .replication import ReplicationManager, ReplicationRule
from .storage import ObjectStorage
logger = logging.getLogger(__name__)
SITE_SYNC_USER_AGENT = "SiteSyncAgent/1.0"
@dataclass
class SyncedObjectInfo:
    """Per-object sync bookkeeping: when it was synced, the remote ETag seen
    at that time, and which side the object came from."""

    last_synced_at: float
    remote_etag: str
    source: str

    def to_dict(self) -> Dict[str, Any]:
        """Return the three fields as a plain dict."""
        names = ("last_synced_at", "remote_etag", "source")
        return {name: getattr(self, name) for name in names}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SyncedObjectInfo":
        """Build an instance from a dict; all three keys are required."""
        required = ("last_synced_at", "remote_etag", "source")
        return cls(**{name: data[name] for name in required})
@dataclass
class SyncState:
    """Persisted sync state for one bucket: tracked objects keyed by object
    key, plus the time of the last completed sync pass."""

    synced_objects: Dict[str, SyncedObjectInfo] = field(default_factory=dict)
    last_full_sync: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of the state."""
        serialized: Dict[str, Any] = {}
        for key, info in self.synced_objects.items():
            serialized[key] = info.to_dict()
        return {"synced_objects": serialized, "last_full_sync": self.last_full_sync}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SyncState":
        """Rebuild state from a dict; both keys are optional."""
        tracked = {
            key: SyncedObjectInfo.from_dict(raw)
            for key, raw in data.get("synced_objects", {}).items()
        }
        return cls(synced_objects=tracked, last_full_sync=data.get("last_full_sync"))
@dataclass
class SiteSyncStats:
    """Counters describing the outcome of one bucket sync pass."""

    last_sync_at: Optional[float] = None
    objects_pulled: int = 0
    objects_skipped: int = 0
    conflicts_resolved: int = 0
    deletions_applied: int = 0
    errors: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Return all counters as a plain dict, in declaration order."""
        names = (
            "last_sync_at",
            "objects_pulled",
            "objects_skipped",
            "conflicts_resolved",
            "deletions_applied",
            "errors",
        )
        return {name: getattr(self, name) for name in names}
@dataclass
class RemoteObjectMeta:
    """Listing metadata for one object on the remote side."""

    key: str
    size: int
    last_modified: datetime
    etag: str

    @classmethod
    def from_s3_object(cls, obj: Dict[str, Any]) -> "RemoteObjectMeta":
        """Build from one ``Contents`` entry of a ``list_objects_v2`` page.

        ``Key`` and ``LastModified`` are required; ``Size`` defaults to 0
        and the ETag is stored without its surrounding double quotes.
        """
        raw_etag = obj.get("ETag", "")
        return cls(
            obj["Key"],
            obj.get("Size", 0),
            obj["LastModified"],
            raw_etag.strip('"'),
        )
def _create_sync_client(
    connection: "RemoteConnection",
    *,
    connect_timeout: int = 10,
    read_timeout: int = 120,
    max_retries: int = 2,
) -> Any:
    """Build a boto3 S3 client for a remote connection.

    The client uses SigV4 and path-style addressing, sends the site-sync
    user-agent token, and falls back to region "us-east-1" when the
    connection does not specify one.
    """
    client_options = {
        "user_agent_extra": SITE_SYNC_USER_AGENT,
        "connect_timeout": connect_timeout,
        "read_timeout": read_timeout,
        "retries": {"max_attempts": max_retries},
        "signature_version": "s3v4",
        "s3": {"addressing_style": "path"},
        "request_checksum_calculation": "when_required",
        "response_checksum_validation": "when_required",
    }
    region = connection.region or "us-east-1"
    return boto3.client(
        "s3",
        endpoint_url=connection.endpoint_url,
        aws_access_key_id=connection.access_key,
        aws_secret_access_key=connection.secret_key,
        region_name=region,
        config=Config(**client_options),
    )
class SiteSyncWorker:
def __init__(
self,
storage: "ObjectStorage",
connections: "ConnectionStore",
replication_manager: "ReplicationManager",
storage_root: Path,
interval_seconds: int = 60,
batch_size: int = 100,
connect_timeout: int = 10,
read_timeout: int = 120,
max_retries: int = 2,
clock_skew_tolerance_seconds: float = 1.0,
):
self.storage = storage
self.connections = connections
self.replication_manager = replication_manager
self.storage_root = storage_root
self.interval_seconds = interval_seconds
self.batch_size = batch_size
self.connect_timeout = connect_timeout
self.read_timeout = read_timeout
self.max_retries = max_retries
self.clock_skew_tolerance_seconds = clock_skew_tolerance_seconds
self._lock = threading.Lock()
self._shutdown = threading.Event()
self._sync_thread: Optional[threading.Thread] = None
self._bucket_stats: Dict[str, SiteSyncStats] = {}
def _create_client(self, connection: "RemoteConnection") -> Any:
    """Build an S3 client for ``connection`` using this worker's timeout
    and retry settings."""
    settings = {
        "connect_timeout": self.connect_timeout,
        "read_timeout": self.read_timeout,
        "max_retries": self.max_retries,
    }
    return _create_sync_client(connection, **settings)
def start(self) -> None:
    """Start the background sync thread unless one is already running.

    Clears the shutdown flag first, so a worker that was shut down can be
    started again. The thread is a daemon named "site-sync-worker".
    """
    running = self._sync_thread
    if running is not None and running.is_alive():
        return
    self._shutdown.clear()
    worker = threading.Thread(
        target=self._sync_loop, name="site-sync-worker", daemon=True
    )
    self._sync_thread = worker
    worker.start()
    logger.info("Site sync worker started (interval=%ds)", self.interval_seconds)
def shutdown(self) -> None:
    """Signal the sync loop to stop and wait up to 10 seconds for it to exit."""
    self._shutdown.set()
    worker = self._sync_thread
    if worker is not None:
        worker.join(timeout=10.0)
    logger.info("Site sync worker shut down")
def trigger_sync(self, bucket_name: str) -> Optional[SiteSyncStats]:
    """Run one sync pass for ``bucket_name`` immediately, in the caller's thread.

    Returns ``None`` without syncing when the bucket has no rule, the rule
    is not bidirectional, or the rule is disabled. Otherwise returns the
    stats of the pass.

    The stats are also stored in ``_bucket_stats``, as the periodic cycle
    does, so ``get_stats`` reflects a manual sync instead of the previous
    scheduled one.
    """
    from .replication import REPLICATION_MODE_BIDIRECTIONAL
    rule = self.replication_manager.get_rule(bucket_name)
    if not rule or rule.mode != REPLICATION_MODE_BIDIRECTIONAL or not rule.enabled:
        return None
    stats = self._sync_bucket(rule)
    with self._lock:
        self._bucket_stats[bucket_name] = stats
    return stats
def get_stats(self, bucket_name: str) -> Optional[SiteSyncStats]:
    """Return the stats stored for ``bucket_name``, or ``None`` if there are
    none; the lookup is done while holding the worker lock."""
    with self._lock:
        recorded = self._bucket_stats.get(bucket_name)
    return recorded
def _sync_loop(self) -> None:
while not self._shutdown.is_set():
self._shutdown.wait(timeout=self.interval_seconds)
if self._shutdown.is_set():
break
self._run_sync_cycle()
def _run_sync_cycle(self) -> None:
    """Sync every enabled bidirectional rule once.

    Works on a snapshot of the replication manager's rules, stops early on
    shutdown, and records each bucket's stats under the worker lock. A
    failure in one bucket is logged and does not stop the others.
    """
    from .replication import REPLICATION_MODE_BIDIRECTIONAL
    rules_snapshot = list(self.replication_manager._rules.items())
    for bucket_name, rule in rules_snapshot:
        if self._shutdown.is_set():
            return
        eligible = rule.mode == REPLICATION_MODE_BIDIRECTIONAL and rule.enabled
        if not eligible:
            continue
        try:
            bucket_stats = self._sync_bucket(rule)
            with self._lock:
                self._bucket_stats[bucket_name] = bucket_stats
        except Exception as e:
            logger.exception("Site sync failed for bucket %s: %s", bucket_name, e)
def _sync_bucket(self, rule: "ReplicationRule") -> SiteSyncStats:
    """Run one pull-sync pass for the bucket described by ``rule``.

    Steps:
      1. List local and remote objects. A missing connection or a listing
         failure counts one error and ends the pass.
      2. Queue remote keys that are absent locally, or for which
         ``_resolve_conflict`` answers "pull".
      3. Pull queued keys until shutdown or until ``batch_size`` pulls have
         succeeded, recording each success in the sync state.
      4. If the rule syncs deletions, delete local objects that were pulled
         from the remote, are gone from the remote listing, and have not
         been modified locally since they were synced.
      5. Persist the sync state and stamp ``rule.last_pull_at``.
    """
    stats = SiteSyncStats()
    bucket = rule.bucket_name

    connection = self.connections.get(rule.target_connection_id)
    if not connection:
        logger.warning("Connection %s not found for bucket %s", rule.target_connection_id, rule.bucket_name)
        stats.errors += 1
        return stats

    try:
        local_objects = self._list_local_objects(bucket)
    except Exception as e:
        logger.error("Failed to list local objects for %s: %s", rule.bucket_name, e)
        stats.errors += 1
        return stats

    try:
        remote_objects = self._list_remote_objects(rule, connection)
    except Exception as e:
        logger.error("Failed to list remote objects for %s: %s", rule.bucket_name, e)
        stats.errors += 1
        return stats

    sync_state = self._load_sync_state(bucket)
    local_keys = set(local_objects.keys())
    remote_keys = set(remote_objects.keys())

    # Decide what to pull.
    pull_queue = []
    for key in remote_keys:
        local_meta = local_objects.get(key)
        if local_meta is None:
            pull_queue.append(key)
            continue
        if self._resolve_conflict(local_meta, remote_objects[key]) == "pull":
            pull_queue.append(key)
            stats.conflicts_resolved += 1
        else:
            stats.objects_skipped += 1

    # Pull; only successful pulls count toward the batch limit.
    pulled = 0
    for key in pull_queue:
        if self._shutdown.is_set():
            break
        if pulled >= self.batch_size:
            break
        remote_meta = remote_objects[key]
        if not self._pull_object(rule, key, connection, remote_meta):
            stats.errors += 1
            continue
        stats.objects_pulled += 1
        pulled += 1
        sync_state.synced_objects[key] = SyncedObjectInfo(
            last_synced_at=time.time(),
            remote_etag=remote_meta.etag,
            source="remote",
        )

    # Apply remote deletions.
    if rule.sync_deletions:
        for key in list(sync_state.synced_objects.keys()):
            if key in remote_keys or key not in local_keys:
                continue
            tracked = sync_state.synced_objects[key]
            if tracked.source != "remote":
                continue
            local_meta = local_objects.get(key)
            # A local modification after the last sync keeps the object.
            unchanged_since_sync = bool(local_meta) and (
                local_meta.last_modified.timestamp() <= tracked.last_synced_at
            )
            if not unchanged_since_sync:
                continue
            if self._apply_remote_deletion(bucket, key):
                stats.deletions_applied += 1
                del sync_state.synced_objects[key]

    sync_state.last_full_sync = time.time()
    self._save_sync_state(bucket, sync_state)

    with self.replication_manager._stats_lock:
        rule.last_pull_at = time.time()
    self.replication_manager.save_rules()

    stats.last_sync_at = time.time()
    logger.info(
        "Site sync completed for %s: pulled=%d, skipped=%d, conflicts=%d, deletions=%d, errors=%d",
        rule.bucket_name,
        stats.objects_pulled,
        stats.objects_skipped,
        stats.conflicts_resolved,
        stats.deletions_applied,
        stats.errors,
    )
    return stats
def _list_local_objects(self, bucket_name: str) -> Dict[str, Any]:
from .storage import ObjectMeta
objects = self.storage.list_objects_all(bucket_name)
return {obj.key: obj for obj in objects}
def _list_remote_objects(self, rule: "ReplicationRule", connection: "RemoteConnection") -> Dict[str, RemoteObjectMeta]:
s3 = self._create_client(connection)
result: Dict[str, RemoteObjectMeta] = {}
paginator = s3.get_paginator("list_objects_v2")
try:
for page in paginator.paginate(Bucket=rule.target_bucket):
for obj in page.get("Contents", []):
meta = RemoteObjectMeta.from_s3_object(obj)
result[meta.key] = meta
except ClientError as e:
if e.response["Error"]["Code"] == "NoSuchBucket":
return {}
raise
return result
def _resolve_conflict(self, local_meta: Any, remote_meta: RemoteObjectMeta) -> str:
local_ts = local_meta.last_modified.timestamp()
remote_ts = remote_meta.last_modified.timestamp()
if abs(remote_ts - local_ts) < self.clock_skew_tolerance_seconds:
local_etag = local_meta.etag or ""
if remote_meta.etag == local_etag:
return "skip"
return "pull" if remote_meta.etag > local_etag else "keep"
return "pull" if remote_ts > local_ts else "keep"
def _pull_object(
    self,
    rule: "ReplicationRule",
    object_key: str,
    connection: "RemoteConnection",
    remote_meta: RemoteObjectMeta,
) -> bool:
    """Download one remote object and store it locally.

    The object is downloaded to a temporary file under
    ``<storage_root>/.myfsio.sys/tmp``, its user metadata is read with a
    separate ``head_object`` call, and the file is handed to
    ``storage.put_object``. Returns ``True`` on success; failures are
    logged and reported as ``False``. The temporary file is always removed.
    """
    client = self._create_client(connection)
    staged_path = None
    try:
        staging_dir = self.storage_root / ".myfsio.sys" / "tmp"
        staging_dir.mkdir(parents=True, exist_ok=True)
        # Only the name is needed: the handle is closed right away and
        # download_file writes to the path.
        with tempfile.NamedTemporaryFile(dir=staging_dir, delete=False) as staged:
            staged_path = Path(staged.name)
        client.download_file(rule.target_bucket, object_key, str(staged_path))
        head = client.head_object(Bucket=rule.target_bucket, Key=object_key)
        user_metadata = head.get("Metadata", {})
        with open(staged_path, "rb") as payload:
            self.storage.put_object(
                rule.bucket_name,
                object_key,
                payload,
                metadata=user_metadata if user_metadata else None,
            )
        logger.debug("Pulled object %s/%s from remote", rule.bucket_name, object_key)
        return True
    except ClientError as e:
        logger.error("Failed to pull %s/%s: %s", rule.bucket_name, object_key, e)
        return False
    except Exception as e:
        logger.error("Failed to store pulled object %s/%s: %s", rule.bucket_name, object_key, e)
        return False
    finally:
        if staged_path and staged_path.exists():
            try:
                staged_path.unlink()
            except OSError:
                pass
def _apply_remote_deletion(self, bucket_name: str, object_key: str) -> bool:
    """Delete a local object whose remote counterpart is gone.

    Returns ``True`` on success; any exception is logged and reported as
    ``False``.
    """
    try:
        self.storage.delete_object(bucket_name, object_key)
        logger.debug("Applied remote deletion for %s/%s", bucket_name, object_key)
    except Exception as e:
        logger.error("Failed to apply remote deletion for %s/%s: %s", bucket_name, object_key, e)
        return False
    return True
def _sync_state_path(self, bucket_name: str) -> Path:
return self.storage_root / ".myfsio.sys" / "buckets" / bucket_name / "site_sync_state.json"
def _load_sync_state(self, bucket_name: str) -> SyncState:
    """Load the persisted sync state for a bucket.

    Returns an empty ``SyncState`` when the file is missing or cannot be
    used; the latter is logged as a warning. "Cannot be used" includes
    valid JSON of the wrong shape (a top-level list, non-dict entries) and
    undecodable bytes, which used to escape as AttributeError / TypeError /
    ValueError and abort the bucket's sync.
    """
    path = self._sync_state_path(bucket_name)
    if not path.exists():
        return SyncState()
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        return SyncState.from_dict(data)
    except (ValueError, OSError, KeyError, TypeError, AttributeError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Failed to load sync state for %s: %s", bucket_name, e)
        return SyncState()
def _save_sync_state(self, bucket_name: str, state: SyncState) -> None:
path = self._sync_state_path(bucket_name)
path.parent.mkdir(parents=True, exist_ok=True)
try:
path.write_text(json.dumps(state.to_dict(), indent=2), encoding="utf-8")
except OSError as e:
logger.warning("Failed to save sync state for %s: %s", bucket_name, e)

File diff suppressed because it is too large Load Diff

215
app/system_metrics.py Normal file
View File

@@ -0,0 +1,215 @@
from __future__ import annotations
import json
import logging
import threading
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, TYPE_CHECKING
import psutil
if TYPE_CHECKING:
from .storage import ObjectStorage
logger = logging.getLogger(__name__)
@dataclass
class SystemMetricsSnapshot:
    """One sample of CPU, memory and disk usage plus total stored bytes."""

    timestamp: datetime
    cpu_percent: float
    memory_percent: float
    disk_percent: float
    storage_bytes: int

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict.

        The timestamp is formatted as ``YYYY-MM-DDTHH:MM:SSZ`` and the
        percentages are rounded to two decimals.
        """
        serialized: Dict[str, Any] = {
            "timestamp": self.timestamp.strftime("%Y-%m-%dT%H:%M:%SZ"),
        }
        for name in ("cpu_percent", "memory_percent", "disk_percent"):
            serialized[name] = round(getattr(self, name), 2)
        serialized["storage_bytes"] = self.storage_bytes
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SystemMetricsSnapshot":
        """Rebuild a snapshot from ``to_dict`` output.

        Only ``timestamp`` is required. A trailing "Z" is rewritten to
        "+00:00" before the string is given to ``datetime.fromisoformat``.
        """
        raw = data["timestamp"]
        iso_text = raw[:-1] + "+00:00" if raw.endswith("Z") else raw
        return cls(
            timestamp=datetime.fromisoformat(iso_text),
            cpu_percent=data.get("cpu_percent", 0.0),
            memory_percent=data.get("memory_percent", 0.0),
            disk_percent=data.get("disk_percent", 0.0),
            storage_bytes=data.get("storage_bytes", 0),
        )
class SystemMetricsCollector:
    """Samples host metrics on a background thread and keeps a bounded history.

    History is persisted as JSON under
    ``<storage_root>/.myfsio.sys/config/metrics_history.json`` and pruned to
    ``retention_hours``.
    """

    def __init__(
        self,
        storage_root: Path,
        interval_minutes: int = 5,
        retention_hours: int = 24,
    ):
        """Load any saved history and start the daemon sampling thread.

        Args:
            storage_root: Root directory of the object store; also the path
                whose disk usage is reported.
            interval_minutes: Minutes between snapshots.
            retention_hours: Snapshots older than this are discarded.
        """
        self.storage_root = storage_root
        self.interval_seconds = interval_minutes * 60
        self.retention_hours = retention_hours
        self._lock = threading.Lock()
        self._shutdown = threading.Event()
        self._snapshots: List[SystemMetricsSnapshot] = []
        self._storage_ref: Optional["ObjectStorage"] = None
        self._load_history()
        self._snapshot_thread = threading.Thread(
            target=self._snapshot_loop,
            name="system-metrics-snapshot",
            daemon=True,
        )
        self._snapshot_thread.start()

    def set_storage(self, storage: "ObjectStorage") -> None:
        """Attach the storage backend used to compute bucket totals."""
        with self._lock:
            self._storage_ref = storage

    def _config_path(self) -> Path:
        """Return the location of the persisted history file."""
        return self.storage_root / ".myfsio.sys" / "config" / "metrics_history.json"

    def _load_history(self) -> None:
        """Populate the in-memory history from disk, tolerating a bad file.

        This runs from ``__init__``, so a malformed file (wrong JSON shape, bad
        timestamp, non-string fields) must only cost the history, never the
        collector itself.
        """
        config_path = self._config_path()
        if not config_path.exists():
            return
        try:
            data = json.loads(config_path.read_text(encoding="utf-8"))
            history_data = data.get("history", [])
            self._snapshots = [SystemMetricsSnapshot.from_dict(s) for s in history_data]
            self._prune_old_snapshots()
        except (OSError, KeyError, ValueError, TypeError, AttributeError) as e:
            # ValueError also covers json.JSONDecodeError and bad ISO timestamps;
            # TypeError/AttributeError cover a file whose JSON is not the
            # expected {"history": [ {...}, ... ]} shape.
            logger.warning(f"Failed to load system metrics history: {e}")

    def _save_history(self) -> None:
        """Write the current history to disk; failures are logged, not raised."""
        config_path = self._config_path()
        try:
            # Directory creation sits inside the try so a read-only or missing
            # volume is reported the same way as a failed write.
            config_path.parent.mkdir(parents=True, exist_ok=True)
            data = {"history": [s.to_dict() for s in self._snapshots]}
            config_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        except OSError as e:
            logger.warning(f"Failed to save system metrics history: {e}")

    def _prune_old_snapshots(self) -> None:
        """Drop snapshots older than ``retention_hours``."""
        if not self._snapshots:
            return
        cutoff = datetime.now(timezone.utc).timestamp() - (self.retention_hours * 3600)
        self._snapshots = [
            s for s in self._snapshots if s.timestamp.timestamp() > cutoff
        ]

    def _snapshot_loop(self) -> None:
        """Thread body: sleep one interval, snapshot, repeat until shutdown."""
        while not self._shutdown.is_set():
            # Event.wait doubles as an interruptible sleep.
            self._shutdown.wait(timeout=self.interval_seconds)
            if not self._shutdown.is_set():
                self._take_snapshot()

    def _take_snapshot(self) -> None:
        """Sample CPU, memory, disk and storage bytes, then persist the history.

        Any failure is logged and swallowed so the sampling thread keeps running.
        """
        try:
            cpu_percent = psutil.cpu_percent(interval=0.1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage(str(self.storage_root))
            storage_bytes = 0
            # Copy the reference under the lock, then query storage without
            # holding it.
            with self._lock:
                storage = self._storage_ref
            if storage:
                try:
                    buckets = storage.list_buckets()
                    for bucket in buckets:
                        stats = storage.bucket_stats(bucket.name, cache_ttl=60)
                        storage_bytes += stats.get("total_bytes", stats.get("bytes", 0))
                except Exception as e:
                    logger.warning(f"Failed to collect bucket stats: {e}")
            snapshot = SystemMetricsSnapshot(
                timestamp=datetime.now(timezone.utc),
                cpu_percent=cpu_percent,
                memory_percent=memory.percent,
                disk_percent=disk.percent,
                storage_bytes=storage_bytes,
            )
            with self._lock:
                self._snapshots.append(snapshot)
                self._prune_old_snapshots()
                self._save_history()
            logger.debug(f"System metrics snapshot taken: CPU={cpu_percent:.1f}%, Memory={memory.percent:.1f}%")
        except Exception as e:
            logger.warning(f"Failed to take system metrics snapshot: {e}")

    def get_current(self) -> Dict[str, Any]:
        """Return live host metrics and aggregate bucket statistics.

        The result has ``cpu_percent`` plus ``memory``, ``disk`` and ``app``
        sub-dicts. Bucket totals stay at zero when no storage is attached or
        when collecting them fails.
        """
        cpu_percent = psutil.cpu_percent(interval=0.1)
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage(str(self.storage_root))
        boot_time = psutil.boot_time()
        uptime_seconds = time.time() - boot_time
        uptime_days = int(uptime_seconds / 86400)
        total_buckets = 0
        total_objects = 0
        total_bytes_used = 0
        total_versions = 0
        with self._lock:
            storage = self._storage_ref
        if storage:
            try:
                buckets = storage.list_buckets()
                total_buckets = len(buckets)
                for bucket in buckets:
                    stats = storage.bucket_stats(bucket.name, cache_ttl=60)
                    total_objects += stats.get("total_objects", stats.get("objects", 0))
                    total_bytes_used += stats.get("total_bytes", stats.get("bytes", 0))
                    total_versions += stats.get("version_count", 0)
            except Exception as e:
                logger.warning(f"Failed to collect current bucket stats: {e}")
        return {
            "cpu_percent": round(cpu_percent, 2),
            "memory": {
                "total": memory.total,
                "available": memory.available,
                "used": memory.used,
                "percent": round(memory.percent, 2),
            },
            "disk": {
                "total": disk.total,
                "free": disk.free,
                "used": disk.used,
                "percent": round(disk.percent, 2),
            },
            "app": {
                "buckets": total_buckets,
                "objects": total_objects,
                "versions": total_versions,
                "storage_bytes": total_bytes_used,
                "uptime_days": uptime_days,
            },
        }

    def get_history(self, hours: Optional[int] = None) -> List[Dict[str, Any]]:
        """Return stored snapshots as dicts, optionally only the last ``hours``.

        A falsy ``hours`` (``None`` or ``0``) returns the whole retained history.
        """
        with self._lock:
            snapshots = list(self._snapshots)
        if hours:
            cutoff = datetime.now(timezone.utc).timestamp() - (hours * 3600)
            snapshots = [s for s in snapshots if s.timestamp.timestamp() > cutoff]
        return [s.to_dict() for s in snapshots]

    def shutdown(self) -> None:
        """Stop the sampling thread after recording one final snapshot."""
        self._shutdown.set()
        self._take_snapshot()
        self._snapshot_thread.join(timeout=5.0)

3435
app/ui.py

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,6 @@
"""Central location for the application version string."""
from __future__ import annotations
APP_VERSION = "0.1.0"
APP_VERSION = "0.4.3"
def get_version() -> str:

108
app/website_domains.py Normal file
View File

@@ -0,0 +1,108 @@
from __future__ import annotations
import json
import re
import threading
from pathlib import Path
from typing import Dict, List, Optional
# Lower-case host name pattern: one or more dot-separated labels made of
# [a-z0-9-], where no label starts or ends with a hyphen.
_DOMAIN_RE = re.compile(
    r"^(?!-)[a-z0-9]([a-z0-9-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)*$"
)
def normalize_domain(raw: str) -> str:
    """Reduce a URL-ish string to a bare lower-case host name.

    Trims whitespace, lower-cases, removes a leading ``https://`` and then a
    leading ``http://``, cuts everything from the first ``/``, ``?`` or ``#``,
    and finally drops whatever follows the last ``:`` (the port).
    """
    host = raw.strip().lower()
    # Checked in this order on purpose: "https://http://x" loses both schemes.
    if host.startswith("https://"):
        host = host[len("https://"):]
    if host.startswith("http://"):
        host = host[len("http://"):]
    for delimiter in ("/", "?", "#"):
        host = host.partition(delimiter)[0]
    if ":" in host:
        host = host.rpartition(":")[0]
    return host
def is_valid_domain(domain: str) -> bool:
    """Return True when ``domain`` is a syntactically valid lower-case host name.

    Rejects empty input, names longer than 253 characters, any label longer
    than 63 characters, and anything ``_DOMAIN_RE`` does not match in full.
    """
    if not domain or len(domain) > 253:
        return False
    if any(len(label) > 63 for label in domain.split(".")):
        return False
    # fullmatch, not match: with match() the pattern's trailing "$" also
    # accepts a name followed by a newline.
    return _DOMAIN_RE.fullmatch(domain) is not None
class WebsiteDomainStore:
    """JSON-file backed mapping of website domain -> bucket name.

    Domains are stored lower-cased. Every public method takes the instance lock
    and re-reads the file when its mtime has changed, so edits made outside this
    object are picked up by reads and not overwritten by writes.
    """

    def __init__(self, config_path: Path) -> None:
        """Bind the store to ``config_path`` and load it if it exists."""
        self.config_path = config_path
        self._lock = threading.Lock()
        self._domains: Dict[str, str] = {}
        self._last_mtime: float = 0.0
        self.reload()

    def _read_mappings(self) -> Dict[str, str]:
        """Parse the config file; a non-object JSON document yields ``{}``.

        Raises:
            OSError: the file cannot be opened or read.
            ValueError: the file is not valid UTF-8 JSON.
        """
        with open(self.config_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            return {k.lower(): v for k, v in data.items()}
        return {}

    def reload(self) -> None:
        """Unconditionally reload from disk; missing or unreadable means empty."""
        if not self.config_path.exists():
            self._domains = {}
            self._last_mtime = 0.0
            return
        try:
            self._last_mtime = self.config_path.stat().st_mtime
            self._domains = self._read_mappings()
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            self._domains = {}

    def _maybe_reload(self) -> None:
        """Reload only when the file's mtime changed (or the file vanished)."""
        try:
            if self.config_path.exists():
                mtime = self.config_path.stat().st_mtime
                if mtime != self._last_mtime:
                    # Recorded before parsing so one bad write is not re-parsed
                    # on every call.
                    self._last_mtime = mtime
                    self._domains = self._read_mappings()
            elif self._domains:
                self._domains = {}
                self._last_mtime = 0.0
        except (OSError, ValueError):
            # Best effort: keep serving the last good mapping.
            pass

    def _save(self) -> None:
        """Write the mapping to disk and remember the resulting mtime."""
        self.config_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.config_path, "w", encoding="utf-8") as f:
            json.dump(self._domains, f, indent=2)
        self._last_mtime = self.config_path.stat().st_mtime

    def list_all(self) -> List[Dict[str, str]]:
        """Return every mapping as ``{"domain": ..., "bucket": ...}`` dicts."""
        with self._lock:
            self._maybe_reload()
            return [{"domain": d, "bucket": b} for d, b in self._domains.items()]

    def get_bucket(self, domain: str) -> Optional[str]:
        """Return the bucket mapped to ``domain`` (case-insensitive), or None."""
        with self._lock:
            self._maybe_reload()
            return self._domains.get(domain.lower())

    def get_domains_for_bucket(self, bucket: str) -> List[str]:
        """Return all domains that point at ``bucket``."""
        with self._lock:
            self._maybe_reload()
            return [d for d, b in self._domains.items() if b == bucket]

    def set_mapping(self, domain: str, bucket: str) -> None:
        """Create or replace the mapping for ``domain`` and persist it."""
        with self._lock:
            # Merge on top of the latest on-disk state, not a stale copy.
            self._maybe_reload()
            self._domains[domain.lower()] = bucket
            self._save()

    def delete_mapping(self, domain: str) -> bool:
        """Remove the mapping for ``domain``; return False if it did not exist."""
        with self._lock:
            self._maybe_reload()
            key = domain.lower()
            if key not in self._domains:
                return False
            del self._domains[key]
            self._save()
            return True

6
docker-entrypoint.sh Normal file
View File

@@ -0,0 +1,6 @@
#!/bin/sh
# Entrypoint script: replaces this shell with `python run.py --prod`, passing
# the engine named by $ENGINE ("rust" when unset or empty).
set -e

# Assign the default in place; same effect as ENGINE="${ENGINE:-rust}".
: "${ENGINE:=rust}"

exec python run.py --prod --engine "$ENGINE"

2367
docs.md

File diff suppressed because it is too large Load Diff

3443
myfsio-engine/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

45
myfsio-engine/Cargo.toml Normal file
View File

@@ -0,0 +1,45 @@
# Cargo workspace manifest for the myfsio-engine crates.
[workspace]
resolver = "2"
# Crates that belong to this workspace.
members = [
"crates/myfsio-common",
"crates/myfsio-auth",
"crates/myfsio-crypto",
"crates/myfsio-storage",
"crates/myfsio-xml",
"crates/myfsio-server",
]
# Dependency versions and features declared once for the whole workspace;
# a member crate pulls one in with `{ workspace = true }`.
[workspace.dependencies]
tokio = { version = "1", features = ["full"] }
axum = { version = "0.8" }
tower = { version = "0.5" }
tower-http = { version = "0.6", features = ["cors", "trace"] }
hyper = { version = "1" }
bytes = "1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
quick-xml = { version = "0.37", features = ["serialize"] }
hmac = "0.12"
sha2 = "0.10"
md-5 = "0.10"
hex = "0.4"
aes = "0.8"
aes-gcm = "0.10"
cbc = { version = "0.1", features = ["alloc"] }
hkdf = "0.12"
uuid = { version = "1", features = ["v4"] }
parking_lot = "0.12"
lru = "0.14"
percent-encoding = "2"
regex = "1"
unicode-normalization = "0.1"
tracing = "0.1"
tracing-subscriber = "0.3"
thiserror = "2"
chrono = { version = "0.4", features = ["serde"] }
base64 = "0.22"
tokio-util = { version = "0.7", features = ["io"] }
futures = "0.3"
dashmap = "6"
crc32fast = "1"
duckdb = { version = "1", features = ["bundled"] }

View File

@@ -0,0 +1,26 @@
# Manifest for the `myfsio-auth` crate.
[package]
name = "myfsio-auth"
version = "0.1.0"
edition = "2021"
[dependencies]
# Sibling crate, referenced by relative path.
myfsio-common = { path = "../myfsio-common" }
hmac = { workspace = true }
sha2 = { workspace = true }
hex = { workspace = true }
aes = { workspace = true }
cbc = { workspace = true }
base64 = { workspace = true }
# Versioned here directly; every other third-party dependency in this list
# uses `workspace = true`.
pbkdf2 = "0.12"
lru = { workspace = true }
parking_lot = { workspace = true }
percent-encoding = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
chrono = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
# Only needed when building tests.
[dev-dependencies]
tempfile = "3"

View File

@@ -0,0 +1,80 @@
use aes::cipher::{block_padding::Pkcs7, BlockDecryptMut, KeyIvInit};
use base64::{engine::general_purpose::URL_SAFE, Engine};
use hmac::{Hmac, Mac};
use sha2::Sha256;
/// AES-128 decryptor in CBC mode.
type Aes128CbcDec = cbc::Decryptor<aes::Aes128>;
/// HMAC keyed with SHA-256.
type HmacSha256 = Hmac<Sha256>;
/// Derives a 32-byte key from `secret` with PBKDF2-HMAC-SHA256 (fixed salt,
/// 100 000 rounds) and returns it as padded URL-safe base64.
pub fn derive_fernet_key(secret: &str) -> String {
    const SALT: &[u8] = b"myfsio-iam-encryption";
    const ROUNDS: u32 = 100_000;

    let mut key_material = [0u8; 32];
    pbkdf2::pbkdf2_hmac::<Sha256>(secret.as_bytes(), SALT, ROUNDS, &mut key_material);
    URL_SAFE.encode(key_material)
}
pub fn decrypt(key_b64: &str, token: &str) -> Result<Vec<u8>, &'static str> {
let key_bytes = URL_SAFE
.decode(key_b64)
.map_err(|_| "invalid fernet key base64")?;
if key_bytes.len() != 32 {
return Err("fernet key must be 32 bytes");
}
let signing_key = &key_bytes[..16];
let encryption_key = &key_bytes[16..];
let token_bytes = URL_SAFE
.decode(token)
.map_err(|_| "invalid fernet token base64")?;
if token_bytes.len() < 57 {
return Err("fernet token too short");
}
if token_bytes[0] != 0x80 {
return Err("invalid fernet version");
}
let hmac_offset = token_bytes.len() - 32;
let payload = &token_bytes[..hmac_offset];
let expected_hmac = &token_bytes[hmac_offset..];
let mut mac =
HmacSha256::new_from_slice(signing_key).map_err(|_| "hmac key error")?;
mac.update(payload);
mac.verify_slice(expected_hmac)
.map_err(|_| "HMAC verification failed")?;
let iv = &token_bytes[9..25];
let ciphertext = &token_bytes[25..hmac_offset];
let plaintext = Aes128CbcDec::new(encryption_key.into(), iv.into())
.decrypt_padded_vec_mut::<Pkcs7>(ciphertext)
.map_err(|_| "AES-CBC decryption failed")?;
Ok(plaintext)
}
#[cfg(test)]
mod tests {
    use super::*;
    /// The derived key must be URL-safe base64 that decodes to 32 bytes.
    #[test]
    fn test_derive_fernet_key_format() {
        let key = derive_fernet_key("test-secret");
        let decoded = URL_SAFE.decode(&key).unwrap();
        assert_eq!(decoded.len(), 32);
    }
    // NOTE(review): despite the name, this only re-checks the decoded key
    // length for a different secret; no token is encrypted or decrypted here.
    // Presumably a placeholder for a real cross-language round trip — confirm.
    #[test]
    fn test_roundtrip_with_python_compat() {
        let key = derive_fernet_key("dev-secret-key");
        let decoded = URL_SAFE.decode(&key).unwrap();
        assert_eq!(decoded.len(), 32);
    }
}

View File

@@ -0,0 +1,812 @@
use chrono::{DateTime, Utc};
use myfsio_common::types::Principal;
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::{Instant, SystemTime};
/// Versioned IAM configuration: a format version plus the list of users.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IamConfig {
    /// Format version; defaults to `default_version()` when absent.
    #[serde(default = "default_version")]
    pub version: u32,
    /// All configured users; an absent field deserializes to an empty list.
    #[serde(default)]
    pub users: Vec<IamUser>,
}
/// Serde default for `IamConfig::version`.
fn default_version() -> u32 {
    2
}
/// A normalized IAM user with its credentials and policies.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IamUser {
    /// Stable identifier of the user.
    pub user_id: String,
    /// Human-readable name.
    pub display_name: String,
    /// Whether the user is enabled; defaults to `true` when absent.
    #[serde(default = "default_enabled")]
    pub enabled: bool,
    /// Optional expiry timestamp, kept as the raw string from the config.
    #[serde(default)]
    pub expires_at: Option<String>,
    /// Access keys owned by this user.
    #[serde(default)]
    pub access_keys: Vec<AccessKey>,
    /// Policies attached to this user.
    #[serde(default)]
    pub policies: Vec<IamPolicy>,
}
/// Lenient deserialization target for the IAM config file; each entry is a
/// `RawIamUser`, which accepts fields that `IamUser` requires as optional.
#[derive(Debug, Clone, Deserialize)]
struct RawIamConfig {
    /// Users exactly as found in the file.
    #[serde(default)]
    pub users: Vec<RawIamUser>,
}
/// A user entry as it may appear on disk: identifiers are optional, and a
/// single `access_key`/`secret_key` pair may appear directly on the user.
#[derive(Debug, Clone, Deserialize)]
struct RawIamUser {
    /// Explicit user id, if the file provides one.
    pub user_id: Option<String>,
    /// Explicit display name, if the file provides one.
    pub display_name: Option<String>,
    /// Whether the user is enabled; defaults to `true` when absent.
    #[serde(default = "default_enabled")]
    pub enabled: bool,
    /// Optional expiry timestamp string.
    #[serde(default)]
    pub expires_at: Option<String>,
    /// Access key given directly on the user rather than in `access_keys`.
    pub access_key: Option<String>,
    /// Secret paired with `access_key`.
    pub secret_key: Option<String>,
    /// Access keys given as a list.
    #[serde(default)]
    pub access_keys: Vec<AccessKey>,
    /// Policies attached to this user.
    #[serde(default)]
    pub policies: Vec<IamPolicy>,
}
impl RawIamUser {
fn normalize(self) -> IamUser {
let mut access_keys = self.access_keys;
if access_keys.is_empty() {
if let (Some(ak), Some(sk)) = (self.access_key, self.secret_key) {
access_keys.push(AccessKey {
access_key: ak,
secret_key: sk,
status: "active".to_string(),
created_at: None,
});
}
}
let display_name = self.display_name.unwrap_or_else(|| {
access_keys.first().map(|k| k.access_key.clone()).unwrap_or_else(|| "unknown".to_string())
});
let user_id = self.user_id.unwrap_or_else(|| {
format!("u-{}", display_name.to_ascii_lowercase().replace(' ', "-"))
});
IamUser {
user_id,
display_name,
enabled: self.enabled,
expires_at: self.expires_at,
access_keys,
policies: self.policies,
}
}
}
/// Serde default for the `enabled` flag: users are enabled unless stated otherwise.
fn default_enabled() -> bool {
    true
}
/// One access-key / secret-key credential pair.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AccessKey {
    /// Public key identifier.
    pub access_key: String,
    /// Secret paired with `access_key`.
    pub secret_key: String,
    /// Key status string; defaults to `"active"` when absent.
    #[serde(default = "default_status")]
    pub status: String,
    /// Optional creation timestamp string.
    #[serde(default)]
    pub created_at: Option<String>,
}
/// Serde default for `AccessKey::status`.
fn default_status() -> String {
    "active".to_string()
}
/// A grant of actions on a bucket, optionally limited to a key prefix.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IamPolicy {
    /// Bucket name the policy applies to, or `"*"` for every bucket.
    pub bucket: String,
    /// Granted action names; `"*"` grants every action.
    pub actions: Vec<String>,
    /// Object-key prefix the policy applies to; defaults to `"*"`.
    #[serde(default = "default_prefix")]
    pub prefix: String,
}
/// Serde default for `IamPolicy::prefix`: match every key.
fn default_prefix() -> String {
    "*".to_string()
}
/// In-memory lookup tables built from the IAM config file.
struct IamState {
    /// Access key -> secret key.
    key_secrets: HashMap<String, String>,
    /// Access key -> owning user id.
    key_index: HashMap<String, String>,
    /// Access key -> status string.
    key_status: HashMap<String, String>,
    /// User id -> user record.
    user_records: HashMap<String, IamUser>,
    /// Modification time of the config file recorded at the last load, if known.
    file_mtime: Option<SystemTime>,
    /// When the file's modification time was last checked.
    last_check: Instant,
}
/// File-backed IAM service holding users, access keys and policies.
pub struct IamService {
    /// Location of the IAM config file.
    config_path: PathBuf,
    /// Lookup tables guarded by a read/write lock.
    state: Arc<RwLock<IamState>>,
    /// Minimum time between checks of the config file's modification time.
    check_interval: std::time::Duration,
    /// Key used to decrypt an encrypted config, when a secret was supplied.
    fernet_key: Option<String>,
}
impl IamService {
pub fn new(config_path: PathBuf) -> Self {
Self::new_with_secret(config_path, None)
}
pub fn new_with_secret(config_path: PathBuf, secret_key: Option<String>) -> Self {
let fernet_key = secret_key.map(|s| crate::fernet::derive_fernet_key(&s));
let service = Self {
config_path,
state: Arc::new(RwLock::new(IamState {
key_secrets: HashMap::new(),
key_index: HashMap::new(),
key_status: HashMap::new(),
user_records: HashMap::new(),
file_mtime: None,
last_check: Instant::now(),
})),
check_interval: std::time::Duration::from_secs(2),
fernet_key,
};
service.reload();
service
}
fn reload_if_needed(&self) {
{
let state = self.state.read();
if state.last_check.elapsed() < self.check_interval {
return;
}
}
let current_mtime = std::fs::metadata(&self.config_path)
.and_then(|m| m.modified())
.ok();
let needs_reload = {
let state = self.state.read();
match (&state.file_mtime, &current_mtime) {
(None, Some(_)) => true,
(Some(old), Some(new)) => old != new,
(Some(_), None) => true,
(None, None) => state.key_secrets.is_empty(),
}
};
if needs_reload {
self.reload();
}
self.state.write().last_check = Instant::now();
}
fn reload(&self) {
let content = match std::fs::read_to_string(&self.config_path) {
Ok(c) => c,
Err(e) => {
tracing::warn!("Failed to read IAM config {}: {}", self.config_path.display(), e);
return;
}
};
let raw = if content.starts_with("MYFSIO_IAM_ENC:") {
let encrypted_token = &content["MYFSIO_IAM_ENC:".len()..];
match &self.fernet_key {
Some(key) => match crate::fernet::decrypt(key, encrypted_token.trim()) {
Ok(plaintext) => match String::from_utf8(plaintext) {
Ok(s) => s,
Err(e) => {
tracing::error!("Decrypted IAM config is not valid UTF-8: {}", e);
return;
}
},
Err(e) => {
tracing::error!("Failed to decrypt IAM config: {}. SECRET_KEY may have changed.", e);
return;
}
},
None => {
tracing::error!("IAM config is encrypted but no SECRET_KEY configured");
return;
}
}
} else {
content
};
let raw_config: RawIamConfig = match serde_json::from_str(&raw) {
Ok(c) => c,
Err(e) => {
tracing::error!("Failed to parse IAM config: {}", e);
return;
}
};
let users: Vec<IamUser> = raw_config.users.into_iter().map(|u| u.normalize()).collect();
let mut key_secrets = HashMap::new();
let mut key_index = HashMap::new();
let mut key_status = HashMap::new();
let mut user_records = HashMap::new();
for user in &users {
user_records.insert(user.user_id.clone(), user.clone());
for ak in &user.access_keys {
key_secrets.insert(ak.access_key.clone(), ak.secret_key.clone());
key_index.insert(ak.access_key.clone(), user.user_id.clone());
key_status.insert(ak.access_key.clone(), ak.status.clone());
}
}
let file_mtime = std::fs::metadata(&self.config_path)
.and_then(|m| m.modified())
.ok();
let mut state = self.state.write();
state.key_secrets = key_secrets;
state.key_index = key_index;
state.key_status = key_status;
state.user_records = user_records;
state.file_mtime = file_mtime;
state.last_check = Instant::now();
tracing::info!("IAM config reloaded: {} users, {} keys",
users.len(),
state.key_secrets.len());
}
pub fn get_secret_key(&self, access_key: &str) -> Option<String> {
self.reload_if_needed();
let state = self.state.read();
let status = state.key_status.get(access_key)?;
if status != "active" {
return None;
}
let user_id = state.key_index.get(access_key)?;
let user = state.user_records.get(user_id)?;
if !user.enabled {
return None;
}
if let Some(ref expires_at) = user.expires_at {
if let Ok(exp) = expires_at.parse::<DateTime<Utc>>() {
if Utc::now() > exp {
return None;
}
}
}
state.key_secrets.get(access_key).cloned()
}
pub fn get_principal(&self, access_key: &str) -> Option<Principal> {
self.reload_if_needed();
let state = self.state.read();
let status = state.key_status.get(access_key)?;
if status != "active" {
return None;
}
let user_id = state.key_index.get(access_key)?;
let user = state.user_records.get(user_id)?;
if !user.enabled {
return None;
}
if let Some(ref expires_at) = user.expires_at {
if let Ok(exp) = expires_at.parse::<DateTime<Utc>>() {
if Utc::now() > exp {
return None;
}
}
}
let is_admin = user.policies.iter().any(|p| {
p.bucket == "*" && p.actions.iter().any(|a| a == "*")
});
Some(Principal::new(
access_key.to_string(),
user.user_id.clone(),
user.display_name.clone(),
is_admin,
))
}
pub fn authenticate(&self, access_key: &str, secret_key: &str) -> Option<Principal> {
let stored_secret = self.get_secret_key(access_key)?;
if !crate::sigv4::constant_time_compare(&stored_secret, secret_key) {
return None;
}
self.get_principal(access_key)
}
pub fn authorize(
&self,
principal: &Principal,
bucket_name: Option<&str>,
action: &str,
object_key: Option<&str>,
) -> bool {
self.reload_if_needed();
if principal.is_admin {
return true;
}
let normalized_bucket = bucket_name
.unwrap_or("*")
.trim()
.to_ascii_lowercase();
let normalized_action = action.trim().to_ascii_lowercase();
let state = self.state.read();
let user = match state.user_records.get(&principal.user_id) {
Some(u) => u,
None => return false,
};
if !user.enabled {
return false;
}
if let Some(ref expires_at) = user.expires_at {
if let Ok(exp) = expires_at.parse::<DateTime<Utc>>() {
if Utc::now() > exp {
return false;
}
}
}
for policy in &user.policies {
if !bucket_matches(&policy.bucket, &normalized_bucket) {
continue;
}
if !action_matches(&policy.actions, &normalized_action) {
continue;
}
if let Some(key) = object_key {
if !prefix_matches(&policy.prefix, key) {
continue;
}
}
return true;
}
false
}
pub async fn list_users(&self) -> Vec<serde_json::Value> {
self.reload_if_needed();
let state = self.state.read();
state
.user_records
.values()
.map(|u| {
serde_json::json!({
"user_id": u.user_id,
"display_name": u.display_name,
"enabled": u.enabled,
"access_keys": u.access_keys.iter().map(|k| {
serde_json::json!({
"access_key": k.access_key,
"status": k.status,
"created_at": k.created_at,
})
}).collect::<Vec<_>>(),
"policy_count": u.policies.len(),
})
})
.collect()
}
pub async fn get_user(&self, identifier: &str) -> Option<serde_json::Value> {
self.reload_if_needed();
let state = self.state.read();
let user = state
.user_records
.get(identifier)
.or_else(|| {
state.key_index.get(identifier).and_then(|uid| state.user_records.get(uid))
})?;
Some(serde_json::json!({
"user_id": user.user_id,
"display_name": user.display_name,
"enabled": user.enabled,
"expires_at": user.expires_at,
"access_keys": user.access_keys.iter().map(|k| {
serde_json::json!({
"access_key": k.access_key,
"status": k.status,
"created_at": k.created_at,
})
}).collect::<Vec<_>>(),
"policies": user.policies,
}))
}
pub async fn set_user_enabled(&self, identifier: &str, enabled: bool) -> Result<(), String> {
let content = std::fs::read_to_string(&self.config_path)
.map_err(|e| format!("Failed to read IAM config: {}", e))?;
let raw: RawIamConfig = serde_json::from_str(&content)
.map_err(|e| format!("Failed to parse IAM config: {}", e))?;
let mut config = IamConfig {
version: 2,
users: raw.users.into_iter().map(|u| u.normalize()).collect(),
};
let user = config
.users
.iter_mut()
.find(|u| {
u.user_id == identifier
|| u.access_keys.iter().any(|k| k.access_key == identifier)
})
.ok_or_else(|| "User not found".to_string())?;
user.enabled = enabled;
let json = serde_json::to_string_pretty(&config)
.map_err(|e| format!("Failed to serialize IAM config: {}", e))?;
std::fs::write(&self.config_path, json)
.map_err(|e| format!("Failed to write IAM config: {}", e))?;
self.reload();
Ok(())
}
pub fn get_user_policies(&self, identifier: &str) -> Option<Vec<serde_json::Value>> {
self.reload_if_needed();
let state = self.state.read();
let user = state
.user_records
.get(identifier)
.or_else(|| {
state.key_index.get(identifier).and_then(|uid| state.user_records.get(uid))
})?;
Some(
user.policies
.iter()
.map(|p| serde_json::to_value(p).unwrap_or_default())
.collect(),
)
}
pub fn create_access_key(&self, identifier: &str) -> Result<serde_json::Value, String> {
let content = std::fs::read_to_string(&self.config_path)
.map_err(|e| format!("Failed to read IAM config: {}", e))?;
let raw: RawIamConfig = serde_json::from_str(&content)
.map_err(|e| format!("Failed to parse IAM config: {}", e))?;
let mut config = IamConfig {
version: 2,
users: raw.users.into_iter().map(|u| u.normalize()).collect(),
};
let user = config
.users
.iter_mut()
.find(|u| {
u.user_id == identifier
|| u.access_keys.iter().any(|k| k.access_key == identifier)
})
.ok_or_else(|| format!("User '{}' not found", identifier))?;
let new_ak = format!("AK{}", uuid::Uuid::new_v4().simple());
let new_sk = format!("SK{}", uuid::Uuid::new_v4().simple());
let key = AccessKey {
access_key: new_ak.clone(),
secret_key: new_sk.clone(),
status: "active".to_string(),
created_at: Some(chrono::Utc::now().to_rfc3339()),
};
user.access_keys.push(key);
let json = serde_json::to_string_pretty(&config)
.map_err(|e| format!("Failed to serialize IAM config: {}", e))?;
std::fs::write(&self.config_path, json)
.map_err(|e| format!("Failed to write IAM config: {}", e))?;
self.reload();
Ok(serde_json::json!({
"access_key": new_ak,
"secret_key": new_sk,
}))
}
pub fn delete_access_key(&self, access_key: &str) -> Result<(), String> {
let content = std::fs::read_to_string(&self.config_path)
.map_err(|e| format!("Failed to read IAM config: {}", e))?;
let raw: RawIamConfig = serde_json::from_str(&content)
.map_err(|e| format!("Failed to parse IAM config: {}", e))?;
let mut config = IamConfig {
version: 2,
users: raw.users.into_iter().map(|u| u.normalize()).collect(),
};
let mut found = false;
for user in &mut config.users {
if user.access_keys.iter().any(|k| k.access_key == access_key) {
if user.access_keys.len() <= 1 {
return Err("Cannot delete the last access key".to_string());
}
user.access_keys.retain(|k| k.access_key != access_key);
found = true;
break;
}
}
if !found {
return Err(format!("Access key '{}' not found", access_key));
}
let json = serde_json::to_string_pretty(&config)
.map_err(|e| format!("Failed to serialize IAM config: {}", e))?;
std::fs::write(&self.config_path, json)
.map_err(|e| format!("Failed to write IAM config: {}", e))?;
self.reload();
Ok(())
}
}
/// Returns `true` when the policy's bucket (trimmed, lower-cased) is `"*"` or
/// equals `bucket` exactly. `bucket` itself is compared as given.
fn bucket_matches(policy_bucket: &str, bucket: &str) -> bool {
    let wanted = policy_bucket.trim().to_ascii_lowercase();
    match wanted.as_str() {
        "*" => true,
        exact => exact == bucket,
    }
}
/// Returns `true` when any policy action (trimmed, lower-cased) grants `action`:
/// `"*"` grants everything, `"iam:*"` grants every action starting with
/// `"iam:"`, and anything else must equal `action` exactly.
fn action_matches(policy_actions: &[String], action: &str) -> bool {
    policy_actions.iter().any(|candidate| {
        let granted = candidate.trim().to_ascii_lowercase();
        match granted.as_str() {
            "*" => true,
            // Also covers the literal action "iam:*", which starts with "iam:".
            "iam:*" => action.starts_with("iam:"),
            exact => exact == action,
        }
    })
}
/// Returns `true` when `object_key` falls under the policy prefix.
///
/// An empty or `"*"` prefix (after trimming) matches every key; otherwise
/// trailing `*` characters are stripped and the rest must be a prefix of
/// `object_key`.
fn prefix_matches(policy_prefix: &str, object_key: &str) -> bool {
    match policy_prefix.trim() {
        "" | "*" => true,
        pattern => object_key.starts_with(pattern.trim_end_matches('*')),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Writes `config_json` to a temp file and opens an `IamService` on it.
    /// The temp file is returned so it outlives the service under test.
    fn service_for(config_json: &str) -> (tempfile::NamedTempFile, IamService) {
        let mut tmp = tempfile::NamedTempFile::new().unwrap();
        tmp.write_all(config_json.as_bytes()).unwrap();
        tmp.flush().unwrap();
        let svc = IamService::new(tmp.path().to_path_buf());
        (tmp, svc)
    }

    /// A single enabled admin user with one active key.
    fn test_iam_json() -> String {
        serde_json::json!({
            "version": 2,
            "users": [{
                "user_id": "u-test1234",
                "display_name": "admin",
                "enabled": true,
                "access_keys": [{
                    "access_key": "AKIAIOSFODNN7EXAMPLE",
                    "secret_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
                    "status": "active",
                    "created_at": "2024-01-01T00:00:00Z"
                }],
                "policies": [{
                    "bucket": "*",
                    "actions": ["*"],
                    "prefix": "*"
                }]
            }]
        })
        .to_string()
    }

    /// Config with one user that owns a single key and no policies.
    fn single_key_json(
        user_id: &str,
        display_name: &str,
        enabled: bool,
        access_key: &str,
        status: &str,
    ) -> String {
        serde_json::json!({
            "version": 2,
            "users": [{
                "user_id": user_id,
                "display_name": display_name,
                "enabled": enabled,
                "access_keys": [{
                    "access_key": access_key,
                    "secret_key": "secret123",
                    "status": status
                }],
                "policies": []
            }]
        })
        .to_string()
    }

    #[test]
    fn test_load_and_lookup() {
        let (_tmp, svc) = service_for(&test_iam_json());
        let secret = svc.get_secret_key("AKIAIOSFODNN7EXAMPLE");
        assert_eq!(
            secret.unwrap(),
            "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
        );
    }

    #[test]
    fn test_get_principal() {
        let (_tmp, svc) = service_for(&test_iam_json());
        let principal = svc.get_principal("AKIAIOSFODNN7EXAMPLE").unwrap();
        assert_eq!(principal.display_name, "admin");
        assert_eq!(principal.user_id, "u-test1234");
        assert!(principal.is_admin);
    }

    #[test]
    fn test_authenticate_success() {
        let (_tmp, svc) = service_for(&test_iam_json());
        let principal = svc
            .authenticate(
                "AKIAIOSFODNN7EXAMPLE",
                "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
            )
            .unwrap();
        assert_eq!(principal.display_name, "admin");
    }

    #[test]
    fn test_authenticate_wrong_secret() {
        let (_tmp, svc) = service_for(&test_iam_json());
        assert!(svc.authenticate("AKIAIOSFODNN7EXAMPLE", "wrongsecret").is_none());
    }

    #[test]
    fn test_unknown_key_returns_none() {
        let (_tmp, svc) = service_for(&test_iam_json());
        assert!(svc.get_secret_key("NONEXISTENTKEY").is_none());
        assert!(svc.get_principal("NONEXISTENTKEY").is_none());
    }

    #[test]
    fn test_disabled_user() {
        let json = single_key_json("u-disabled", "disabled-user", false, "DISABLED_KEY", "active");
        let (_tmp, svc) = service_for(&json);
        assert!(svc.get_secret_key("DISABLED_KEY").is_none());
    }

    #[test]
    fn test_inactive_key() {
        let json = single_key_json("u-test", "test", true, "INACTIVE_KEY", "inactive");
        let (_tmp, svc) = service_for(&json);
        assert!(svc.get_secret_key("INACTIVE_KEY").is_none());
    }

    #[test]
    fn test_v1_flat_format() {
        let json = serde_json::json!({
            "users": [{
                "access_key": "test",
                "secret_key": "secret",
                "display_name": "Test User",
                "policies": [{"bucket": "*", "actions": ["*"], "prefix": "*"}]
            }]
        })
        .to_string();
        let (_tmp, svc) = service_for(&json);
        let secret = svc.get_secret_key("test");
        assert_eq!(secret.unwrap(), "secret");
        let principal = svc.get_principal("test").unwrap();
        assert_eq!(principal.display_name, "Test User");
        assert!(principal.is_admin);
    }

    #[test]
    fn test_authorize_allows_matching_policy() {
        let json = serde_json::json!({
            "version": 2,
            "users": [{
                "user_id": "u-reader",
                "display_name": "reader",
                "enabled": true,
                "access_keys": [{
                    "access_key": "READER_KEY",
                    "secret_key": "reader-secret",
                    "status": "active"
                }],
                "policies": [{
                    "bucket": "docs",
                    "actions": ["read"],
                    "prefix": "reports/"
                }]
            }]
        })
        .to_string();
        let (_tmp, svc) = service_for(&json);
        let principal = svc.get_principal("READER_KEY").unwrap();

        // (bucket, action, key, expected decision)
        let cases = [
            ("docs", "read", "reports/2026.csv", true),
            ("docs", "write", "reports/2026.csv", false),
            ("docs", "read", "private/2026.csv", false),
            ("other", "read", "reports/2026.csv", false),
        ];
        for (bucket, action, key, expected) in cases {
            assert_eq!(
                svc.authorize(&principal, Some(bucket), action, Some(key)),
                expected
            );
        }
    }
}

View File

@@ -0,0 +1,4 @@
pub mod sigv4;
pub mod principal;
pub mod iam;
mod fernet;

View File

@@ -0,0 +1 @@
pub use myfsio_common::types::Principal;

View File

@@ -0,0 +1,258 @@
use hmac::{Hmac, Mac};
use lru::LruCache;
use parking_lot::Mutex;
use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC};
use sha2::{Digest, Sha256};
use std::num::NonZeroUsize;
use std::sync::LazyLock;
use std::time::Instant;
/// HMAC keyed with SHA-256.
type HmacSha256 = Hmac<Sha256>;
/// A cached signing key together with the moment it was stored.
struct CacheEntry {
    /// The derived signing key bytes.
    key: Vec<u8>,
    /// When this entry was inserted; compared against `CACHE_TTL_SECS`.
    created: Instant,
}
/// Process-wide LRU cache (capacity 256) of derived signing keys, keyed by
/// `(secret_key, date_stamp, region, service)`.
static SIGNING_KEY_CACHE: LazyLock<Mutex<LruCache<(String, String, String, String), CacheEntry>>> =
    LazyLock::new(|| Mutex::new(LruCache::new(NonZeroUsize::new(256).unwrap())));
/// Age in seconds after which a cached signing key is recomputed.
const CACHE_TTL_SECS: u64 = 60;
/// Percent-encode every byte except ASCII alphanumerics and `-`, `_`, `.`, `~`.
const AWS_ENCODE_SET: &AsciiSet = &NON_ALPHANUMERIC
    .remove(b'-')
    .remove(b'_')
    .remove(b'.')
    .remove(b'~');
/// Computes HMAC-SHA256 of `msg` under `key` and returns the raw tag bytes.
fn hmac_sha256(key: &[u8], msg: &[u8]) -> Vec<u8> {
    let tag = HmacSha256::new_from_slice(key)
        .expect("HMAC key length is always valid")
        .chain_update(msg)
        .finalize();
    tag.into_bytes().to_vec()
}
/// Returns the SHA-256 digest of `data` as a lower-case hex string.
fn sha256_hex(data: &[u8]) -> String {
    hex::encode(Sha256::digest(data))
}
/// Percent-encodes `input` using `AWS_ENCODE_SET`.
fn aws_uri_encode(input: &str) -> String {
    let encoded: String = percent_encode(input.as_bytes(), AWS_ENCODE_SET).collect();
    encoded
}
/// Derives the SigV4 signing key for the given secret and credential scope,
/// serving it from `SIGNING_KEY_CACHE` while the cached entry is younger than
/// `CACHE_TTL_SECS`.
///
/// The key is the HMAC chain `"AWS4" + secret -> date -> region -> service ->
/// "aws4_request"`. The cache lock is released while the key is computed.
pub fn derive_signing_key_cached(
    secret_key: &str,
    date_stamp: &str,
    region: &str,
    service: &str,
) -> Vec<u8> {
    let cache_key = (
        secret_key.to_string(),
        date_stamp.to_string(),
        region.to_string(),
        service.to_string(),
    );

    {
        let mut cache = SIGNING_KEY_CACHE.lock();
        // Outer Option: entry present? Inner Option: still fresh (then clone)?
        let lookup = cache.get(&cache_key).map(|entry| {
            (entry.created.elapsed().as_secs() < CACHE_TTL_SECS).then(|| entry.key.clone())
        });
        match lookup {
            Some(Some(fresh)) => return fresh,
            Some(None) => {
                // Stale entry: evict it before recomputing.
                cache.pop(&cache_key);
            }
            None => {}
        }
    }

    let seed = hmac_sha256(format!("AWS4{}", secret_key).as_bytes(), date_stamp.as_bytes());
    let scope_parts: [&[u8]; 3] = [region.as_bytes(), service.as_bytes(), b"aws4_request"];
    let k_signing = scope_parts
        .iter()
        .fold(seed, |key, part| hmac_sha256(&key, part));

    SIGNING_KEY_CACHE.lock().put(
        cache_key,
        CacheEntry {
            key: k_signing.clone(),
            created: Instant::now(),
        },
    );
    k_signing
}
/// Compares two byte slices without returning early on the first difference.
///
/// Slices of different length are unequal immediately; otherwise every byte
/// pair is XOR-ed and OR-ed into one accumulator that is checked at the end.
fn constant_time_compare_inner(a: &[u8], b: &[u8]) -> bool {
    if a.len() != b.len() {
        return false;
    }
    let diff = a.iter().zip(b).fold(0u8, |acc, (x, y)| acc | (x ^ y));
    diff == 0
}
/// Recomputes a SigV4 signature from the request components and compares it
/// with `provided_signature`.
///
/// * `query_params` – decoded name/value pairs. Each name and value is
///   percent-encoded first and the encoded pairs are then sorted (by name,
///   then value), because SigV4 requires sorting to happen after encoding.
/// * `header_values` – the signed headers as `(name, value)`; they are emitted
///   in the order given, with names lower-cased and runs of whitespace in
///   values collapsed to a single space. An `expect` header with an empty
///   value is canonicalised as `100-continue`.
/// * `signed_headers_str`, `payload_hash`, `amz_date`, `date_stamp`, `region`,
///   `service` – inserted verbatim into the canonical request / string to sign.
///
/// Returns `true` when the lowercase hex signature computed here equals
/// `provided_signature` byte for byte.
pub fn verify_sigv4_signature(
    method: &str,
    canonical_uri: &str,
    query_params: &[(String, String)],
    signed_headers_str: &str,
    header_values: &[(String, String)],
    payload_hash: &str,
    amz_date: &str,
    date_stamp: &str,
    region: &str,
    service: &str,
    secret_key: &str,
    provided_signature: &str,
) -> bool {
    // Encode first, then sort: raw and encoded orderings differ for bytes such
    // as '/' (encoded to "%2F") versus '-' (left as-is).
    let mut encoded_params: Vec<(String, String)> = query_params
        .iter()
        .map(|(k, v)| (aws_uri_encode(k), aws_uri_encode(v)))
        .collect();
    encoded_params.sort();
    let canonical_query_string = encoded_params
        .iter()
        .map(|(k, v)| format!("{}={}", k, v))
        .collect::<Vec<_>>()
        .join("&");

    let mut canonical_headers = String::new();
    for (name, value) in header_values {
        let lower_name = name.to_lowercase();
        let normalized = value.split_whitespace().collect::<Vec<_>>().join(" ");
        let final_value = if lower_name == "expect" && normalized.is_empty() {
            "100-continue"
        } else {
            normalized.as_str()
        };
        canonical_headers.push_str(&lower_name);
        canonical_headers.push(':');
        canonical_headers.push_str(final_value);
        canonical_headers.push('\n');
    }

    let canonical_request = format!(
        "{}\n{}\n{}\n{}\n{}\n{}",
        method, canonical_uri, canonical_query_string, canonical_headers, signed_headers_str,
        payload_hash
    );
    let credential_scope = format!("{}/{}/{}/aws4_request", date_stamp, region, service);
    let cr_hash = sha256_hex(canonical_request.as_bytes());
    let string_to_sign = format!(
        "AWS4-HMAC-SHA256\n{}\n{}\n{}",
        amz_date, credential_scope, cr_hash
    );

    let signing_key = derive_signing_key_cached(secret_key, date_stamp, region, service);
    let calculated = hmac_sha256(&signing_key, string_to_sign.as_bytes());
    let calculated_hex = hex::encode(&calculated);
    constant_time_compare_inner(calculated_hex.as_bytes(), provided_signature.as_bytes())
}
/// Public entry point for signing-key derivation; forwards all arguments
/// unchanged to `derive_signing_key_cached` and returns its result.
pub fn derive_signing_key(
secret_key: &str,
date_stamp: &str,
region: &str,
service: &str,
) -> Vec<u8> {
derive_signing_key_cached(secret_key, date_stamp, region, service)
}
/// Signs `string_to_sign` with `signing_key` (HMAC-SHA256) and returns the
/// MAC hex-encoded.
pub fn compute_signature(signing_key: &[u8], string_to_sign: &str) -> String {
    hex::encode(hmac_sha256(signing_key, string_to_sign.as_bytes()))
}
/// Builds the string to sign: the literal `AWS4-HMAC-SHA256`, `amz_date`,
/// `credential_scope` and the hex SHA-256 of `canonical_request`, joined by
/// newlines.
pub fn build_string_to_sign(
    amz_date: &str,
    credential_scope: &str,
    canonical_request: &str,
) -> String {
    let request_digest = sha256_hex(canonical_request.as_bytes());
    [
        "AWS4-HMAC-SHA256",
        amz_date,
        credential_scope,
        request_digest.as_str(),
    ]
    .join("\n")
}
/// String front-end for `constant_time_compare_inner`: compares the UTF-8
/// bytes of `a` and `b`.
pub fn constant_time_compare(a: &str, b: &str) -> bool {
constant_time_compare_inner(a.as_bytes(), b.as_bytes())
}
/// Removes every entry from `SIGNING_KEY_CACHE`.
pub fn clear_signing_key_cache() {
SIGNING_KEY_CACHE.lock().clear();
}
// Unit tests for the SigV4 helpers in this file.
#[cfg(test)]
mod tests {
use super::*;
// A derived signing key is a 32-byte value.
#[test]
fn test_derive_signing_key() {
let key = derive_signing_key("wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "20130524", "us-east-1", "s3");
assert_eq!(key.len(), 32);
}
// Two derivations with identical inputs return identical keys.
#[test]
fn test_derive_signing_key_cached() {
let key1 = derive_signing_key("secret", "20240101", "us-east-1", "s3");
let key2 = derive_signing_key("secret", "20240101", "us-east-1", "s3");
assert_eq!(key1, key2);
}
// Equal strings match; differing content or length does not.
#[test]
fn test_constant_time_compare() {
assert!(constant_time_compare("abc", "abc"));
assert!(!constant_time_compare("abc", "abd"));
assert!(!constant_time_compare("abc", "abcd"));
}
// The string to sign starts with the algorithm line and contains the timestamp.
#[test]
fn test_build_string_to_sign() {
let result = build_string_to_sign("20130524T000000Z", "20130524/us-east-1/s3/aws4_request", "GET\n/\n\nhost:example.com\n\nhost\nUNSIGNED-PAYLOAD");
assert!(result.starts_with("AWS4-HMAC-SHA256\n"));
assert!(result.contains("20130524T000000Z"));
}
// Space and '/' are percent-encoded; '-', '_' and '.' are left untouched.
#[test]
fn test_aws_uri_encode() {
assert_eq!(aws_uri_encode("hello world"), "hello%20world");
assert_eq!(aws_uri_encode("test-file_name.txt"), "test-file_name.txt");
assert_eq!(aws_uri_encode("a/b"), "a%2Fb");
}
// A signature produced with the public helpers verifies against the same inputs.
#[test]
fn test_verify_sigv4_roundtrip() {
let secret = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY";
let date_stamp = "20130524";
let region = "us-east-1";
let service = "s3";
let amz_date = "20130524T000000Z";
let signing_key = derive_signing_key(secret, date_stamp, region, service);
let canonical_request = "GET\n/\n\nhost:examplebucket.s3.amazonaws.com\n\nhost\nUNSIGNED-PAYLOAD";
let string_to_sign = build_string_to_sign(amz_date, &format!("{}/{}/{}/aws4_request", date_stamp, region, service), canonical_request);
let signature = compute_signature(&signing_key, &string_to_sign);
let result = verify_sigv4_signature(
"GET",
"/",
&[],
"host",
&[("host".to_string(), "examplebucket.s3.amazonaws.com".to_string())],
"UNSIGNED-PAYLOAD",
amz_date,
date_stamp,
region,
service,
secret,
&signature,
);
assert!(result);
}
}

View File

@@ -0,0 +1,11 @@
[package]
name = "myfsio-common"
version = "0.1.0"
edition = "2021"
[dependencies]
thiserror = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
chrono = { workspace = true }
uuid = { workspace = true }

View File

@@ -0,0 +1,20 @@
// Shared name and size constants. The on-disk layout these names describe is
// not visible in this file; the notes below are limited to what the values show.
/// Name of the system root entry.
pub const SYSTEM_ROOT: &str = ".myfsio.sys";
/// Name of the buckets entry (presumably below `SYSTEM_ROOT` — confirm against callers).
pub const SYSTEM_BUCKETS_DIR: &str = "buckets";
/// Name of the multipart entry (presumably below `SYSTEM_ROOT` — confirm against callers).
pub const SYSTEM_MULTIPART_DIR: &str = "multipart";
/// Name of the per-bucket metadata entry.
pub const BUCKET_META_DIR: &str = "meta";
/// Name of the per-bucket versions entry.
pub const BUCKET_VERSIONS_DIR: &str = "versions";
/// File name of the bucket configuration document.
pub const BUCKET_CONFIG_FILE: &str = ".bucket.json";
/// File name of the stats document.
pub const STATS_FILE: &str = "stats.json";
/// File name of the ETag index.
pub const ETAG_INDEX_FILE: &str = "etag_index.json";
/// File name of the index document.
pub const INDEX_FILE: &str = "_index.json";
/// File name of the manifest document.
pub const MANIFEST_FILE: &str = "manifest.json";
/// Dot-prefixed folder names treated as internal.
// NOTE(review): these are dotted (".meta", ".versions") while BUCKET_META_DIR /
// BUCKET_VERSIONS_DIR are not — confirm both spellings are intended.
pub const INTERNAL_FOLDERS: &[&str] = &[".meta", ".versions", ".multipart"];
/// Default region name.
pub const DEFAULT_REGION: &str = "us-east-1";
/// Service name.
pub const AWS_SERVICE: &str = "s3";
/// Default maximum number of keys.
pub const DEFAULT_MAX_KEYS: usize = 1000;
/// Default maximum object key length, in bytes.
pub const DEFAULT_OBJECT_KEY_MAX_BYTES: usize = 1024;
/// Default chunk size: 65536 bytes (64 KiB).
pub const DEFAULT_CHUNK_SIZE: usize = 65536;
/// Streaming chunk size: 1_048_576 bytes (1 MiB).
pub const STREAM_CHUNK_SIZE: usize = 1_048_576;

View File

@@ -0,0 +1,221 @@
use std::fmt;
/// Error codes used in S3-style error responses.
///
/// Each variant maps to an HTTP status ([`S3ErrorCode::http_status`]), a wire
/// name ([`S3ErrorCode::as_str`]) and a default human-readable message
/// ([`S3ErrorCode::default_message`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum S3ErrorCode {
    AccessDenied,
    BucketAlreadyExists,
    BucketNotEmpty,
    EntityTooLarge,
    InternalError,
    InvalidAccessKeyId,
    InvalidArgument,
    InvalidBucketName,
    InvalidKey,
    InvalidRange,
    InvalidRequest,
    MalformedXML,
    MethodNotAllowed,
    NoSuchBucket,
    NoSuchKey,
    NoSuchUpload,
    NoSuchVersion,
    NoSuchTagSet,
    PreconditionFailed,
    NotModified,
    QuotaExceeded,
    SignatureDoesNotMatch,
    SlowDown,
}

impl S3ErrorCode {
    /// HTTP status code associated with this error code.
    pub fn http_status(&self) -> u16 {
        match *self {
            Self::NotModified => 304,
            Self::InvalidArgument
            | Self::InvalidBucketName
            | Self::InvalidKey
            | Self::InvalidRequest
            | Self::MalformedXML => 400,
            Self::AccessDenied
            | Self::InvalidAccessKeyId
            | Self::QuotaExceeded
            | Self::SignatureDoesNotMatch => 403,
            Self::NoSuchBucket
            | Self::NoSuchKey
            | Self::NoSuchUpload
            | Self::NoSuchVersion
            | Self::NoSuchTagSet => 404,
            Self::MethodNotAllowed => 405,
            Self::BucketAlreadyExists | Self::BucketNotEmpty => 409,
            Self::PreconditionFailed => 412,
            Self::EntityTooLarge => 413,
            Self::InvalidRange => 416,
            Self::SlowDown => 429,
            Self::InternalError => 500,
        }
    }

    /// Wire name of the code; identical to the variant name.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::AccessDenied => "AccessDenied",
            Self::BucketAlreadyExists => "BucketAlreadyExists",
            Self::BucketNotEmpty => "BucketNotEmpty",
            Self::EntityTooLarge => "EntityTooLarge",
            Self::InternalError => "InternalError",
            Self::InvalidAccessKeyId => "InvalidAccessKeyId",
            Self::InvalidArgument => "InvalidArgument",
            Self::InvalidBucketName => "InvalidBucketName",
            Self::InvalidKey => "InvalidKey",
            Self::InvalidRange => "InvalidRange",
            Self::InvalidRequest => "InvalidRequest",
            Self::MalformedXML => "MalformedXML",
            Self::MethodNotAllowed => "MethodNotAllowed",
            Self::NoSuchBucket => "NoSuchBucket",
            Self::NoSuchKey => "NoSuchKey",
            Self::NoSuchUpload => "NoSuchUpload",
            Self::NoSuchVersion => "NoSuchVersion",
            Self::NoSuchTagSet => "NoSuchTagSet",
            Self::PreconditionFailed => "PreconditionFailed",
            Self::NotModified => "NotModified",
            Self::QuotaExceeded => "QuotaExceeded",
            Self::SignatureDoesNotMatch => "SignatureDoesNotMatch",
            Self::SlowDown => "SlowDown",
        }
    }

    /// Default message used when no custom message is supplied.
    pub fn default_message(&self) -> &'static str {
        match *self {
            Self::AccessDenied => "Access Denied",
            Self::BucketAlreadyExists => "The requested bucket name is not available",
            Self::BucketNotEmpty => "The bucket you tried to delete is not empty",
            Self::EntityTooLarge => "Your proposed upload exceeds the maximum allowed size",
            Self::InternalError => "We encountered an internal error. Please try again.",
            Self::InvalidAccessKeyId => "The access key ID you provided does not exist",
            Self::InvalidArgument => "Invalid argument",
            Self::InvalidBucketName => "The specified bucket is not valid",
            Self::InvalidKey => "The specified key is not valid",
            Self::InvalidRange => "The requested range is not satisfiable",
            Self::InvalidRequest => "Invalid request",
            Self::MalformedXML => "The XML you provided was not well-formed",
            Self::MethodNotAllowed => "The specified method is not allowed against this resource",
            Self::NoSuchBucket => "The specified bucket does not exist",
            Self::NoSuchKey => "The specified key does not exist",
            Self::NoSuchUpload => "The specified multipart upload does not exist",
            Self::NoSuchVersion => "The specified version does not exist",
            Self::NoSuchTagSet => "The TagSet does not exist",
            Self::PreconditionFailed => "At least one of the preconditions you specified did not hold",
            Self::NotModified => "Not Modified",
            Self::QuotaExceeded => "The bucket quota has been exceeded",
            Self::SignatureDoesNotMatch => "The request signature we calculated does not match the signature you provided",
            Self::SlowDown => "Please reduce your request rate",
        }
    }
}

impl fmt::Display for S3ErrorCode {
    /// Writes the wire name returned by `as_str`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.as_str())
    }
}
/// An S3-style error: a code plus message, resource and request-id strings.
#[derive(Debug, Clone)]
pub struct S3Error {
    pub code: S3ErrorCode,
    pub message: String,
    pub resource: String,
    pub request_id: String,
}

impl S3Error {
    /// Creates an error with the given code and message; `resource` and
    /// `request_id` start out empty.
    pub fn new(code: S3ErrorCode, message: impl Into<String>) -> Self {
        let message = message.into();
        Self {
            code,
            message,
            resource: String::new(),
            request_id: String::new(),
        }
    }

    /// Creates an error whose message is the code's default message.
    pub fn from_code(code: S3ErrorCode) -> Self {
        let text = code.default_message();
        Self::new(code, text)
    }

    /// Returns the error with `resource` replaced.
    pub fn with_resource(self, resource: impl Into<String>) -> Self {
        Self {
            resource: resource.into(),
            ..self
        }
    }

    /// Returns the error with `request_id` replaced.
    pub fn with_request_id(self, request_id: impl Into<String>) -> Self {
        Self {
            request_id: request_id.into(),
            ..self
        }
    }

    /// HTTP status of the underlying code.
    pub fn http_status(&self) -> u16 {
        let Self { code, .. } = self;
        code.http_status()
    }

    /// Renders the error as an XML document with `Code`, `Message`,
    /// `Resource` and `RequestId` elements; the three string fields are
    /// XML-escaped, the code is written as-is.
    pub fn to_xml(&self) -> String {
        let message = xml_escape(&self.message);
        let resource = xml_escape(&self.resource);
        let request_id = xml_escape(&self.request_id);
        format!(
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\
             <Error>\
             <Code>{}</Code>\
             <Message>{}</Message>\
             <Resource>{}</Resource>\
             <RequestId>{}</RequestId>\
             </Error>",
            self.code.as_str(),
            message,
            resource,
            request_id,
        )
    }
}

impl fmt::Display for S3Error {
    /// Formats as `<code>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { code, message, .. } = self;
        write!(f, "{}: {}", code, message)
    }
}

impl std::error::Error for S3Error {}
/// Replaces the five XML special characters (`&`, `<`, `>`, `"`, `'`) with
/// their predefined entities; every other character is copied unchanged.
fn xml_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}
// Unit tests for the S3 error types.
#[cfg(test)]
mod tests {
use super::*;
// Spot-checks the status and name tables.
#[test]
fn test_error_codes() {
assert_eq!(S3ErrorCode::NoSuchKey.http_status(), 404);
assert_eq!(S3ErrorCode::AccessDenied.http_status(), 403);
assert_eq!(S3ErrorCode::NoSuchBucket.as_str(), "NoSuchBucket");
}
// Code, resource and request id all appear in the rendered XML.
#[test]
fn test_error_to_xml() {
let err = S3Error::from_code(S3ErrorCode::NoSuchKey)
.with_resource("/test-bucket/test-key")
.with_request_id("abc123");
let xml = err.to_xml();
assert!(xml.contains("<Code>NoSuchKey</Code>"));
assert!(xml.contains("<Resource>/test-bucket/test-key</Resource>"));
assert!(xml.contains("<RequestId>abc123</RequestId>"));
}
// Special characters in the message and resource are escaped in the XML.
#[test]
fn test_xml_escape() {
let err = S3Error::new(S3ErrorCode::InvalidArgument, "key <test> & \"value\"")
.with_resource("/bucket/key&amp");
let xml = err.to_xml();
assert!(xml.contains("&lt;test&gt;"));
assert!(xml.contains("&amp;"));
}
}

View File

@@ -0,0 +1,3 @@
pub mod constants;
pub mod error;
pub mod types;

View File

@@ -0,0 +1,176 @@
use std::collections::HashMap;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
/// Metadata record for a single object; (de)serializable with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectMeta {
pub key: String,
/// Object size (presumably in bytes — confirm against producers).
pub size: u64,
pub last_modified: DateTime<Utc>,
pub etag: Option<String>,
pub content_type: Option<String>,
pub storage_class: Option<String>,
/// Free-form string key/value metadata.
pub metadata: HashMap<String, String>,
}
impl ObjectMeta {
/// Creates a record with the given key, size and modification time.
/// `etag` and `content_type` start as `None`, `storage_class` as
/// `Some("STANDARD")`, and `metadata` empty.
pub fn new(key: String, size: u64, last_modified: DateTime<Utc>) -> Self {
Self {
key,
size,
last_modified,
etag: None,
content_type: None,
storage_class: Some("STANDARD".to_string()),
metadata: HashMap::new(),
}
}
}
/// Name and creation timestamp of a bucket; (de)serializable with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BucketMeta {
pub name: String,
pub creation_date: DateTime<Utc>,
}
/// Object and byte counters for a bucket, split into current objects and
/// versions. All counters default to zero.
#[derive(Debug, Clone, Default)]
pub struct BucketStats {
    pub objects: u64,
    pub bytes: u64,
    pub version_count: u64,
    pub version_bytes: u64,
}

impl BucketStats {
    /// Sum of `objects` and `version_count`.
    pub fn total_objects(&self) -> u64 {
        let Self {
            objects,
            version_count,
            ..
        } = *self;
        objects + version_count
    }

    /// Sum of `bytes` and `version_bytes`.
    pub fn total_bytes(&self) -> u64 {
        let Self {
            bytes,
            version_bytes,
            ..
        } = *self;
        bytes + version_bytes
    }
}
/// One page of a flat object listing.
#[derive(Debug, Clone)]
pub struct ListObjectsResult {
pub objects: Vec<ObjectMeta>,
/// Presumably set when more results remain beyond this page — confirm against the producer.
pub is_truncated: bool,
pub next_continuation_token: Option<String>,
}
/// One page of a listing that carries both objects and common prefixes.
#[derive(Debug, Clone)]
pub struct ShallowListResult {
pub objects: Vec<ObjectMeta>,
pub common_prefixes: Vec<String>,
/// Presumably set when more results remain beyond this page — confirm against the producer.
pub is_truncated: bool,
pub next_continuation_token: Option<String>,
}
/// Parameters for a flat object listing.
// NOTE(review): the derived `Default` yields `max_keys == 0`; confirm callers
// substitute a real limit before use.
#[derive(Debug, Clone, Default)]
pub struct ListParams {
pub max_keys: usize,
pub continuation_token: Option<String>,
pub prefix: Option<String>,
pub start_after: Option<String>,
}
/// Parameters for a prefix/delimiter listing.
// NOTE(review): the derived `Default` yields empty `prefix`/`delimiter` and
// `max_keys == 0`; confirm callers fill these in.
#[derive(Debug, Clone, Default)]
pub struct ShallowListParams {
pub prefix: String,
pub delimiter: String,
pub max_keys: usize,
pub continuation_token: Option<String>,
}
/// Metadata of one multipart-upload part; (de)serializable with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PartMeta {
pub part_number: u32,
pub etag: String,
pub size: u64,
pub last_modified: Option<DateTime<Utc>>,
}
/// Part number and ETag pair identifying a part.
#[derive(Debug, Clone)]
pub struct PartInfo {
pub part_number: u32,
pub etag: String,
}
/// Upload id, object key and initiation time of a multipart upload;
/// (de)serializable with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MultipartUploadInfo {
pub upload_id: String,
pub key: String,
pub initiated: DateTime<Utc>,
}
/// Description of one object version; (de)serializable with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VersionInfo {
pub version_id: String,
pub key: String,
pub size: u64,
pub last_modified: DateTime<Utc>,
pub etag: Option<String>,
pub is_latest: bool,
}
/// A key/value tag; (de)serializable with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Tag {
pub key: String,
pub value: String,
}
/// Per-bucket configuration document.
///
/// Every field is marked `#[serde(default)]`, so a document missing any of
/// them still deserializes: booleans become `false`, vectors empty, options
/// `None`. Most sections are kept as untyped `serde_json::Value`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct BucketConfig {
#[serde(default)]
pub versioning_enabled: bool,
#[serde(default)]
pub tags: Vec<Tag>,
#[serde(default)]
pub cors: Option<serde_json::Value>,
#[serde(default)]
pub encryption: Option<serde_json::Value>,
#[serde(default)]
pub lifecycle: Option<serde_json::Value>,
#[serde(default)]
pub website: Option<serde_json::Value>,
#[serde(default)]
pub quota: Option<QuotaConfig>,
#[serde(default)]
pub acl: Option<serde_json::Value>,
#[serde(default)]
pub notification: Option<serde_json::Value>,
#[serde(default)]
pub logging: Option<serde_json::Value>,
#[serde(default)]
pub object_lock: Option<serde_json::Value>,
#[serde(default)]
pub policy: Option<serde_json::Value>,
#[serde(default)]
pub replication: Option<serde_json::Value>,
}
/// Optional byte and object limits for a bucket; (de)serializable with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuotaConfig {
/// Byte limit; `None` presumably means unlimited — confirm against the enforcer.
pub max_bytes: Option<u64>,
/// Object-count limit; `None` presumably means unlimited — confirm against the enforcer.
pub max_objects: Option<u64>,
}
/// Identity fields of a caller: access key, user id, display name and an
/// admin flag.
#[derive(Debug, Clone)]
pub struct Principal {
pub access_key: String,
pub user_id: String,
pub display_name: String,
pub is_admin: bool,
}
impl Principal {
/// Builds a `Principal` from its four fields, stored unchanged.
pub fn new(access_key: String, user_id: String, display_name: String, is_admin: bool) -> Self {
Self {
access_key,
user_id,
display_name,
is_admin,
}
}
}

View File

@@ -0,0 +1,24 @@
[package]
name = "myfsio-crypto"
version = "0.1.0"
edition = "2021"
[dependencies]
myfsio-common = { path = "../myfsio-common" }
md-5 = { workspace = true }
sha2 = { workspace = true }
hex = { workspace = true }
aes-gcm = { workspace = true }
hkdf = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }
base64 = { workspace = true }
rand = "0.8"
[dev-dependencies]
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
tempfile = "3"

View File

@@ -0,0 +1,238 @@
use aes_gcm::aead::Aead;
use aes_gcm::{Aes256Gcm, KeyInit, Nonce};
use hkdf::Hkdf;
use sha2::Sha256;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom, Write};
use std::path::Path;
use thiserror::Error;
/// Plaintext chunk size used when the caller passes no `chunk_size`: 65536 bytes.
const DEFAULT_CHUNK_SIZE: usize = 65536;
/// Width in bytes of the big-endian `u32` fields read by the decryptor
/// (the chunk-count header and each chunk's length prefix).
const HEADER_SIZE: usize = 4;
/// Errors returned by the chunked AES-256-GCM helpers.
#[derive(Debug, Error)]
pub enum CryptoError {
/// Underlying I/O failure; converts automatically from `std::io::Error`.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// The supplied key was not 32 bytes; carries the actual length.
#[error("Invalid key size: expected 32 bytes, got {0}")]
InvalidKeySize(usize),
/// The supplied nonce was not 12 bytes; carries the actual length.
#[error("Invalid nonce size: expected 12 bytes, got {0}")]
InvalidNonceSize(usize),
/// Encryption failed; carries a description.
#[error("Encryption failed: {0}")]
EncryptionFailed(String),
/// Decryption failed; carries the index of the failing chunk.
#[error("Decryption failed at chunk {0}")]
DecryptionFailed(u32),
/// HKDF expansion failed; carries a description.
#[error("HKDF expand failed: {0}")]
HkdfFailed(String),
}
/// Reads from `reader` until `buf` is full or end of input is reached and
/// returns the number of bytes placed in `buf`.
///
/// Reads that fail with `ErrorKind::Interrupted` are retried; any other error
/// is returned to the caller.
fn read_exact_chunk(reader: &mut impl Read, buf: &mut [u8]) -> std::io::Result<usize> {
    let mut offset = 0usize;
    while offset < buf.len() {
        let got = match reader.read(&mut buf[offset..]) {
            Ok(count) => count,
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        };
        if got == 0 {
            // End of input before the buffer was filled.
            break;
        }
        offset += got;
    }
    Ok(offset)
}
/// Derives the 12-byte nonce for chunk `chunk_index` with HKDF-SHA256.
///
/// `base_nonce` is passed as the HKDF salt, the literal `b"chunk_nonce"` as
/// the input key material, and the big-endian chunk index as the `info`
/// parameter of the expand step.
fn derive_chunk_nonce(base_nonce: &[u8], chunk_index: u32) -> Result<[u8; 12], CryptoError> {
    let index_bytes = chunk_index.to_be_bytes();
    let mut nonce = [0u8; 12];
    match Hkdf::<Sha256>::new(Some(base_nonce), b"chunk_nonce").expand(&index_bytes, &mut nonce) {
        Ok(()) => Ok(nonce),
        Err(e) => Err(CryptoError::HkdfFailed(e.to_string())),
    }
}
pub fn encrypt_stream_chunked(
input_path: &Path,
output_path: &Path,
key: &[u8],
base_nonce: &[u8],
chunk_size: Option<usize>,
) -> Result<u32, CryptoError> {
if key.len() != 32 {
return Err(CryptoError::InvalidKeySize(key.len()));
}
if base_nonce.len() != 12 {
return Err(CryptoError::InvalidNonceSize(base_nonce.len()));
}
let chunk_size = chunk_size.unwrap_or(DEFAULT_CHUNK_SIZE);
let key_arr: [u8; 32] = key.try_into().unwrap();
let nonce_arr: [u8; 12] = base_nonce.try_into().unwrap();
let cipher = Aes256Gcm::new(&key_arr.into());
let mut infile = File::open(input_path)?;
let mut outfile = File::create(output_path)?;
outfile.write_all(&[0u8; 4])?;
let mut buf = vec![0u8; chunk_size];
let mut chunk_index: u32 = 0;
loop {
let n = read_exact_chunk(&mut infile, &mut buf)?;
if n == 0 {
break;
}
let nonce_bytes = derive_chunk_nonce(&nonce_arr, chunk_index)?;
let nonce = Nonce::from_slice(&nonce_bytes);
let encrypted = cipher
.encrypt(nonce, &buf[..n])
.map_err(|e| CryptoError::EncryptionFailed(e.to_string()))?;
let size = encrypted.len() as u32;
outfile.write_all(&size.to_be_bytes())?;
outfile.write_all(&encrypted)?;
chunk_index += 1;
}
outfile.seek(SeekFrom::Start(0))?;
outfile.write_all(&chunk_index.to_be_bytes())?;
Ok(chunk_index)
}
pub fn decrypt_stream_chunked(
input_path: &Path,
output_path: &Path,
key: &[u8],
base_nonce: &[u8],
) -> Result<u32, CryptoError> {
if key.len() != 32 {
return Err(CryptoError::InvalidKeySize(key.len()));
}
if base_nonce.len() != 12 {
return Err(CryptoError::InvalidNonceSize(base_nonce.len()));
}
let key_arr: [u8; 32] = key.try_into().unwrap();
let nonce_arr: [u8; 12] = base_nonce.try_into().unwrap();
let cipher = Aes256Gcm::new(&key_arr.into());
let mut infile = File::open(input_path)?;
let mut outfile = File::create(output_path)?;
let mut header = [0u8; HEADER_SIZE];
infile.read_exact(&mut header)?;
let chunk_count = u32::from_be_bytes(header);
let mut size_buf = [0u8; HEADER_SIZE];
for chunk_index in 0..chunk_count {
infile.read_exact(&mut size_buf)?;
let chunk_size = u32::from_be_bytes(size_buf) as usize;
let mut encrypted = vec![0u8; chunk_size];
infile.read_exact(&mut encrypted)?;
let nonce_bytes = derive_chunk_nonce(&nonce_arr, chunk_index)?;
let nonce = Nonce::from_slice(&nonce_bytes);
let decrypted = cipher
.decrypt(nonce, encrypted.as_ref())
.map_err(|_| CryptoError::DecryptionFailed(chunk_index))?;
outfile.write_all(&decrypted)?;
}
Ok(chunk_count)
}
pub async fn encrypt_stream_chunked_async(
input_path: &Path,
output_path: &Path,
key: &[u8],
base_nonce: &[u8],
chunk_size: Option<usize>,
) -> Result<u32, CryptoError> {
let input_path = input_path.to_owned();
let output_path = output_path.to_owned();
let key = key.to_vec();
let base_nonce = base_nonce.to_vec();
tokio::task::spawn_blocking(move || {
encrypt_stream_chunked(&input_path, &output_path, &key, &base_nonce, chunk_size)
})
.await
.map_err(|e| CryptoError::Io(std::io::Error::new(std::io::ErrorKind::Other, e)))?
}
pub async fn decrypt_stream_chunked_async(
input_path: &Path,
output_path: &Path,
key: &[u8],
base_nonce: &[u8],
) -> Result<u32, CryptoError> {
let input_path = input_path.to_owned();
let output_path = output_path.to_owned();
let key = key.to_vec();
let base_nonce = base_nonce.to_vec();
tokio::task::spawn_blocking(move || {
decrypt_stream_chunked(&input_path, &output_path, &key, &base_nonce)
})
.await
.map_err(|e| CryptoError::Io(std::io::Error::new(std::io::ErrorKind::Other, e)))?
}
// Unit tests for the chunked AES-256-GCM file helpers.
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write as IoWrite;
// Encrypting with 16-byte chunks and decrypting restores the original bytes,
// and both directions report the same chunk count.
#[test]
fn test_encrypt_decrypt_roundtrip() {
let dir = tempfile::tempdir().unwrap();
let input = dir.path().join("input.bin");
let encrypted = dir.path().join("encrypted.bin");
let decrypted = dir.path().join("decrypted.bin");
let data = b"Hello, this is a test of AES-256-GCM chunked encryption!";
std::fs::File::create(&input).unwrap().write_all(data).unwrap();
let key = [0x42u8; 32];
let nonce = [0x01u8; 12];
let chunks = encrypt_stream_chunked(&input, &encrypted, &key, &nonce, Some(16)).unwrap();
assert!(chunks > 0);
let chunks2 = decrypt_stream_chunked(&encrypted, &decrypted, &key, &nonce).unwrap();
assert_eq!(chunks, chunks2);
let result = std::fs::read(&decrypted).unwrap();
assert_eq!(result, data);
}
// A 16-byte key is rejected with `InvalidKeySize(16)`.
#[test]
fn test_invalid_key_size() {
let dir = tempfile::tempdir().unwrap();
let input = dir.path().join("input.bin");
std::fs::File::create(&input).unwrap().write_all(b"test").unwrap();
let result = encrypt_stream_chunked(&input, &dir.path().join("out"), &[0u8; 16], &[0u8; 12], None);
assert!(matches!(result, Err(CryptoError::InvalidKeySize(16))));
}
// Decrypting with a different key fails with `DecryptionFailed`.
#[test]
fn test_wrong_key_fails_decrypt() {
let dir = tempfile::tempdir().unwrap();
let input = dir.path().join("input.bin");
let encrypted = dir.path().join("encrypted.bin");
let decrypted = dir.path().join("decrypted.bin");
std::fs::File::create(&input).unwrap().write_all(b"secret data").unwrap();
let key = [0x42u8; 32];
let nonce = [0x01u8; 12];
encrypt_stream_chunked(&input, &encrypted, &key, &nonce, None).unwrap();
let wrong_key = [0x43u8; 32];
let result = decrypt_stream_chunked(&encrypted, &decrypted, &wrong_key, &nonce);
assert!(matches!(result, Err(CryptoError::DecryptionFailed(_))));
}
}

View File

@@ -0,0 +1,375 @@
use base64::engine::general_purpose::STANDARD as B64;
use base64::Engine;
use rand::RngCore;
use std::collections::HashMap;
use std::path::Path;
use crate::aes_gcm::{
encrypt_stream_chunked, decrypt_stream_chunked, CryptoError,
};
use crate::kms::KmsService;
/// Server-side encryption mode selected for an object.
#[derive(Debug, Clone, PartialEq)]
pub enum SseAlgorithm {
    Aes256,
    AwsKms,
    CustomerProvided,
}

impl SseAlgorithm {
    /// Algorithm label for this mode: `"aws:kms"` for `AwsKms`, `"AES256"`
    /// for both `Aes256` and `CustomerProvided`.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::AwsKms => "aws:kms",
            Self::Aes256 | Self::CustomerProvided => "AES256",
        }
    }
}
/// Per-request encryption parameters passed to `EncryptionService::encrypt_object`.
// NOTE(review): the derived `Debug` prints `customer_key` bytes; confirm this
// type is never formatted into logs.
#[derive(Debug, Clone)]
pub struct EncryptionContext {
/// Encryption mode to apply.
pub algorithm: SseAlgorithm,
/// KMS key id; required by `encrypt_object` when `algorithm` is `AwsKms`.
pub kms_key_id: Option<String>,
/// Caller-supplied key; required (32 bytes) by `encrypt_object` when
/// `algorithm` is `CustomerProvided`.
pub customer_key: Option<Vec<u8>>,
}
/// Encryption details stored alongside an object as string metadata.
#[derive(Debug, Clone)]
pub struct EncryptionMetadata {
    pub algorithm: String,
    pub nonce: String,
    pub encrypted_data_key: Option<String>,
    pub kms_key_id: Option<String>,
}

impl EncryptionMetadata {
    /// Serializes the record into metadata entries. `algorithm` and `nonce`
    /// are always present; the data key and key id only when set.
    pub fn to_metadata_map(&self) -> HashMap<String, String> {
        let candidates = [
            ("x-amz-server-side-encryption", Some(&self.algorithm)),
            ("x-amz-encryption-nonce", Some(&self.nonce)),
            ("x-amz-encrypted-data-key", self.encrypted_data_key.as_ref()),
            ("x-amz-encryption-key-id", self.kms_key_id.as_ref()),
        ];
        candidates
            .iter()
            .filter_map(|(name, value)| value.map(|v| ((*name).to_string(), v.clone())))
            .collect()
    }

    /// Rebuilds the record from metadata entries; returns `None` when the
    /// algorithm or nonce entry is missing.
    pub fn from_metadata(meta: &HashMap<String, String>) -> Option<Self> {
        let lookup = |name: &str| meta.get(name).cloned();
        Some(Self {
            algorithm: lookup("x-amz-server-side-encryption")?,
            nonce: lookup("x-amz-encryption-nonce")?,
            encrypted_data_key: lookup("x-amz-encrypted-data-key"),
            kms_key_id: lookup("x-amz-encryption-key-id"),
        })
    }

    /// Reports whether the algorithm entry is present.
    pub fn is_encrypted(meta: &HashMap<String, String>) -> bool {
        meta.get("x-amz-server-side-encryption").is_some()
    }

    /// Removes all four encryption entries from `meta`.
    pub fn clean_metadata(meta: &mut HashMap<String, String>) {
        let names = [
            "x-amz-server-side-encryption",
            "x-amz-encryption-nonce",
            "x-amz-encrypted-data-key",
            "x-amz-encryption-key-id",
        ];
        for name in names.iter() {
            meta.remove(*name);
        }
    }
}
/// Encrypts and decrypts object files, wrapping per-object data keys either
/// with a local master key or through an optional KMS service.
pub struct EncryptionService {
/// 32-byte key used by `wrap_data_key` / `unwrap_data_key`.
master_key: [u8; 32],
/// KMS backend; needed only for the `aws:kms` paths.
kms: Option<std::sync::Arc<KmsService>>,
}
impl EncryptionService {
/// Creates a service from a master key and an optional KMS handle.
pub fn new(master_key: [u8; 32], kms: Option<std::sync::Arc<KmsService>>) -> Self {
Self { master_key, kms }
}
/// Generates a random 32-byte data key and a random 12-byte nonce using the
/// thread-local RNG.
pub fn generate_data_key(&self) -> ([u8; 32], [u8; 12]) {
    let mut rng = rand::thread_rng();
    let (mut key, mut iv) = ([0u8; 32], [0u8; 12]);
    rng.fill_bytes(&mut key);
    rng.fill_bytes(&mut iv);
    (key, iv)
}
/// Encrypts `data_key` under the master key with AES-256-GCM and a fresh
/// random 12-byte nonce.
///
/// Returns base64 of `nonce || ciphertext`.
pub fn wrap_data_key(&self, data_key: &[u8; 32]) -> Result<String, CryptoError> {
    use aes_gcm::aead::Aead;
    use aes_gcm::{Aes256Gcm, KeyInit, Nonce};

    let cipher = Aes256Gcm::new((&self.master_key).into());
    let mut iv = [0u8; 12];
    rand::thread_rng().fill_bytes(&mut iv);
    let sealed = cipher
        .encrypt(Nonce::from_slice(&iv), data_key.as_slice())
        .map_err(|e| CryptoError::EncryptionFailed(e.to_string()))?;

    // Prefix the nonce so `unwrap_data_key` can split it off again.
    let framed = [iv.as_slice(), sealed.as_slice()].concat();
    Ok(B64.encode(&framed))
}
/// Reverses `wrap_data_key`: base64-decodes the input, splits off the
/// 12-byte nonce and decrypts the rest under the master key.
///
/// # Errors
/// * `EncryptionFailed` when the input is not valid base64 or is shorter
///   than 12 bytes.
/// * `DecryptionFailed(0)` when authentication fails.
/// * `InvalidKeySize` when the plaintext is not 32 bytes.
pub fn unwrap_data_key(&self, wrapped_b64: &str) -> Result<[u8; 32], CryptoError> {
    use aes_gcm::aead::Aead;
    use aes_gcm::{Aes256Gcm, KeyInit, Nonce};

    let framed = B64.decode(wrapped_b64).map_err(|e| {
        CryptoError::EncryptionFailed(format!("Bad wrapped key encoding: {}", e))
    })?;
    if framed.len() < 12 {
        return Err(CryptoError::EncryptionFailed(
            "Wrapped key too short".to_string(),
        ));
    }
    let (iv, sealed) = framed.split_at(12);

    let cipher = Aes256Gcm::new((&self.master_key).into());
    let opened = cipher
        .decrypt(Nonce::from_slice(iv), sealed)
        .map_err(|_| CryptoError::DecryptionFailed(0))?;

    <[u8; 32]>::try_from(opened.as_slice())
        .map_err(|_| CryptoError::InvalidKeySize(opened.len()))
}
/// Encrypts `input_path` into `output_path` according to `ctx` and returns
/// the metadata needed to decrypt it later.
///
/// A fresh data key and nonce are generated for every call. What encrypts
/// the content depends on the algorithm:
/// * `Aes256` – the data key, stored wrapped under the master key.
/// * `AwsKms` – the data key, stored encrypted by the KMS key `ctx.kms_key_id`.
/// * `CustomerProvided` – `ctx.customer_key` (must be 32 bytes); no key is stored.
///
/// The file encryption itself runs on tokio's blocking pool.
pub async fn encrypt_object(
    &self,
    input_path: &Path,
    output_path: &Path,
    ctx: &EncryptionContext,
) -> Result<EncryptionMetadata, CryptoError> {
    let (data_key, nonce) = self.generate_data_key();

    let (content_key, encrypted_data_key, kms_key_id) = match ctx.algorithm {
        SseAlgorithm::Aes256 => {
            let wrapped = self.wrap_data_key(&data_key)?;
            (data_key, Some(wrapped), None)
        }
        SseAlgorithm::AwsKms => {
            let kms = self
                .kms
                .as_ref()
                .ok_or_else(|| CryptoError::EncryptionFailed("KMS not available".into()))?;
            let kid = ctx
                .kms_key_id
                .as_ref()
                .ok_or_else(|| CryptoError::EncryptionFailed("No KMS key ID".into()))?;
            let ciphertext = kms.encrypt_data(kid, &data_key).await?;
            (data_key, Some(B64.encode(&ciphertext)), Some(kid.clone()))
        }
        SseAlgorithm::CustomerProvided => {
            let ck = ctx.customer_key.as_ref().ok_or_else(|| {
                CryptoError::EncryptionFailed("No customer key provided".into())
            })?;
            let supplied = <[u8; 32]>::try_from(ck.as_slice())
                .map_err(|_| CryptoError::InvalidKeySize(ck.len()))?;
            (supplied, None, None)
        }
    };

    let src = input_path.to_owned();
    let dst = output_path.to_owned();
    let job = tokio::task::spawn_blocking(move || {
        encrypt_stream_chunked(&src, &dst, &content_key, &nonce, None)
    });
    job.await
        .map_err(|e| CryptoError::Io(std::io::Error::new(std::io::ErrorKind::Other, e)))??;

    Ok(EncryptionMetadata {
        algorithm: ctx.algorithm.as_str().to_string(),
        nonce: B64.encode(nonce),
        encrypted_data_key,
        kms_key_id,
    })
}
pub async fn decrypt_object(
&self,
input_path: &Path,
output_path: &Path,
enc_meta: &EncryptionMetadata,
customer_key: Option<&[u8]>,
) -> Result<(), CryptoError> {
let nonce_bytes = B64.decode(&enc_meta.nonce).map_err(|e| {
CryptoError::EncryptionFailed(format!("Bad nonce encoding: {}", e))
})?;
if nonce_bytes.len() != 12 {
return Err(CryptoError::InvalidNonceSize(nonce_bytes.len()));
}
let data_key: [u8; 32] = if let Some(ck) = customer_key {
if ck.len() != 32 {
return Err(CryptoError::InvalidKeySize(ck.len()));
}
let mut k = [0u8; 32];
k.copy_from_slice(ck);
k
} else if enc_meta.algorithm == "aws:kms" {
let kms = self
.kms
.as_ref()
.ok_or_else(|| CryptoError::EncryptionFailed("KMS not available".into()))?;
let kid = enc_meta
.kms_key_id
.as_ref()
.ok_or_else(|| CryptoError::EncryptionFailed("No KMS key ID in metadata".into()))?;
let encrypted_dk = enc_meta.encrypted_data_key.as_ref().ok_or_else(|| {
CryptoError::EncryptionFailed("No encrypted data key in metadata".into())
})?;
let ct = B64.decode(encrypted_dk).map_err(|e| {
CryptoError::EncryptionFailed(format!("Bad data key encoding: {}", e))
})?;
let dk = kms.decrypt_data(kid, &ct).await?;
if dk.len() != 32 {
return Err(CryptoError::InvalidKeySize(dk.len()));
}
let mut k = [0u8; 32];
k.copy_from_slice(&dk);
k
} else {
let wrapped = enc_meta.encrypted_data_key.as_ref().ok_or_else(|| {
CryptoError::EncryptionFailed("No encrypted data key in metadata".into())
})?;
self.unwrap_data_key(wrapped)?
};
let ip = input_path.to_owned();
let op = output_path.to_owned();
let nb: [u8; 12] = nonce_bytes.try_into().unwrap();
tokio::task::spawn_blocking(move || {
decrypt_stream_chunked(&ip, &op, &data_key, &nb)
})
.await
.map_err(|e| CryptoError::Io(std::io::Error::new(std::io::ErrorKind::Other, e)))??;
Ok(())
}
}
// Unit tests for the encryption service and its metadata helpers.
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
// Fixed master key shared by the tests below.
fn test_master_key() -> [u8; 32] {
[0x42u8; 32]
}
// Wrapping then unwrapping a data key returns the original key.
#[test]
fn test_wrap_unwrap_data_key() {
let svc = EncryptionService::new(test_master_key(), None);
let dk = [0xAAu8; 32];
let wrapped = svc.wrap_data_key(&dk).unwrap();
let unwrapped = svc.unwrap_data_key(&wrapped).unwrap();
assert_eq!(dk, unwrapped);
}
// `Aes256` mode: metadata carries a wrapped data key and decryption without
// a customer key restores the plaintext.
#[tokio::test]
async fn test_encrypt_decrypt_object_sse_s3() {
let dir = tempfile::tempdir().unwrap();
let input = dir.path().join("plain.bin");
let encrypted = dir.path().join("enc.bin");
let decrypted = dir.path().join("dec.bin");
let data = b"SSE-S3 encrypted content for testing!";
std::fs::File::create(&input).unwrap().write_all(data).unwrap();
let svc = EncryptionService::new(test_master_key(), None);
let ctx = EncryptionContext {
algorithm: SseAlgorithm::Aes256,
kms_key_id: None,
customer_key: None,
};
let meta = svc.encrypt_object(&input, &encrypted, &ctx).await.unwrap();
assert_eq!(meta.algorithm, "AES256");
assert!(meta.encrypted_data_key.is_some());
svc.decrypt_object(&encrypted, &decrypted, &meta, None)
.await
.unwrap();
let result = std::fs::read(&decrypted).unwrap();
assert_eq!(result, data);
}
// `CustomerProvided` mode: no data key is stored and decryption needs the
// same customer key.
#[tokio::test]
async fn test_encrypt_decrypt_object_sse_c() {
let dir = tempfile::tempdir().unwrap();
let input = dir.path().join("plain.bin");
let encrypted = dir.path().join("enc.bin");
let decrypted = dir.path().join("dec.bin");
let data = b"SSE-C encrypted content!";
std::fs::File::create(&input).unwrap().write_all(data).unwrap();
let customer_key = [0xBBu8; 32];
let svc = EncryptionService::new(test_master_key(), None);
let ctx = EncryptionContext {
algorithm: SseAlgorithm::CustomerProvided,
kms_key_id: None,
customer_key: Some(customer_key.to_vec()),
};
let meta = svc.encrypt_object(&input, &encrypted, &ctx).await.unwrap();
assert!(meta.encrypted_data_key.is_none());
svc.decrypt_object(&encrypted, &decrypted, &meta, Some(&customer_key))
.await
.unwrap();
let result = std::fs::read(&decrypted).unwrap();
assert_eq!(result, data);
}
// Metadata survives a trip through its map representation.
#[test]
fn test_encryption_metadata_roundtrip() {
let meta = EncryptionMetadata {
algorithm: "AES256".to_string(),
nonce: "dGVzdG5vbmNlMTI=".to_string(),
encrypted_data_key: Some("c29tZWtleQ==".to_string()),
kms_key_id: None,
};
let map = meta.to_metadata_map();
let restored = EncryptionMetadata::from_metadata(&map).unwrap();
assert_eq!(restored.algorithm, "AES256");
assert_eq!(restored.nonce, meta.nonce);
assert_eq!(restored.encrypted_data_key, meta.encrypted_data_key);
}
// `is_encrypted` depends only on the presence of the algorithm entry.
#[test]
fn test_is_encrypted() {
let mut meta = HashMap::new();
assert!(!EncryptionMetadata::is_encrypted(&meta));
meta.insert("x-amz-server-side-encryption".to_string(), "AES256".to_string());
assert!(EncryptionMetadata::is_encrypted(&meta));
}
}

View File

@@ -0,0 +1,132 @@
use md5::{Digest, Md5};
use sha2::Sha256;
use std::io::Read;
use std::path::Path;
/// Size in bytes of the read buffer the file-hashing functions allocate (64 KiB).
const CHUNK_SIZE: usize = 65536;
/// Computes the MD5 digest of the file at `path`, returned as lowercase hex.
///
/// The file is streamed through a `CHUNK_SIZE`-byte buffer, so memory use does
/// not grow with the file size.
///
/// # Errors
///
/// Returns the I/O error from opening or reading the file. Reads that fail with
/// [`std::io::ErrorKind::Interrupted`] are retried rather than reported, as the
/// `Read::read` contract recommends.
pub fn md5_file(path: &Path) -> std::io::Result<String> {
    let mut file = std::fs::File::open(path)?;
    let mut hasher = Md5::new();
    let mut buf = vec![0u8; CHUNK_SIZE];
    loop {
        match file.read(&mut buf) {
            // A zero-length read signals end of file.
            Ok(0) => break,
            Ok(n) => hasher.update(&buf[..n]),
            // An interrupted read transferred no data; retrying is always safe.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(format!("{:x}", hasher.finalize()))
}
/// Returns the MD5 digest of `data` as a lowercase hex string.
pub fn md5_bytes(data: &[u8]) -> String {
    // One-shot digest: equivalent to new() + update(data) + finalize().
    let digest = Md5::digest(data);
    format!("{:x}", digest)
}
/// Computes the SHA-256 digest of the file at `path`, returned as lowercase hex.
///
/// The file is streamed through a `CHUNK_SIZE`-byte buffer, so memory use does
/// not grow with the file size.
///
/// # Errors
///
/// Returns the I/O error from opening or reading the file. Reads that fail with
/// [`std::io::ErrorKind::Interrupted`] are retried rather than reported, as the
/// `Read::read` contract recommends.
pub fn sha256_file(path: &Path) -> std::io::Result<String> {
    let mut file = std::fs::File::open(path)?;
    let mut hasher = Sha256::new();
    let mut buf = vec![0u8; CHUNK_SIZE];
    loop {
        match file.read(&mut buf) {
            // A zero-length read signals end of file.
            Ok(0) => break,
            Ok(n) => hasher.update(&buf[..n]),
            // An interrupted read transferred no data; retrying is always safe.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(format!("{:x}", hasher.finalize()))
}
/// Returns the SHA-256 digest of `data` as a lowercase hex string.
pub fn sha256_bytes(data: &[u8]) -> String {
    // One-shot digest: equivalent to new() + update(data) + finalize().
    let digest = Sha256::digest(data);
    format!("{:x}", digest)
}
/// Computes the MD5 and SHA-256 digests of the file at `path` in a single pass.
///
/// Returns `(md5_hex, sha256_hex)`, both lowercase hex. Every chunk read from
/// the file is fed to both hashers, so the file is only read once.
///
/// # Errors
///
/// Returns the I/O error from opening or reading the file. Reads that fail with
/// [`std::io::ErrorKind::Interrupted`] are retried rather than reported, as the
/// `Read::read` contract recommends.
pub fn md5_sha256_file(path: &Path) -> std::io::Result<(String, String)> {
    let mut file = std::fs::File::open(path)?;
    let mut md5_hasher = Md5::new();
    let mut sha_hasher = Sha256::new();
    let mut buf = vec![0u8; CHUNK_SIZE];
    loop {
        match file.read(&mut buf) {
            // A zero-length read signals end of file.
            Ok(0) => break,
            Ok(n) => {
                let chunk = &buf[..n];
                md5_hasher.update(chunk);
                sha_hasher.update(chunk);
            }
            // An interrupted read transferred no data; retrying is always safe.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok((
        format!("{:x}", md5_hasher.finalize()),
        format!("{:x}", sha_hasher.finalize()),
    ))
}
/// Async wrapper around [`md5_file`] that runs the hashing on Tokio's blocking pool.
///
/// A failure to join the blocking task is reported as an `ErrorKind::Other` I/O error.
pub async fn md5_file_async(path: &Path) -> std::io::Result<String> {
    let owned = path.to_path_buf();
    match tokio::task::spawn_blocking(move || md5_file(&owned)).await {
        Ok(outcome) => outcome,
        Err(join_err) => Err(std::io::Error::new(std::io::ErrorKind::Other, join_err)),
    }
}
/// Async wrapper around [`sha256_file`] that runs the hashing on Tokio's blocking pool.
///
/// A failure to join the blocking task is reported as an `ErrorKind::Other` I/O error.
pub async fn sha256_file_async(path: &Path) -> std::io::Result<String> {
    let owned = path.to_path_buf();
    match tokio::task::spawn_blocking(move || sha256_file(&owned)).await {
        Ok(outcome) => outcome,
        Err(join_err) => Err(std::io::Error::new(std::io::ErrorKind::Other, join_err)),
    }
}
/// Async wrapper around [`md5_sha256_file`] that runs the hashing on Tokio's blocking pool.
///
/// A failure to join the blocking task is reported as an `ErrorKind::Other` I/O error.
pub async fn md5_sha256_file_async(path: &Path) -> std::io::Result<(String, String)> {
    let owned = path.to_path_buf();
    match tokio::task::spawn_blocking(move || md5_sha256_file(&owned)).await {
        Ok(outcome) => outcome,
        Err(join_err) => Err(std::io::Error::new(std::io::ErrorKind::Other, join_err)),
    }
}
// Unit tests for the hashing helpers, checked against known digests of
// the empty input and of b"hello".
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
// MD5 of the empty input and of b"hello".
#[test]
fn test_md5_bytes() {
assert_eq!(md5_bytes(b""), "d41d8cd98f00b204e9800998ecf8427e");
assert_eq!(md5_bytes(b"hello"), "5d41402abc4b2a76b9719d911017c592");
}
// SHA-256 of b"hello".
#[test]
fn test_sha256_bytes() {
let hash = sha256_bytes(b"hello");
assert_eq!(hash, "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824");
}
// Hashing a file must give the same digest as hashing its bytes.
#[test]
fn test_md5_file() {
let mut tmp = tempfile::NamedTempFile::new().unwrap();
tmp.write_all(b"hello").unwrap();
tmp.flush().unwrap();
let hash = md5_file(tmp.path()).unwrap();
assert_eq!(hash, "5d41402abc4b2a76b9719d911017c592");
}
// The combined single-pass function must agree with both individual digests.
#[test]
fn test_md5_sha256_file() {
let mut tmp = tempfile::NamedTempFile::new().unwrap();
tmp.write_all(b"hello").unwrap();
tmp.flush().unwrap();
let (md5, sha) = md5_sha256_file(tmp.path()).unwrap();
assert_eq!(md5, "5d41402abc4b2a76b9719d911017c592");
assert_eq!(sha, "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824");
}
// The async wrapper must return the same digest as the synchronous function.
#[tokio::test]
async fn test_md5_file_async() {
let mut tmp = tempfile::NamedTempFile::new().unwrap();
tmp.write_all(b"hello").unwrap();
tmp.flush().unwrap();
let hash = md5_file_async(tmp.path()).await.unwrap();
assert_eq!(hash, "5d41402abc4b2a76b9719d911017c592");
}
}

View File

@@ -0,0 +1,453 @@
use aes_gcm::aead::Aead;
use aes_gcm::{Aes256Gcm, KeyInit, Nonce};
use base64::engine::general_purpose::STANDARD as B64;
use base64::Engine;
use chrono::{DateTime, Utc};
use rand::RngCore;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::sync::RwLock;
use crate::aes_gcm::CryptoError;
/// A single KMS key record as kept in memory and persisted to the key store.
///
/// Fields are (de)serialized under PascalCase names (`KeyId`, `Arn`,
/// `CreationDate`, ...).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
pub struct KmsKey {
    /// Unique identifier of the key.
    pub key_id: String,
    /// ARN-style identifier; lookups accept it interchangeably with `key_id`.
    pub arn: String,
    /// Free-form description supplied at creation time.
    pub description: String,
    /// Timestamp at which the key was created.
    pub creation_date: DateTime<Utc>,
    /// Whether the key may currently be used.
    pub enabled: bool,
    /// Textual state mirroring `enabled` ("Enabled" / "Disabled").
    pub key_state: String,
    /// Intended usage of the key (e.g. "ENCRYPT_DECRYPT").
    pub key_usage: String,
    /// Key specification (e.g. "SYMMETRIC_DEFAULT").
    pub key_spec: String,
    /// Base64 of the key material encrypted under the service master key.
    pub encrypted_key_material: String,
}
/// On-disk JSON document holding every KMS key; written by `KmsService::save`
/// and read back by `KmsService::new`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct KmsStore {
/// All key records, in insertion order.
keys: Vec<KmsKey>,
}
/// File-backed key management service: keeps key records in memory and
/// persists them as JSON, with key material encrypted under a master key.
pub struct KmsService {
/// Path of the JSON file the key list is persisted to.
keys_path: PathBuf,
/// 32-byte master key used to encrypt and decrypt stored key material.
master_key: Arc<RwLock<[u8; 32]>>,
/// In-memory list of key records.
keys: Arc<RwLock<Vec<KmsKey>>>,
}
impl KmsService {
/// Opens the KMS store rooted at `keys_dir`, creating the directory if needed.
///
/// Loads (or generates) the master key from `kms_master.key` and, when
/// `kms_keys.json` exists, loads the persisted key records from it.
///
/// # Errors
///
/// Returns `CryptoError::Io` for filesystem failures and
/// `CryptoError::EncryptionFailed` when the key store JSON cannot be parsed.
pub async fn new(keys_dir: &Path) -> Result<Self, CryptoError> {
    std::fs::create_dir_all(keys_dir).map_err(CryptoError::Io)?;
    let keys_path = keys_dir.join("kms_keys.json");
    let master_key = Self::load_or_create_master_key(&keys_dir.join("kms_master.key"))?;

    let keys = if !keys_path.exists() {
        // No store yet: start with an empty key list.
        Vec::new()
    } else {
        let raw = std::fs::read_to_string(&keys_path).map_err(CryptoError::Io)?;
        match serde_json::from_str::<KmsStore>(&raw) {
            Ok(store) => store.keys,
            Err(e) => {
                return Err(CryptoError::EncryptionFailed(format!("Bad KMS store: {}", e)));
            }
        }
    };

    Ok(Self {
        keys_path,
        master_key: Arc::new(RwLock::new(master_key)),
        keys: Arc::new(RwLock::new(keys)),
    })
}
/// Reads the base64-encoded 32-byte master key stored at `path`, or generates
/// a random one and writes it there when the file does not exist.
///
/// # Errors
///
/// `CryptoError::Io` on read/write failure, `CryptoError::EncryptionFailed`
/// when the file is not valid base64, and `CryptoError::InvalidKeySize` when
/// the decoded key is not exactly 32 bytes.
// NOTE(review): the key file is written with default permissions — confirm
// whether it should be restricted to the owning user.
fn load_or_create_master_key(path: &Path) -> Result<[u8; 32], CryptoError> {
    if !path.exists() {
        // First use: mint a fresh key and persist it as base64 text.
        let mut fresh = [0u8; 32];
        rand::thread_rng().fill_bytes(&mut fresh);
        std::fs::write(path, B64.encode(fresh)).map_err(CryptoError::Io)?;
        return Ok(fresh);
    }

    let text = std::fs::read_to_string(path).map_err(CryptoError::Io)?;
    let raw = match B64.decode(text.trim()) {
        Ok(bytes) => bytes,
        Err(e) => {
            return Err(CryptoError::EncryptionFailed(format!(
                "Bad master key encoding: {}",
                e
            )));
        }
    };
    // Conversion succeeds only for exactly 32 bytes.
    <[u8; 32]>::try_from(raw.as_slice()).map_err(|_| CryptoError::InvalidKeySize(raw.len()))
}
/// Encrypts `plaintext_key` with AES-256-GCM under `master_key`.
///
/// Returns base64 of a fresh random 12-byte nonce followed by the ciphertext.
///
/// # Errors
///
/// `CryptoError::EncryptionFailed` if the cipher reports an error.
fn encrypt_key_material(
    master_key: &[u8; 32],
    plaintext_key: &[u8],
) -> Result<String, CryptoError> {
    let mut iv = [0u8; 12];
    rand::thread_rng().fill_bytes(&mut iv);

    let sealed = Aes256Gcm::new(master_key.into())
        .encrypt(Nonce::from_slice(&iv), plaintext_key)
        .map_err(|e| CryptoError::EncryptionFailed(e.to_string()))?;

    // Layout: nonce || ciphertext, so decryption can recover the nonce.
    let blob = [iv.as_slice(), sealed.as_slice()].concat();
    Ok(B64.encode(&blob))
}
/// Reverses [`Self::encrypt_key_material`]: decodes the base64 blob, splits
/// off the leading 12-byte nonce and decrypts the rest under `master_key`.
///
/// # Errors
///
/// `CryptoError::EncryptionFailed` for bad base64 or a blob shorter than the
/// nonce; `CryptoError::DecryptionFailed(0)` when decryption fails.
fn decrypt_key_material(
    master_key: &[u8; 32],
    encrypted_b64: &str,
) -> Result<Vec<u8>, CryptoError> {
    let blob = match B64.decode(encrypted_b64) {
        Ok(bytes) => bytes,
        Err(e) => {
            return Err(CryptoError::EncryptionFailed(format!(
                "Bad key material encoding: {}",
                e
            )));
        }
    };
    if blob.len() < 12 {
        return Err(CryptoError::EncryptionFailed(
            "Encrypted key material too short".to_string(),
        ));
    }

    let iv = &blob[..12];
    let sealed = &blob[12..];
    Aes256Gcm::new(master_key.into())
        .decrypt(Nonce::from_slice(iv), sealed)
        .map_err(|_| CryptoError::DecryptionFailed(0))
}
/// Writes the current in-memory key list to `keys_path` as pretty-printed JSON.
///
/// # Errors
///
/// `CryptoError::EncryptionFailed` if serialization fails, `CryptoError::Io`
/// if the file cannot be written.
async fn save(&self) -> Result<(), CryptoError> {
    // The read guard stays alive until the file has been written.
    let guard = self.keys.read().await;
    let snapshot = KmsStore {
        keys: guard.to_vec(),
    };
    let json = match serde_json::to_string_pretty(&snapshot) {
        Ok(text) => text,
        Err(e) => return Err(CryptoError::EncryptionFailed(e.to_string())),
    };
    std::fs::write(&self.keys_path, json).map_err(CryptoError::Io)
}
/// Creates a new enabled key with random 32-byte material, appends it to the
/// key list and persists the store.
///
/// The key material is stored encrypted under the master key; the returned
/// record is a copy of what was stored.
///
/// # Errors
///
/// Propagates failures from encrypting the key material or saving the store.
pub async fn create_key(&self, description: &str) -> Result<KmsKey, CryptoError> {
    let key_id = uuid::Uuid::new_v4().to_string();

    let mut material = [0u8; 32];
    rand::thread_rng().fill_bytes(&mut material);

    let master = self.master_key.read().await;
    let encrypted_key_material = Self::encrypt_key_material(&master, &material)?;

    let created = KmsKey {
        arn: format!("arn:aws:kms:local:000000000000:key/{}", key_id),
        key_id,
        description: description.to_string(),
        creation_date: Utc::now(),
        enabled: true,
        key_state: "Enabled".to_string(),
        key_usage: "ENCRYPT_DECRYPT".to_string(),
        key_spec: "SYMMETRIC_DEFAULT".to_string(),
        encrypted_key_material,
    };

    {
        // Release the write lock before save() takes its read lock.
        let mut keys = self.keys.write().await;
        keys.push(created.clone());
    }
    self.save().await?;
    Ok(created)
}
/// Returns a copy of every key record currently held in memory.
pub async fn list_keys(&self) -> Vec<KmsKey> {
    let guard = self.keys.read().await;
    guard.to_vec()
}
/// Looks up a key whose id or ARN equals `key_id` and returns a copy of it.
pub async fn get_key(&self, key_id: &str) -> Option<KmsKey> {
    let guard = self.keys.read().await;
    for candidate in guard.iter() {
        if candidate.key_id == key_id || candidate.arn == key_id {
            return Some(candidate.clone());
        }
    }
    None
}
/// Removes every key whose id or ARN equals `key_id`.
///
/// Returns `Ok(true)` and persists the store when something was removed,
/// `Ok(false)` (without touching the file) otherwise.
///
/// # Errors
///
/// Propagates failures from saving the store.
pub async fn delete_key(&self, key_id: &str) -> Result<bool, CryptoError> {
    let mut guard = self.keys.write().await;
    let before = guard.len();
    guard.retain(|k| !(k.key_id == key_id || k.arn == key_id));
    let removed = guard.len() != before;
    // Release the write lock before save() takes its read lock.
    drop(guard);

    if !removed {
        return Ok(false);
    }
    self.save().await?;
    Ok(true)
}
/// Marks the key identified by `key_id` as enabled and persists the store.
///
/// `key_id` may be either the key id or the key ARN, matching the lookup rule
/// used by `get_key`, `delete_key` and `decrypt_data_key`.
///
/// Returns `Ok(true)` when the key was found and updated, `Ok(false)` when no
/// key matches.
///
/// # Errors
///
/// Propagates failures from saving the store.
pub async fn enable_key(&self, key_id: &str) -> Result<bool, CryptoError> {
    {
        let mut keys = self.keys.write().await;
        let Some(key) = keys
            .iter_mut()
            .find(|k| k.key_id == key_id || k.arn == key_id)
        else {
            return Ok(false);
        };
        key.enabled = true;
        key.key_state = "Enabled".to_string();
        // The write lock is released here, before save() takes its read lock.
    }
    self.save().await?;
    Ok(true)
}
/// Marks the key identified by `key_id` as disabled and persists the store.
///
/// `key_id` may be either the key id or the key ARN, matching the lookup rule
/// used by `get_key`, `delete_key` and `decrypt_data_key`.
///
/// Returns `Ok(true)` when the key was found and updated, `Ok(false)` when no
/// key matches.
///
/// # Errors
///
/// Propagates failures from saving the store.
pub async fn disable_key(&self, key_id: &str) -> Result<bool, CryptoError> {
    {
        let mut keys = self.keys.write().await;
        let Some(key) = keys
            .iter_mut()
            .find(|k| k.key_id == key_id || k.arn == key_id)
        else {
            return Ok(false);
        };
        key.enabled = false;
        key.key_state = "Disabled".to_string();
        // The write lock is released here, before save() takes its read lock.
    }
    self.save().await?;
    Ok(true)
}
/// Returns the decrypted key material of the key matching `key_id` (id or ARN).
///
/// # Errors
///
/// `CryptoError::EncryptionFailed` when the key does not exist or is
/// disabled; otherwise whatever decrypting the stored material reports.
pub async fn decrypt_data_key(&self, key_id: &str) -> Result<Vec<u8>, CryptoError> {
    let guard = self.keys.read().await;
    let Some(entry) = guard
        .iter()
        .find(|k| k.key_id == key_id || k.arn == key_id)
    else {
        return Err(CryptoError::EncryptionFailed(
            "KMS key not found".to_string(),
        ));
    };
    if !entry.enabled {
        return Err(CryptoError::EncryptionFailed(
            "KMS key is disabled".to_string(),
        ));
    }

    let master = self.master_key.read().await;
    Self::decrypt_key_material(&master, &entry.encrypted_key_material)
}
/// Encrypts `plaintext` with AES-256-GCM using the material of key `key_id`.
///
/// Returns a fresh random 12-byte nonce followed by the ciphertext.
///
/// # Errors
///
/// Fails when the key cannot be resolved (missing / disabled), with
/// `CryptoError::InvalidKeySize` when its material is not 32 bytes, and with
/// `CryptoError::EncryptionFailed` when the cipher reports an error.
pub async fn encrypt_data(
    &self,
    key_id: &str,
    plaintext: &[u8],
) -> Result<Vec<u8>, CryptoError> {
    let material = self.decrypt_data_key(key_id).await?;
    // Conversion succeeds only for exactly 32 bytes; the Vec comes back on failure.
    let key_arr = <[u8; 32]>::try_from(material)
        .map_err(|raw| CryptoError::InvalidKeySize(raw.len()))?;

    let mut iv = [0u8; 12];
    rand::thread_rng().fill_bytes(&mut iv);

    let sealed = Aes256Gcm::new(&key_arr.into())
        .encrypt(Nonce::from_slice(&iv), plaintext)
        .map_err(|e| CryptoError::EncryptionFailed(e.to_string()))?;

    Ok([iv.as_slice(), sealed.as_slice()].concat())
}
/// Decrypts a blob produced by [`Self::encrypt_data`] with key `key_id`.
///
/// `ciphertext` must start with the 12-byte nonce, followed by the
/// AES-256-GCM ciphertext.
///
/// # Errors
///
/// `CryptoError::EncryptionFailed` when the input is shorter than the nonce
/// or the key cannot be resolved, `CryptoError::InvalidKeySize` when the key
/// material is not 32 bytes, and `CryptoError::DecryptionFailed(0)` when
/// decryption fails.
pub async fn decrypt_data(
    &self,
    key_id: &str,
    ciphertext: &[u8],
) -> Result<Vec<u8>, CryptoError> {
    // Reject truncated input before touching the key store.
    if ciphertext.len() < 12 {
        return Err(CryptoError::EncryptionFailed(
            "Ciphertext too short".to_string(),
        ));
    }

    let material = self.decrypt_data_key(key_id).await?;
    let key_arr = <[u8; 32]>::try_from(material)
        .map_err(|raw| CryptoError::InvalidKeySize(raw.len()))?;

    let iv = &ciphertext[..12];
    let sealed = &ciphertext[12..];
    Aes256Gcm::new(&key_arr.into())
        .decrypt(Nonce::from_slice(iv), sealed)
        .map_err(|_| CryptoError::DecryptionFailed(0))
}
/// Generates a random data key of `num_bytes` bytes and wraps it under key `key_id`.
///
/// Returns `(plaintext_key, wrapped_key)`, where `wrapped_key` is a fresh
/// random 12-byte nonce followed by the AES-256-GCM ciphertext of the data key.
///
/// # Errors
///
/// Fails when the key cannot be resolved (missing / disabled), with
/// `CryptoError::InvalidKeySize` when its material is not 32 bytes, and with
/// `CryptoError::EncryptionFailed` when the cipher reports an error.
pub async fn generate_data_key(
    &self,
    key_id: &str,
    num_bytes: usize,
) -> Result<(Vec<u8>, Vec<u8>), CryptoError> {
    let material = self.decrypt_data_key(key_id).await?;
    let key_arr = <[u8; 32]>::try_from(material)
        .map_err(|raw| CryptoError::InvalidKeySize(raw.len()))?;

    let mut plaintext_key = vec![0u8; num_bytes];
    rand::thread_rng().fill_bytes(&mut plaintext_key);

    let mut iv = [0u8; 12];
    rand::thread_rng().fill_bytes(&mut iv);

    let sealed = Aes256Gcm::new(&key_arr.into())
        .encrypt(Nonce::from_slice(&iv), plaintext_key.as_slice())
        .map_err(|e| CryptoError::EncryptionFailed(e.to_string()))?;

    let wrapped = [iv.as_slice(), sealed.as_slice()].concat();
    Ok((plaintext_key, wrapped))
}
}
/// Loads the base64-encoded 32-byte master key from `keys_dir/master.key`,
/// creating the directory and a fresh random key when none exists yet.
///
/// # Errors
///
/// `CryptoError::Io` on filesystem failure, `CryptoError::EncryptionFailed`
/// when the file is not valid base64, and `CryptoError::InvalidKeySize` when
/// the decoded key is not exactly 32 bytes.
// NOTE(review): the key file is written with default permissions — confirm
// whether it should be restricted to the owning user.
pub async fn load_or_create_master_key(keys_dir: &Path) -> Result<[u8; 32], CryptoError> {
    std::fs::create_dir_all(keys_dir).map_err(CryptoError::Io)?;
    let key_file = keys_dir.join("master.key");

    if !key_file.exists() {
        // First use: mint a fresh key and persist it as base64 text.
        let mut fresh = [0u8; 32];
        rand::thread_rng().fill_bytes(&mut fresh);
        std::fs::write(&key_file, B64.encode(fresh)).map_err(CryptoError::Io)?;
        return Ok(fresh);
    }

    let text = std::fs::read_to_string(&key_file).map_err(CryptoError::Io)?;
    let raw = match B64.decode(text.trim()) {
        Ok(bytes) => bytes,
        Err(e) => {
            return Err(CryptoError::EncryptionFailed(format!(
                "Bad master key encoding: {}",
                e
            )));
        }
    };
    // Conversion succeeds only for exactly 32 bytes.
    <[u8; 32]>::try_from(raw.as_slice()).map_err(|_| CryptoError::InvalidKeySize(raw.len()))
}
// Unit tests for `KmsService` and the standalone master-key loader. Each test
// works in its own temporary directory.
#[cfg(test)]
mod tests {
use super::*;
// A created key is enabled, keeps its description and shows up in `list_keys`.
#[tokio::test]
async fn test_create_and_list_keys() {
let dir = tempfile::tempdir().unwrap();
let kms = KmsService::new(dir.path()).await.unwrap();
let key = kms.create_key("test key").await.unwrap();
assert!(key.enabled);
assert_eq!(key.description, "test key");
assert!(key.key_id.len() > 0);
let keys = kms.list_keys().await;
assert_eq!(keys.len(), 1);
assert_eq!(keys[0].key_id, key.key_id);
}
// Disabling and re-enabling a key is reflected by `get_key`.
#[tokio::test]
async fn test_enable_disable_key() {
let dir = tempfile::tempdir().unwrap();
let kms = KmsService::new(dir.path()).await.unwrap();
let key = kms.create_key("toggle").await.unwrap();
assert!(key.enabled);
kms.disable_key(&key.key_id).await.unwrap();
let k = kms.get_key(&key.key_id).await.unwrap();
assert!(!k.enabled);
kms.enable_key(&key.key_id).await.unwrap();
let k = kms.get_key(&key.key_id).await.unwrap();
assert!(k.enabled);
}
// Deleting a key removes it from both lookup and listing.
#[tokio::test]
async fn test_delete_key() {
let dir = tempfile::tempdir().unwrap();
let kms = KmsService::new(dir.path()).await.unwrap();
let key = kms.create_key("doomed").await.unwrap();
assert!(kms.delete_key(&key.key_id).await.unwrap());
assert!(kms.get_key(&key.key_id).await.is_none());
assert_eq!(kms.list_keys().await.len(), 0);
}
// `encrypt_data` output differs from the plaintext and `decrypt_data` restores it.
#[tokio::test]
async fn test_encrypt_decrypt_data() {
let dir = tempfile::tempdir().unwrap();
let kms = KmsService::new(dir.path()).await.unwrap();
let key = kms.create_key("enc-key").await.unwrap();
let plaintext = b"Hello, KMS!";
let ciphertext = kms.encrypt_data(&key.key_id, plaintext).await.unwrap();
assert_ne!(&ciphertext, plaintext);
let decrypted = kms.decrypt_data(&key.key_id, &ciphertext).await.unwrap();
assert_eq!(decrypted, plaintext);
}
// A generated data key has the requested length; the wrapped form is longer
// because it also carries the nonce.
#[tokio::test]
async fn test_generate_data_key() {
let dir = tempfile::tempdir().unwrap();
let kms = KmsService::new(dir.path()).await.unwrap();
let key = kms.create_key("data-key-gen").await.unwrap();
let (plaintext, wrapped) = kms.generate_data_key(&key.key_id, 32).await.unwrap();
assert_eq!(plaintext.len(), 32);
assert!(wrapped.len() > 32);
}
// Encrypting with a disabled key must fail.
#[tokio::test]
async fn test_disabled_key_cannot_encrypt() {
let dir = tempfile::tempdir().unwrap();
let kms = KmsService::new(dir.path()).await.unwrap();
let key = kms.create_key("disabled").await.unwrap();
kms.disable_key(&key.key_id).await.unwrap();
let result = kms.encrypt_data(&key.key_id, b"test").await;
assert!(result.is_err());
}
// A key created by one service instance is visible to a second instance
// opened on the same directory.
#[tokio::test]
async fn test_persistence_across_reload() {
let dir = tempfile::tempdir().unwrap();
let key_id = {
let kms = KmsService::new(dir.path()).await.unwrap();
let key = kms.create_key("persistent").await.unwrap();
key.key_id
};
let kms2 = KmsService::new(dir.path()).await.unwrap();
let key = kms2.get_key(&key_id).await;
assert!(key.is_some());
assert_eq!(key.unwrap().description, "persistent");
}
// The first call creates the master key; the second call must load the same key.
#[tokio::test]
async fn test_master_key_roundtrip() {
let dir = tempfile::tempdir().unwrap();
let key1 = load_or_create_master_key(dir.path()).await.unwrap();
let key2 = load_or_create_master_key(dir.path()).await.unwrap();
assert_eq!(key1, key2);
}
}

View File

@@ -0,0 +1,4 @@
pub mod hashing;
pub mod aes_gcm;
pub mod kms;
pub mod encryption;

View File

@@ -0,0 +1,39 @@
[package]
name = "myfsio-server"
version = "0.1.0"
edition = "2021"
[dependencies]
myfsio-common = { path = "../myfsio-common" }
myfsio-auth = { path = "../myfsio-auth" }
myfsio-crypto = { path = "../myfsio-crypto" }
myfsio-storage = { path = "../myfsio-storage" }
myfsio-xml = { path = "../myfsio-xml" }
base64 = { workspace = true }
axum = { workspace = true }
tokio = { workspace = true }
tower = { workspace = true }
tower-http = { workspace = true }
hyper = { workspace = true }
bytes = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
tokio-util = { workspace = true }
chrono = { workspace = true }
uuid = { workspace = true }
futures = { workspace = true }
http-body-util = "0.1"
percent-encoding = { workspace = true }
quick-xml = { workspace = true }
mime_guess = "2"
crc32fast = { workspace = true }
duckdb = { workspace = true }
roxmltree = "0.20"
parking_lot = { workspace = true }
regex = "1"
[dev-dependencies]
tempfile = "3"
tower = { workspace = true, features = ["util"] }

View File

@@ -0,0 +1,117 @@
use std::net::SocketAddr;
use std::path::PathBuf;
/// Runtime configuration of the server, assembled from environment variables
/// by [`ServerConfig::from_env`].
#[derive(Debug, Clone)]
pub struct ServerConfig {
    /// Socket address to bind (`HOST` + `PORT`).
    pub bind_addr: SocketAddr,
    /// Root directory for stored data (`STORAGE_ROOT`).
    pub storage_root: PathBuf,
    /// Region name (`AWS_REGION`).
    pub region: String,
    /// Location of the IAM configuration file (`IAM_CONFIG`).
    pub iam_config_path: PathBuf,
    /// `SIGV4_TIMESTAMP_TOLERANCE_SECONDS`.
    pub sigv4_timestamp_tolerance_secs: u64,
    /// `PRESIGNED_URL_MIN_EXPIRY_SECONDS`.
    pub presigned_url_min_expiry: u64,
    /// `PRESIGNED_URL_MAX_EXPIRY_SECONDS`.
    pub presigned_url_max_expiry: u64,
    /// Secret from `SECRET_KEY`, or from the `.secret` file under the storage root.
    pub secret_key: Option<String>,
    /// `ENCRYPTION_ENABLED`.
    pub encryption_enabled: bool,
    /// `KMS_ENABLED`.
    pub kms_enabled: bool,
    /// `GC_ENABLED`.
    pub gc_enabled: bool,
    /// `INTEGRITY_ENABLED`.
    pub integrity_enabled: bool,
    /// `OPERATION_METRICS_ENABLED`.
    pub metrics_enabled: bool,
    /// `LIFECYCLE_ENABLED`.
    pub lifecycle_enabled: bool,
    /// `WEBSITE_HOSTING_ENABLED`.
    pub website_hosting_enabled: bool,
}

impl ServerConfig {
    /// Builds the configuration from the process environment.
    ///
    /// Every setting has a default that is used when its variable is unset or
    /// cannot be parsed:
    ///
    /// * `HOST` (`127.0.0.1`) and `PORT` (`5000`) form the bind address.
    ///   `localhost` is accepted as an alias for the loopback address; any
    ///   other value that is not an IP address falls back to loopback with a
    ///   warning on stderr instead of panicking.
    /// * `STORAGE_ROOT` (`./data`), `AWS_REGION` (`us-east-1`).
    /// * `IAM_CONFIG` (`<storage_root>/.myfsio.sys/config/iam.json`).
    /// * `SIGV4_TIMESTAMP_TOLERANCE_SECONDS` (`900`),
    ///   `PRESIGNED_URL_MIN_EXPIRY_SECONDS` (`1`),
    ///   `PRESIGNED_URL_MAX_EXPIRY_SECONDS` (`604800`).
    /// * `SECRET_KEY`; when unset, empty or equal to `dev-secret-key`, the
    ///   trimmed contents of `<storage_root>/.myfsio.sys/config/.secret` are
    ///   used if that file is readable.
    /// * Feature flags (`ENCRYPTION_ENABLED`, `KMS_ENABLED`, `GC_ENABLED`,
    ///   `INTEGRITY_ENABLED`, `OPERATION_METRICS_ENABLED`, `LIFECYCLE_ENABLED`,
    ///   `WEBSITE_HOSTING_ENABLED`) are on only when the value equals `true`,
    ///   ignoring case.
    pub fn from_env() -> Self {
        /// Parses variable `name`, falling back to `default` when unset or invalid.
        fn env_parsed<T: std::str::FromStr>(name: &str, default: T) -> T {
            std::env::var(name)
                .ok()
                .and_then(|raw| raw.parse().ok())
                .unwrap_or(default)
        }

        /// True only when variable `name` is set to `true` (case-insensitive).
        fn env_flag(name: &str) -> bool {
            std::env::var(name)
                .map(|raw| raw.eq_ignore_ascii_case("true"))
                .unwrap_or(false)
        }

        /// Turns the `HOST` value into an IP address without panicking.
        fn bind_ip(host: &str) -> std::net::IpAddr {
            let loopback = std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST);
            let host = host.trim();
            if host.eq_ignore_ascii_case("localhost") {
                return loopback;
            }
            host.parse().unwrap_or_else(|_| {
                // Fail closed: an unusable HOST binds to loopback only.
                eprintln!(
                    "invalid HOST value {:?}; falling back to {}",
                    host, loopback
                );
                loopback
            })
        }

        let host = std::env::var("HOST").unwrap_or_else(|_| "127.0.0.1".to_string());
        let port: u16 = env_parsed("PORT", 5000);
        let storage_root = PathBuf::from(
            std::env::var("STORAGE_ROOT").unwrap_or_else(|_| "./data".to_string()),
        );
        let region = std::env::var("AWS_REGION").unwrap_or_else(|_| "us-east-1".to_string());

        // Directory holding the default IAM config and the secret file.
        let sys_config_dir = storage_root.join(".myfsio.sys").join("config");
        let iam_config_path = std::env::var("IAM_CONFIG")
            .map(PathBuf::from)
            .unwrap_or_else(|_| sys_config_dir.join("iam.json"));

        let secret_key = match std::env::var("SECRET_KEY").ok() {
            Some(k) if !k.is_empty() && k != "dev-secret-key" => Some(k),
            // No usable env secret: fall back to the secret file, if any.
            _ => std::fs::read_to_string(sys_config_dir.join(".secret"))
                .ok()
                .map(|s| s.trim().to_string()),
        };

        Self {
            bind_addr: SocketAddr::new(bind_ip(&host), port),
            storage_root,
            region,
            iam_config_path,
            sigv4_timestamp_tolerance_secs: env_parsed("SIGV4_TIMESTAMP_TOLERANCE_SECONDS", 900),
            presigned_url_min_expiry: env_parsed("PRESIGNED_URL_MIN_EXPIRY_SECONDS", 1),
            presigned_url_max_expiry: env_parsed("PRESIGNED_URL_MAX_EXPIRY_SECONDS", 604800),
            secret_key,
            encryption_enabled: env_flag("ENCRYPTION_ENABLED"),
            kms_enabled: env_flag("KMS_ENABLED"),
            gc_enabled: env_flag("GC_ENABLED"),
            integrity_enabled: env_flag("INTEGRITY_ENABLED"),
            metrics_enabled: env_flag("OPERATION_METRICS_ENABLED"),
            lifecycle_enabled: env_flag("LIFECYCLE_ENABLED"),
            website_hosting_enabled: env_flag("WEBSITE_HOSTING_ENABLED"),
        }
    }
}

View File

@@ -0,0 +1,704 @@
use axum::body::Body;
use axum::extract::{Path, State};
use axum::http::StatusCode;
use axum::response::{IntoResponse, Response};
use axum::Extension;
use myfsio_common::types::Principal;
use myfsio_storage::traits::StorageEngine;
use crate::services::site_registry::{PeerSite, SiteInfo};
use crate::services::website_domains::{is_valid_domain, normalize_domain};
use crate::state::AppState;
/// Builds a response with the given status, a JSON content type and the
/// serialized `value` as body.
fn json_response(status: StatusCode, value: serde_json::Value) -> Response {
    let headers = [("content-type", "application/json")];
    let body = value.to_string();
    (status, headers, body).into_response()
}
/// Builds a JSON error response of the form
/// `{"error": {"code": ..., "message": ...}}` with the given status.
fn json_error(code: &str, message: &str, status: StatusCode) -> Response {
    let payload = serde_json::json!({
        "error": {
            "code": code,
            "message": message,
        }
    });
    json_response(status, payload)
}
/// Returns `None` for an admin principal, otherwise the 403 response the
/// handler should send back.
fn require_admin(principal: &Principal) -> Option<Response> {
    if principal.is_admin {
        None
    } else {
        Some(json_error(
            "AccessDenied",
            "Admin access required",
            StatusCode::FORBIDDEN,
        ))
    }
}
/// Collects the whole request body and parses it as JSON.
///
/// Returns `None` when the body cannot be read or is not valid JSON.
async fn read_json_body(body: Body) -> Option<serde_json::Value> {
    let collected = match http_body_util::BodyExt::collect(body).await {
        Ok(collected) => collected,
        Err(_) => return None,
    };
    let raw = collected.to_bytes();
    serde_json::from_slice(&raw).ok()
}
/// Validates a site id; returns `None` when valid, otherwise an error message.
///
/// A valid id is 1-63 bytes long, starts with an ASCII alphanumeric character
/// and contains only ASCII alphanumerics, `-` and `_`.
fn validate_site_id(site_id: &str) -> Option<String> {
    if !(1..=63).contains(&site_id.len()) {
        return Some("site_id must be 1-63 characters".to_string());
    }

    // The length check above guarantees a first character exists.
    let starts_ok = site_id
        .chars()
        .next()
        .map_or(false, |c| c.is_ascii_alphanumeric());
    if !starts_ok {
        return Some("site_id must start with alphanumeric".to_string());
    }

    let has_illegal_char = site_id
        .chars()
        .any(|c| !(c.is_ascii_alphanumeric() || c == '-' || c == '_'));
    if has_illegal_char {
        return Some("site_id must contain only alphanumeric, hyphens, underscores".to_string());
    }
    None
}
/// Validates an endpoint URL; returns `None` when it starts with `http://` or
/// `https://`, otherwise an error message.
fn validate_endpoint(endpoint: &str) -> Option<String> {
    let has_http_scheme = ["http://", "https://"]
        .iter()
        .any(|&scheme| endpoint.starts_with(scheme));
    if has_http_scheme {
        None
    } else {
        Some("Endpoint must be http or https URL".to_string())
    }
}
/// Validates a region name; returns `None` when valid, otherwise an error message.
///
/// A valid region has exactly three `-`-separated parts, such as `us-east-1`:
/// at least two lowercase ASCII letters, one or more lowercase ASCII letters,
/// then one or more ASCII digits.
///
/// The check is done directly on the string rather than by compiling a regular
/// expression on every call, and only ASCII digits are accepted in the final
/// part.
fn validate_region(region: &str) -> Option<String> {
    let mut parts = region.split('-');
    // Exactly three parts: a fourth `next()` must yield nothing.
    let valid = match (parts.next(), parts.next(), parts.next(), parts.next()) {
        (Some(area), Some(direction), Some(number), None) => {
            area.len() >= 2
                && area.bytes().all(|b| b.is_ascii_lowercase())
                && !direction.is_empty()
                && direction.bytes().all(|b| b.is_ascii_lowercase())
                && !number.is_empty()
                && number.bytes().all(|b| b.is_ascii_digit())
        }
        _ => false,
    };
    if valid {
        None
    } else {
        Some("Region must match format like us-east-1".to_string())
    }
}
/// Validates a priority; returns `None` when it lies in `0..=1000`, otherwise
/// an error message.
fn validate_priority(priority: i64) -> Option<String> {
    if (0..=1000).contains(&priority) {
        None
    } else {
        Some("Priority must be between 0 and 1000".to_string())
    }
}
pub async fn get_local_site(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
) -> Response {
if let Some(err) = require_admin(&principal) { return err; }
if let Some(ref registry) = state.site_registry {
if let Some(local) = registry.get_local_site() {
return json_response(StatusCode::OK, serde_json::to_value(&local).unwrap());
}
}
json_error("NotFound", "Local site not configured", StatusCode::NOT_FOUND)
}
pub async fn update_local_site(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
body: Body,
) -> Response {
if let Some(err) = require_admin(&principal) { return err; }
let registry = match &state.site_registry {
Some(r) => r,
None => return json_error("InvalidRequest", "Site registry not available", StatusCode::BAD_REQUEST),
};
let payload = match read_json_body(body).await {
Some(v) => v,
None => return json_error("MalformedJSON", "Invalid JSON body", StatusCode::BAD_REQUEST),
};
let site_id = match payload.get("site_id").and_then(|v| v.as_str()) {
Some(s) => s.to_string(),
None => return json_error("ValidationError", "site_id is required", StatusCode::BAD_REQUEST),
};
if let Some(err) = validate_site_id(&site_id) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
let endpoint = payload.get("endpoint").and_then(|v| v.as_str()).unwrap_or("").to_string();
if !endpoint.is_empty() {
if let Some(err) = validate_endpoint(&endpoint) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
}
if let Some(p) = payload.get("priority").and_then(|v| v.as_i64()) {
if let Some(err) = validate_priority(p) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
}
if let Some(r) = payload.get("region").and_then(|v| v.as_str()) {
if let Some(err) = validate_region(r) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
}
let existing = registry.get_local_site();
let site = SiteInfo {
site_id: site_id.clone(),
endpoint,
region: payload.get("region").and_then(|v| v.as_str()).unwrap_or("us-east-1").to_string(),
priority: payload.get("priority").and_then(|v| v.as_i64()).unwrap_or(100) as i32,
display_name: payload.get("display_name").and_then(|v| v.as_str()).unwrap_or(&site_id).to_string(),
created_at: existing.and_then(|e| e.created_at),
};
registry.set_local_site(site.clone());
json_response(StatusCode::OK, serde_json::to_value(&site).unwrap())
}
/// Admin handler: returns the local site, all peers and the peer count.
///
/// Without a site registry an empty listing is returned (still 200).
pub async fn list_all_sites(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(registry) = state.site_registry.as_ref() else {
        return json_response(
            StatusCode::OK,
            serde_json::json!({"local": null, "peers": [], "total_peers": 0}),
        );
    };

    let local = registry.get_local_site();
    let peers = registry.list_peers();
    let total_peers = peers.len();
    let listing = serde_json::json!({
        "local": local,
        "peers": peers,
        "total_peers": total_peers,
    });
    json_response(StatusCode::OK, listing)
}
pub async fn register_peer_site(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
body: Body,
) -> Response {
if let Some(err) = require_admin(&principal) { return err; }
let registry = match &state.site_registry {
Some(r) => r,
None => return json_error("InvalidRequest", "Site registry not available", StatusCode::BAD_REQUEST),
};
let payload = match read_json_body(body).await {
Some(v) => v,
None => return json_error("MalformedJSON", "Invalid JSON body", StatusCode::BAD_REQUEST),
};
let site_id = match payload.get("site_id").and_then(|v| v.as_str()) {
Some(s) => s.to_string(),
None => return json_error("ValidationError", "site_id is required", StatusCode::BAD_REQUEST),
};
if let Some(err) = validate_site_id(&site_id) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
let endpoint = match payload.get("endpoint").and_then(|v| v.as_str()) {
Some(e) => e.to_string(),
None => return json_error("ValidationError", "endpoint is required", StatusCode::BAD_REQUEST),
};
if let Some(err) = validate_endpoint(&endpoint) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
let region = payload.get("region").and_then(|v| v.as_str()).unwrap_or("us-east-1").to_string();
if let Some(err) = validate_region(&region) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
let priority = payload.get("priority").and_then(|v| v.as_i64()).unwrap_or(100);
if let Some(err) = validate_priority(priority) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
if registry.get_peer(&site_id).is_some() {
return json_error("AlreadyExists", &format!("Peer site '{}' already exists", site_id), StatusCode::CONFLICT);
}
let peer = PeerSite {
site_id: site_id.clone(),
endpoint,
region,
priority: priority as i32,
display_name: payload.get("display_name").and_then(|v| v.as_str()).unwrap_or(&site_id).to_string(),
connection_id: payload.get("connection_id").and_then(|v| v.as_str()).map(|s| s.to_string()),
created_at: Some(chrono::Utc::now().to_rfc3339()),
is_healthy: false,
last_health_check: None,
};
registry.add_peer(peer.clone());
json_response(StatusCode::CREATED, serde_json::to_value(&peer).unwrap())
}
/// Admin handler: returns the peer site `site_id` as JSON, or 404 when the
/// registry or the peer is missing.
pub async fn get_peer_site(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(site_id): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(registry) = state.site_registry.as_ref() else {
        return json_error("NotFound", "Site registry not available", StatusCode::NOT_FOUND);
    };

    if let Some(peer) = registry.get_peer(&site_id) {
        return json_response(StatusCode::OK, serde_json::to_value(&peer).unwrap());
    }
    json_error(
        "NotFound",
        &format!("Peer site '{}' not found", site_id),
        StatusCode::NOT_FOUND,
    )
}
pub async fn update_peer_site(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
Path(site_id): Path<String>,
body: Body,
) -> Response {
if let Some(err) = require_admin(&principal) { return err; }
let registry = match &state.site_registry {
Some(r) => r,
None => return json_error("NotFound", "Site registry not available", StatusCode::NOT_FOUND),
};
let existing = match registry.get_peer(&site_id) {
Some(p) => p,
None => return json_error("NotFound", &format!("Peer site '{}' not found", site_id), StatusCode::NOT_FOUND),
};
let payload = match read_json_body(body).await {
Some(v) => v,
None => return json_error("MalformedJSON", "Invalid JSON body", StatusCode::BAD_REQUEST),
};
if let Some(ep) = payload.get("endpoint").and_then(|v| v.as_str()) {
if let Some(err) = validate_endpoint(ep) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
}
if let Some(p) = payload.get("priority").and_then(|v| v.as_i64()) {
if let Some(err) = validate_priority(p) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
}
if let Some(r) = payload.get("region").and_then(|v| v.as_str()) {
if let Some(err) = validate_region(r) {
return json_error("ValidationError", &err, StatusCode::BAD_REQUEST);
}
}
let peer = PeerSite {
site_id: site_id.clone(),
endpoint: payload.get("endpoint").and_then(|v| v.as_str()).unwrap_or(&existing.endpoint).to_string(),
region: payload.get("region").and_then(|v| v.as_str()).unwrap_or(&existing.region).to_string(),
priority: payload.get("priority").and_then(|v| v.as_i64()).unwrap_or(existing.priority as i64) as i32,
display_name: payload.get("display_name").and_then(|v| v.as_str()).unwrap_or(&existing.display_name).to_string(),
connection_id: payload.get("connection_id").and_then(|v| v.as_str()).map(|s| s.to_string()).or(existing.connection_id),
created_at: existing.created_at,
is_healthy: existing.is_healthy,
last_health_check: existing.last_health_check,
};
registry.update_peer(peer.clone());
json_response(StatusCode::OK, serde_json::to_value(&peer).unwrap())
}
/// Admin handler: deletes the peer site `site_id`.
///
/// Responds 204 on success and 404 when the registry or the peer is missing.
pub async fn delete_peer_site(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(site_id): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(registry) = state.site_registry.as_ref() else {
        return json_error("NotFound", "Site registry not available", StatusCode::NOT_FOUND);
    };

    if registry.delete_peer(&site_id) {
        StatusCode::NO_CONTENT.into_response()
    } else {
        json_error(
            "NotFound",
            &format!("Peer site '{}' not found", site_id),
            StatusCode::NOT_FOUND,
        )
    }
}
pub async fn check_peer_health(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
Path(site_id): Path<String>,
) -> Response {
if let Some(err) = require_admin(&principal) { return err; }
let registry = match &state.site_registry {
Some(r) => r,
None => return json_error("NotFound", "Site registry not available", StatusCode::NOT_FOUND),
};
if registry.get_peer(&site_id).is_none() {
return json_error("NotFound", &format!("Peer site '{}' not found", site_id), StatusCode::NOT_FOUND);
}
json_response(StatusCode::OK, serde_json::json!({
"site_id": site_id,
"is_healthy": false,
"error": "Health check not implemented in standalone mode",
"checked_at": chrono::Utc::now().timestamp_millis() as f64 / 1000.0,
}))
}
/// Returns the local site plus all peers as one list ordered by `priority`.
///
/// Each entry is the serialized site with an added `is_local` flag; the local
/// site is additionally forced to `is_healthy: true`. Entries without an
/// integer `priority` sort as if it were `100`. `healthy_count` counts
/// entries whose `is_healthy` field is the boolean `true`.
///
/// When no site registry is configured the handler still answers `200 OK`
/// with an empty topology. A response produced by `require_admin` is returned
/// unchanged.
///
/// # Panics
/// Panics if a site fails to serialize or does not serialize to a JSON object.
pub async fn get_topology(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(registry) = &state.site_registry else {
        return json_response(
            StatusCode::OK,
            serde_json::json!({"sites": [], "total": 0, "healthy_count": 0}),
        );
    };
    let local_site = registry.get_local_site();
    let peer_sites = registry.list_peers();

    let mut entries: Vec<serde_json::Value> = Vec::new();
    if let Some(site) = local_site {
        let mut doc = serde_json::to_value(&site).unwrap();
        let fields = doc.as_object_mut().unwrap();
        fields.insert("is_local".to_string(), serde_json::json!(true));
        fields.insert("is_healthy".to_string(), serde_json::json!(true));
        entries.push(doc);
    }
    for peer in &peer_sites {
        let mut doc = serde_json::to_value(peer).unwrap();
        let fields = doc.as_object_mut().unwrap();
        fields.insert("is_local".to_string(), serde_json::json!(false));
        entries.push(doc);
    }

    // Stable sort: entries with equal priority keep local-first, then peer order.
    entries.sort_by_key(|entry| {
        entry
            .get("priority")
            .and_then(|priority| priority.as_i64())
            .unwrap_or(100)
    });
    let healthy_count = entries
        .iter()
        .filter(|entry| {
            matches!(
                entry.get("is_healthy").and_then(|flag| flag.as_bool()),
                Some(true)
            )
        })
        .count();
    let total = entries.len();
    json_response(
        StatusCode::OK,
        serde_json::json!({
            "sites": entries,
            "total": total,
            "healthy_count": healthy_count,
        }),
    )
}
/// Reports the bidirectional replication status for a peer site.
///
/// This is a placeholder: for a known peer it always answers `200 OK` with
/// `is_fully_configured: false`, no rules, a `null` remote status and a single
/// `NOT_IMPLEMENTED` warning. Only the local site's id and endpoint (when a
/// local site exists) are read from the registry.
///
/// Responds with a `NotFound` JSON error (HTTP 404) when no site registry is
/// configured or the peer is unknown. A response produced by `require_admin`
/// is returned unchanged.
pub async fn check_bidirectional_status(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(site_id): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(registry) = &state.site_registry else {
        return json_error("NotFound", "Site registry not available", StatusCode::NOT_FOUND);
    };
    let Some(_known_peer) = registry.get_peer(&site_id) else {
        let message = format!("Peer site '{}' not found", site_id);
        return json_error("NotFound", &message, StatusCode::NOT_FOUND);
    };
    let local_site = registry.get_local_site();
    let local_site_id = local_site.as_ref().map(|site| &site.site_id);
    let local_endpoint = local_site.as_ref().map(|site| &site.endpoint);
    let report = serde_json::json!({
        "site_id": site_id,
        "local_site_id": local_site_id,
        "local_endpoint": local_endpoint,
        "local_bidirectional_rules": [],
        "local_site_sync_enabled": false,
        "remote_status": null,
        "issues": [{"code": "NOT_IMPLEMENTED", "message": "Bidirectional status check not implemented in standalone mode", "severity": "warning"}],
        "is_fully_configured": false,
    });
    json_response(StatusCode::OK, report)
}
/// Lists IAM users as `{"users": [...]}` with `200 OK`.
///
/// A response produced by `require_admin` is returned unchanged.
pub async fn iam_list_users(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let all_users = state.iam.list_users().await;
    let payload = serde_json::json!({"users": all_users});
    json_response(StatusCode::OK, payload)
}
/// Returns the IAM user matching `identifier` with `200 OK`.
///
/// Responds with a `NotFound` JSON error (HTTP 404) when the IAM service
/// yields no user. A response produced by `require_admin` is returned
/// unchanged.
pub async fn iam_get_user(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(identifier): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(user) = state.iam.get_user(&identifier).await else {
        let message = format!("User '{}' not found", identifier);
        return json_error("NotFound", &message, StatusCode::NOT_FOUND);
    };
    json_response(StatusCode::OK, user)
}
/// Returns the policies of the IAM user matching `identifier` as
/// `{"policies": ...}` with `200 OK`.
///
/// Responds with a `NotFound` JSON error (HTTP 404) when the IAM service
/// yields no policies for the identifier. A response produced by
/// `require_admin` is returned unchanged.
pub async fn iam_get_user_policies(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(identifier): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(policies) = state.iam.get_user_policies(&identifier) else {
        let message = format!("User '{}' not found", identifier);
        return json_error("NotFound", &message, StatusCode::NOT_FOUND);
    };
    json_response(StatusCode::OK, serde_json::json!({"policies": policies}))
}
/// Creates a new access key for the IAM user matching `identifier`.
///
/// Responds with `201 Created` and the value returned by the IAM service, or
/// with an `InvalidRequest` JSON error (HTTP 400) carrying the service's error
/// text. A response produced by `require_admin` is returned unchanged.
pub async fn iam_create_access_key(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(identifier): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let created = match state.iam.create_access_key(&identifier) {
        Ok(credentials) => credentials,
        Err(reason) => return json_error("InvalidRequest", &reason, StatusCode::BAD_REQUEST),
    };
    json_response(StatusCode::CREATED, created)
}
/// Deletes an access key.
///
/// The path carries a user identifier and an access key; only the access key
/// is passed to the IAM service — the identifier segment is accepted but not
/// consulted.
///
/// Responds with `204 No Content` on success, or with an `InvalidRequest`
/// JSON error (HTTP 400) carrying the service's error text. A response
/// produced by `require_admin` is returned unchanged.
pub async fn iam_delete_access_key(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path((_identifier, access_key)): Path<(String, String)>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    if let Err(reason) = state.iam.delete_access_key(&access_key) {
        return json_error("InvalidRequest", &reason, StatusCode::BAD_REQUEST);
    }
    StatusCode::NO_CONTENT.into_response()
}
/// Disables the IAM user matching `identifier`.
///
/// Responds with `200 OK` and `{"status": "disabled"}` on success, or with an
/// `InvalidRequest` JSON error (HTTP 400) carrying the service's error text.
/// A response produced by `require_admin` is returned unchanged.
pub async fn iam_disable_user(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(identifier): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    if let Err(reason) = state.iam.set_user_enabled(&identifier, false).await {
        return json_error("InvalidRequest", &reason, StatusCode::BAD_REQUEST);
    }
    json_response(StatusCode::OK, serde_json::json!({"status": "disabled"}))
}
/// Enables the IAM user matching `identifier`.
///
/// Responds with `200 OK` and `{"status": "enabled"}` on success, or with an
/// `InvalidRequest` JSON error (HTTP 400) carrying the service's error text.
/// A response produced by `require_admin` is returned unchanged.
pub async fn iam_enable_user(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(identifier): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    if let Err(reason) = state.iam.set_user_enabled(&identifier, true).await {
        return json_error("InvalidRequest", &reason, StatusCode::BAD_REQUEST);
    }
    json_response(StatusCode::OK, serde_json::json!({"status": "enabled"}))
}
/// Returns every website domain mapping from the store with `200 OK`.
///
/// Responds with an `InvalidRequest` JSON error (HTTP 400) when the website
/// domain store is not configured. A response produced by `require_admin` is
/// returned unchanged.
pub async fn list_website_domains(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(store) = &state.website_domains else {
        return json_error("InvalidRequest", "Website hosting is not enabled", StatusCode::BAD_REQUEST);
    };
    let mappings = store.list_all();
    json_response(StatusCode::OK, serde_json::json!(mappings))
}
/// Creates a domain → bucket mapping for website hosting.
///
/// The JSON body supplies `domain` (passed through `normalize_domain` and
/// checked with `is_valid_domain`) and `bucket` (trimmed). Checks run in this
/// order, each failing fast:
///
/// 1. website hosting enabled — otherwise `InvalidRequest` (400)
/// 2. body is readable JSON — otherwise `MalformedJSON` (400)
/// 3. `domain` non-empty and valid — otherwise `ValidationError` (400)
/// 4. `bucket` non-empty — otherwise `ValidationError` (400)
/// 5. bucket exists — otherwise `NoSuchBucket` (404); a storage error is
///    reported the same way
/// 6. domain not yet mapped — otherwise `Conflict` (409)
///
/// On success the mapping is stored and echoed back with `201 Created`.
/// A response produced by `require_admin` is returned unchanged.
pub async fn create_website_domain(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    body: Body,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(store) = &state.website_domains else {
        return json_error("InvalidRequest", "Website hosting is not enabled", StatusCode::BAD_REQUEST);
    };
    let Some(payload) = read_json_body(body).await else {
        return json_error("MalformedJSON", "Invalid JSON body", StatusCode::BAD_REQUEST);
    };

    let raw_domain = payload.get("domain").and_then(|field| field.as_str()).unwrap_or("");
    let domain = normalize_domain(raw_domain);
    if domain.is_empty() {
        return json_error("ValidationError", "domain is required", StatusCode::BAD_REQUEST);
    }
    if !is_valid_domain(&domain) {
        let message = format!("Invalid domain: '{}'", domain);
        return json_error("ValidationError", &message, StatusCode::BAD_REQUEST);
    }

    let raw_bucket = payload.get("bucket").and_then(|field| field.as_str()).unwrap_or("");
    let bucket = raw_bucket.trim().to_string();
    if bucket.is_empty() {
        return json_error("ValidationError", "bucket is required", StatusCode::BAD_REQUEST);
    }
    // Anything other than a definite `Ok(true)` is reported as a missing bucket.
    if !matches!(state.storage.bucket_exists(&bucket).await, Ok(true)) {
        let message = format!("Bucket '{}' does not exist", bucket);
        return json_error("NoSuchBucket", &message, StatusCode::NOT_FOUND);
    }

    if store.get_bucket(&domain).is_some() {
        let message = format!("Domain '{}' is already mapped", domain);
        return json_error("Conflict", &message, StatusCode::CONFLICT);
    }
    store.set_mapping(&domain, &bucket);
    json_response(
        StatusCode::CREATED,
        serde_json::json!({"domain": domain, "bucket": bucket}),
    )
}
/// Looks up the bucket mapped to a website domain.
///
/// The path segment is passed through `normalize_domain` before the lookup.
/// Responds with `200 OK` and `{"domain", "bucket"}` when a mapping exists,
/// with `NotFound` (404) when it does not, and with `InvalidRequest` (400)
/// when the website domain store is not configured. A response produced by
/// `require_admin` is returned unchanged.
pub async fn get_website_domain(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(domain): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(store) = &state.website_domains else {
        return json_error("InvalidRequest", "Website hosting is not enabled", StatusCode::BAD_REQUEST);
    };
    let domain = normalize_domain(&domain);
    let Some(bucket) = store.get_bucket(&domain) else {
        let message = format!("No mapping found for domain '{}'", domain);
        return json_error("NotFound", &message, StatusCode::NOT_FOUND);
    };
    json_response(
        StatusCode::OK,
        serde_json::json!({"domain": domain, "bucket": bucket}),
    )
}
/// Points an existing website domain mapping at a different bucket.
///
/// The path segment is passed through `normalize_domain`; the JSON body
/// supplies `bucket` (trimmed). Checks run in this order, each failing fast:
///
/// 1. website hosting enabled — otherwise `InvalidRequest` (400)
/// 2. body is readable JSON — otherwise `MalformedJSON` (400)
/// 3. `bucket` non-empty — otherwise `ValidationError` (400)
/// 4. bucket exists — otherwise `NoSuchBucket` (404); a storage error is
///    reported the same way
/// 5. domain already mapped — otherwise `NotFound` (404)
///
/// On success the mapping is overwritten and echoed back with `200 OK`.
/// A response produced by `require_admin` is returned unchanged.
pub async fn update_website_domain(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(domain): Path<String>,
    body: Body,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(store) = &state.website_domains else {
        return json_error("InvalidRequest", "Website hosting is not enabled", StatusCode::BAD_REQUEST);
    };
    let domain = normalize_domain(&domain);
    let Some(payload) = read_json_body(body).await else {
        return json_error("MalformedJSON", "Invalid JSON body", StatusCode::BAD_REQUEST);
    };

    let raw_bucket = payload.get("bucket").and_then(|field| field.as_str()).unwrap_or("");
    let bucket = raw_bucket.trim().to_string();
    if bucket.is_empty() {
        return json_error("ValidationError", "bucket is required", StatusCode::BAD_REQUEST);
    }
    // Anything other than a definite `Ok(true)` is reported as a missing bucket.
    if !matches!(state.storage.bucket_exists(&bucket).await, Ok(true)) {
        let message = format!("Bucket '{}' does not exist", bucket);
        return json_error("NoSuchBucket", &message, StatusCode::NOT_FOUND);
    }

    if store.get_bucket(&domain).is_none() {
        let message = format!("No mapping found for domain '{}'", domain);
        return json_error("NotFound", &message, StatusCode::NOT_FOUND);
    }
    store.set_mapping(&domain, &bucket);
    json_response(
        StatusCode::OK,
        serde_json::json!({"domain": domain, "bucket": bucket}),
    )
}
/// Deletes a website domain mapping.
///
/// The path segment is passed through `normalize_domain` first. Responds with
/// `204 No Content` when `delete_mapping` reports success, with `NotFound`
/// (404) when it returns `false`, and with `InvalidRequest` (400) when the
/// website domain store is not configured. A response produced by
/// `require_admin` is returned unchanged.
pub async fn delete_website_domain(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Path(domain): Path<String>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(store) = &state.website_domains else {
        return json_error("InvalidRequest", "Website hosting is not enabled", StatusCode::BAD_REQUEST);
    };
    let domain = normalize_domain(&domain);
    if store.delete_mapping(&domain) {
        StatusCode::NO_CONTENT.into_response()
    } else {
        let message = format!("No mapping found for domain '{}'", domain);
        json_error("NotFound", &message, StatusCode::NOT_FOUND)
    }
}
/// Optional `limit` / `offset` pair that can be deserialized with serde.
///
/// Both fields are `None` when absent (and in the `Default` value).
/// NOTE(review): presumably extracted from the query string of paginated
/// endpoints; none of the nearby handlers use it — confirm against callers.
#[derive(serde::Deserialize, Default)]
pub struct PaginationQuery {
/// Requested `limit`, if supplied.
pub limit: Option<usize>,
/// Requested `offset`, if supplied.
pub offset: Option<usize>,
}
/// Returns the garbage collector's status with `200 OK`.
///
/// When no collector is configured the handler still answers `200 OK`, with
/// `enabled: false` and a hint on how to enable it. A response produced by
/// `require_admin` is returned unchanged.
pub async fn gc_status(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(collector) = &state.gc else {
        return json_response(
            StatusCode::OK,
            serde_json::json!({"enabled": false, "message": "GC is not enabled. Set GC_ENABLED=true to enable."}),
        );
    };
    json_response(StatusCode::OK, collector.status().await)
}
/// Triggers a garbage-collection run immediately.
///
/// The body is optional: an unreadable or missing JSON body is treated as
/// `{}`. `dry_run` is taken from the body and defaults to `false` unless it is
/// the boolean `true`.
///
/// Responds with `200 OK` and the run result, with `Conflict` (409) carrying
/// the collector's error text when `run_now` fails, and with `InvalidRequest`
/// (400) when no collector is configured. A response produced by
/// `require_admin` is returned unchanged.
pub async fn gc_run(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    body: Body,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(collector) = &state.gc else {
        return json_error("InvalidRequest", "GC is not enabled", StatusCode::BAD_REQUEST);
    };
    let options = read_json_body(body)
        .await
        .unwrap_or_else(|| serde_json::json!({}));
    let dry_run = matches!(
        options.get("dry_run").and_then(|flag| flag.as_bool()),
        Some(true)
    );
    match collector.run_now(dry_run).await {
        Err(reason) => json_error("Conflict", &reason, StatusCode::CONFLICT),
        Ok(report) => json_response(StatusCode::OK, report),
    }
}
/// Returns past garbage-collection executions as `{"executions": [...]}`.
///
/// Always answers `200 OK`; the list is empty when no collector is
/// configured. A response produced by `require_admin` is returned unchanged.
pub async fn gc_history(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let payload = if let Some(collector) = &state.gc {
        serde_json::json!({"executions": collector.history().await})
    } else {
        serde_json::json!({"executions": []})
    };
    json_response(StatusCode::OK, payload)
}
/// Returns the integrity checker's status with `200 OK`.
///
/// When no checker is configured the handler still answers `200 OK`, with
/// `enabled: false` and a hint on how to enable it. A response produced by
/// `require_admin` is returned unchanged.
pub async fn integrity_status(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(checker) = &state.integrity else {
        return json_response(
            StatusCode::OK,
            serde_json::json!({"enabled": false, "message": "Integrity checker is not enabled. Set INTEGRITY_ENABLED=true to enable."}),
        );
    };
    json_response(StatusCode::OK, checker.status().await)
}
/// Triggers an integrity scan immediately.
///
/// The body is optional: an unreadable or missing JSON body is treated as
/// `{}`. `dry_run` and `auto_heal` are taken from the body and each defaults
/// to `false` unless it is the boolean `true`.
///
/// Responds with `200 OK` and the run result, with `Conflict` (409) carrying
/// the checker's error text when `run_now` fails, and with `InvalidRequest`
/// (400) when no checker is configured. A response produced by
/// `require_admin` is returned unchanged.
pub async fn integrity_run(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    body: Body,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let Some(checker) = &state.integrity else {
        return json_error("InvalidRequest", "Integrity checker is not enabled", StatusCode::BAD_REQUEST);
    };
    let options = read_json_body(body)
        .await
        .unwrap_or_else(|| serde_json::json!({}));
    let flag = |name: &str| {
        matches!(
            options.get(name).and_then(|value| value.as_bool()),
            Some(true)
        )
    };
    let dry_run = flag("dry_run");
    let auto_heal = flag("auto_heal");
    match checker.run_now(dry_run, auto_heal).await {
        Err(reason) => json_error("Conflict", &reason, StatusCode::CONFLICT),
        Ok(report) => json_response(StatusCode::OK, report),
    }
}
/// Returns past integrity-scan executions as `{"executions": [...]}`.
///
/// Always answers `200 OK`; the list is empty when no checker is configured.
/// A response produced by `require_admin` is returned unchanged.
pub async fn integrity_history(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> Response {
    if let Some(denied) = require_admin(&principal) {
        return denied;
    }
    let payload = if let Some(checker) = &state.integrity {
        serde_json::json!({"executions": checker.history().await})
    } else {
        serde_json::json!({"executions": []})
    };
    json_response(StatusCode::OK, payload)
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,278 @@
use axum::body::Body;
use axum::extract::State;
use axum::http::StatusCode;
use axum::response::{IntoResponse, Response};
use base64::engine::general_purpose::STANDARD as B64;
use base64::Engine;
use serde_json::json;
use crate::state::AppState;
/// Builds a `200 OK` response whose body is `value` serialized as JSON text,
/// with `content-type: application/json`.
fn json_ok(value: serde_json::Value) -> Response {
    let headers = [("content-type", "application/json")];
    let payload = value.to_string();
    (StatusCode::OK, headers, payload).into_response()
}
fn json_err(status: StatusCode, msg: &str) -> Response {
(
status,
[("content-type", "application/json")],
json!({"error": msg}).to_string(),
)
.into_response()
}
/// Lists all KMS keys as `{"keys": [...]}` with `200 OK`.
///
/// Each entry exposes `KeyId`, `Arn`, `Description`, `CreationDate`
/// (RFC 3339), `Enabled`, `KeyState`, `KeyUsage` and `KeySpec`.
/// Responds with `503 Service Unavailable` when KMS is not configured.
pub async fn list_keys(State(state): State<AppState>) -> Response {
    let Some(kms) = &state.kms else {
        return json_err(StatusCode::SERVICE_UNAVAILABLE, "KMS not enabled");
    };
    let stored_keys = kms.list_keys().await;
    let mut keys_json: Vec<serde_json::Value> = Vec::new();
    for key in stored_keys.iter() {
        keys_json.push(json!({
            "KeyId": key.key_id,
            "Arn": key.arn,
            "Description": key.description,
            "CreationDate": key.creation_date.to_rfc3339(),
            "Enabled": key.enabled,
            "KeyState": key.key_state,
            "KeyUsage": key.key_usage,
            "KeySpec": key.key_spec,
        }));
    }
    json_ok(json!({"keys": keys_json}))
}
/// Creates a KMS key, optionally with a description from the request body.
///
/// The description is read from the JSON field `Description`, falling back to
/// `description`. An empty body, a body that is not valid JSON, or a missing /
/// non-string field all yield an empty description rather than an error.
///
/// Responds with `200 OK` and the new key's metadata, `400 Bad Request` when
/// the body cannot be read, `500 Internal Server Error` when key creation
/// fails, and `503 Service Unavailable` when KMS is not configured.
pub async fn create_key(State(state): State<AppState>, body: Body) -> Response {
    let Some(kms) = &state.kms else {
        return json_err(StatusCode::SERVICE_UNAVAILABLE, "KMS not enabled");
    };
    let Ok(collected) = http_body_util::BodyExt::collect(body).await else {
        return json_err(StatusCode::BAD_REQUEST, "Invalid request body");
    };
    let raw = collected.to_bytes();

    // An empty body fails JSON parsing, so it falls through to the same
    // empty-description default as malformed JSON.
    let description = serde_json::from_slice::<serde_json::Value>(&raw)
        .ok()
        .and_then(|doc| {
            doc.get("Description")
                .or_else(|| doc.get("description"))
                .and_then(|text| text.as_str())
                .map(|text| text.to_string())
        })
        .unwrap_or_default();

    let key = match kms.create_key(&description).await {
        Ok(created) => created,
        Err(e) => return json_err(StatusCode::INTERNAL_SERVER_ERROR, &e.to_string()),
    };
    json_ok(json!({
        "KeyId": key.key_id,
        "Arn": key.arn,
        "Description": key.description,
        "CreationDate": key.creation_date.to_rfc3339(),
        "Enabled": key.enabled,
        "KeyState": key.key_state,
    }))
}
/// Returns the metadata of the KMS key identified by `key_id`.
///
/// Responds with `200 OK` and the key's metadata, `404 Not Found` when the
/// key does not exist, and `503 Service Unavailable` when KMS is not
/// configured.
pub async fn get_key(
    State(state): State<AppState>,
    axum::extract::Path(key_id): axum::extract::Path<String>,
) -> Response {
    let Some(kms) = &state.kms else {
        return json_err(StatusCode::SERVICE_UNAVAILABLE, "KMS not enabled");
    };
    let Some(key) = kms.get_key(&key_id).await else {
        return json_err(StatusCode::NOT_FOUND, "Key not found");
    };
    json_ok(json!({
        "KeyId": key.key_id,
        "Arn": key.arn,
        "Description": key.description,
        "CreationDate": key.creation_date.to_rfc3339(),
        "Enabled": key.enabled,
        "KeyState": key.key_state,
        "KeyUsage": key.key_usage,
        "KeySpec": key.key_spec,
    }))
}
/// Deletes the KMS key identified by `key_id`.
///
/// Responds with `204 No Content` when the key was deleted, `404 Not Found`
/// when the KMS reports that nothing was deleted, `500 Internal Server Error`
/// when deletion fails, and `503 Service Unavailable` when KMS is not
/// configured.
pub async fn delete_key(
    State(state): State<AppState>,
    axum::extract::Path(key_id): axum::extract::Path<String>,
) -> Response {
    let Some(kms) = &state.kms else {
        return json_err(StatusCode::SERVICE_UNAVAILABLE, "KMS not enabled");
    };
    let removed = match kms.delete_key(&key_id).await {
        Ok(flag) => flag,
        Err(e) => return json_err(StatusCode::INTERNAL_SERVER_ERROR, &e.to_string()),
    };
    if removed {
        StatusCode::NO_CONTENT.into_response()
    } else {
        json_err(StatusCode::NOT_FOUND, "Key not found")
    }
}
/// Enables the KMS key identified by `key_id`.
///
/// Responds with `200 OK` and `{"status": "enabled"}` on success,
/// `404 Not Found` when the KMS reports no such key,
/// `500 Internal Server Error` when the operation fails, and
/// `503 Service Unavailable` when KMS is not configured.
pub async fn enable_key(
    State(state): State<AppState>,
    axum::extract::Path(key_id): axum::extract::Path<String>,
) -> Response {
    let Some(kms) = &state.kms else {
        return json_err(StatusCode::SERVICE_UNAVAILABLE, "KMS not enabled");
    };
    let found = match kms.enable_key(&key_id).await {
        Ok(flag) => flag,
        Err(e) => return json_err(StatusCode::INTERNAL_SERVER_ERROR, &e.to_string()),
    };
    if found {
        json_ok(json!({"status": "enabled"}))
    } else {
        json_err(StatusCode::NOT_FOUND, "Key not found")
    }
}
/// Disables the KMS key identified by `key_id`.
///
/// Responds with `200 OK` and `{"status": "disabled"}` on success,
/// `404 Not Found` when the KMS reports no such key,
/// `500 Internal Server Error` when the operation fails, and
/// `503 Service Unavailable` when KMS is not configured.
pub async fn disable_key(
    State(state): State<AppState>,
    axum::extract::Path(key_id): axum::extract::Path<String>,
) -> Response {
    let Some(kms) = &state.kms else {
        return json_err(StatusCode::SERVICE_UNAVAILABLE, "KMS not enabled");
    };
    let found = match kms.disable_key(&key_id).await {
        Ok(flag) => flag,
        Err(e) => return json_err(StatusCode::INTERNAL_SERVER_ERROR, &e.to_string()),
    };
    if found {
        json_ok(json!({"status": "disabled"}))
    } else {
        json_err(StatusCode::NOT_FOUND, "Key not found")
    }
}
/// Encrypts caller-supplied data with a KMS key.
///
/// The JSON body must contain `KeyId` (string) and `Plaintext`
/// (standard base64). On success responds with `200 OK` and
/// `{"KeyId", "CiphertextBlob"}`, the ciphertext being base64-encoded.
///
/// Responds with `400 Bad Request` for an unreadable body, invalid JSON, a
/// missing field or invalid base64; `500 Internal Server Error` when the KMS
/// call fails; and `503 Service Unavailable` when KMS is not configured.
pub async fn encrypt(State(state): State<AppState>, body: Body) -> Response {
    let Some(kms) = &state.kms else {
        return json_err(StatusCode::SERVICE_UNAVAILABLE, "KMS not enabled");
    };
    let Ok(collected) = http_body_util::BodyExt::collect(body).await else {
        return json_err(StatusCode::BAD_REQUEST, "Invalid request body");
    };
    let raw = collected.to_bytes();
    let Ok(req) = serde_json::from_slice::<serde_json::Value>(&raw) else {
        return json_err(StatusCode::BAD_REQUEST, "Invalid JSON");
    };
    let Some(key_id) = req.get("KeyId").and_then(|field| field.as_str()) else {
        return json_err(StatusCode::BAD_REQUEST, "Missing KeyId");
    };
    let Some(plaintext_b64) = req.get("Plaintext").and_then(|field| field.as_str()) else {
        return json_err(StatusCode::BAD_REQUEST, "Missing Plaintext");
    };
    let Ok(plaintext) = B64.decode(plaintext_b64) else {
        return json_err(StatusCode::BAD_REQUEST, "Invalid base64 Plaintext");
    };
    let ciphertext = match kms.encrypt_data(key_id, &plaintext).await {
        Ok(bytes) => bytes,
        Err(e) => return json_err(StatusCode::INTERNAL_SERVER_ERROR, &e.to_string()),
    };
    json_ok(json!({
        "KeyId": key_id,
        "CiphertextBlob": B64.encode(&ciphertext),
    }))
}
/// Decrypts a ciphertext blob with a KMS key.
///
/// The JSON body must contain `KeyId` (string) and `CiphertextBlob`
/// (standard base64). On success responds with `200 OK` and
/// `{"KeyId", "Plaintext"}`, the plaintext being base64-encoded.
///
/// Responds with `400 Bad Request` for an unreadable body, invalid JSON, a
/// missing field or invalid base64; `500 Internal Server Error` when the KMS
/// call fails; and `503 Service Unavailable` when KMS is not configured.
pub async fn decrypt(State(state): State<AppState>, body: Body) -> Response {
    let Some(kms) = &state.kms else {
        return json_err(StatusCode::SERVICE_UNAVAILABLE, "KMS not enabled");
    };
    let Ok(collected) = http_body_util::BodyExt::collect(body).await else {
        return json_err(StatusCode::BAD_REQUEST, "Invalid request body");
    };
    let raw = collected.to_bytes();
    let Ok(req) = serde_json::from_slice::<serde_json::Value>(&raw) else {
        return json_err(StatusCode::BAD_REQUEST, "Invalid JSON");
    };
    let Some(key_id) = req.get("KeyId").and_then(|field| field.as_str()) else {
        return json_err(StatusCode::BAD_REQUEST, "Missing KeyId");
    };
    let Some(ct_b64) = req.get("CiphertextBlob").and_then(|field| field.as_str()) else {
        return json_err(StatusCode::BAD_REQUEST, "Missing CiphertextBlob");
    };
    let Ok(ciphertext) = B64.decode(ct_b64) else {
        return json_err(StatusCode::BAD_REQUEST, "Invalid base64");
    };
    let plaintext = match kms.decrypt_data(key_id, &ciphertext).await {
        Ok(bytes) => bytes,
        Err(e) => return json_err(StatusCode::INTERNAL_SERVER_ERROR, &e.to_string()),
    };
    json_ok(json!({
        "KeyId": key_id,
        "Plaintext": B64.encode(&plaintext),
    }))
}
/// Generates a data key under a KMS key and returns it both in plaintext and
/// wrapped form.
///
/// The JSON body must contain `KeyId` (string). `NumberOfBytes` is optional;
/// when it is absent or not an unsigned integer the key length defaults to
/// 32 bytes. The accepted range is 1-1024.
///
/// On success responds with `200 OK` and `{"KeyId", "Plaintext",
/// "CiphertextBlob"}`, both byte strings base64-encoded.
///
/// Responds with `400 Bad Request` for an unreadable body, invalid JSON, a
/// missing `KeyId` or an out-of-range `NumberOfBytes`;
/// `500 Internal Server Error` when the KMS call fails; and
/// `503 Service Unavailable` when KMS is not configured.
pub async fn generate_data_key(State(state): State<AppState>, body: Body) -> Response {
    let kms = match &state.kms {
        Some(k) => k,
        None => return json_err(StatusCode::SERVICE_UNAVAILABLE, "KMS not enabled"),
    };
    let body_bytes = match http_body_util::BodyExt::collect(body).await {
        Ok(c) => c.to_bytes(),
        Err(_) => return json_err(StatusCode::BAD_REQUEST, "Invalid request body"),
    };
    let req: serde_json::Value = match serde_json::from_slice(&body_bytes) {
        Ok(v) => v,
        Err(_) => return json_err(StatusCode::BAD_REQUEST, "Invalid JSON"),
    };
    let key_id = match req.get("KeyId").and_then(|v| v.as_str()) {
        Some(k) => k,
        None => return json_err(StatusCode::BAD_REQUEST, "Missing KeyId"),
    };
    // Validate while the value is still a u64: casting to `usize` first would
    // truncate on 32-bit targets and could let an oversized request wrap
    // around into the accepted range.
    let requested = req
        .get("NumberOfBytes")
        .and_then(|v| v.as_u64())
        .unwrap_or(32);
    if !(1..=1024).contains(&requested) {
        return json_err(StatusCode::BAD_REQUEST, "NumberOfBytes must be 1-1024");
    }
    // Lossless: `requested` is at most 1024 here.
    let num_bytes = requested as usize;
    match kms.generate_data_key(key_id, num_bytes).await {
        Ok((plaintext, wrapped)) => json_ok(json!({
            "KeyId": key_id,
            "Plaintext": B64.encode(&plaintext),
            "CiphertextBlob": B64.encode(&wrapped),
        })),
        Err(e) => json_err(StatusCode::INTERNAL_SERVER_ERROR, &e.to_string()),
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,552 @@
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use axum::body::Body;
use axum::http::{HeaderMap, HeaderName, StatusCode};
use axum::response::{IntoResponse, Response};
use base64::Engine;
use bytes::Bytes;
use crc32fast::Hasher;
use duckdb::types::ValueRef;
use duckdb::Connection;
use futures::stream;
use http_body_util::BodyExt;
use myfsio_common::error::{S3Error, S3ErrorCode};
use myfsio_storage::traits::StorageEngine;
use crate::state::AppState;
// Windows-only: asks the linker to link against the `Rstrtmgr` library.
// The extern block is intentionally empty — it declares no functions and
// exists only to carry the `#[link]` attribute.
// NOTE(review): presumably needed by a native dependency (DuckDB?) — confirm.
#[cfg(target_os = "windows")]
#[link(name = "Rstrtmgr")]
extern "system" {}
/// Size threshold in bytes (64 KiB) at which the CSV/JSON result collectors
/// cut their output buffer into a separate chunk.
const CHUNK_SIZE: usize = 65_536;
pub async fn post_select_object_content(
state: &AppState,
bucket: &str,
key: &str,
headers: &HeaderMap,
body: Body,
) -> Response {
if let Some(resp) = require_xml_content_type(headers) {
return resp;
}
let body_bytes = match body.collect().await {
Ok(collected) => collected.to_bytes(),
Err(_) => {
return s3_error_response(S3Error::new(
S3ErrorCode::MalformedXML,
"Unable to parse XML document",
));
}
};
let request = match parse_select_request(&body_bytes) {
Ok(r) => r,
Err(err) => return s3_error_response(err),
};
let object_path = match state.storage.get_object_path(bucket, key).await {
Ok(path) => path,
Err(_) => {
return s3_error_response(S3Error::new(
S3ErrorCode::NoSuchKey,
"Object not found",
));
}
};
let join_res = tokio::task::spawn_blocking(move || execute_select_query(object_path, request)).await;
let chunks = match join_res {
Ok(Ok(chunks)) => chunks,
Ok(Err(message)) => {
return s3_error_response(S3Error::new(S3ErrorCode::InvalidRequest, message));
}
Err(_) => {
return s3_error_response(S3Error::new(
S3ErrorCode::InternalError,
"SelectObjectContent execution failed",
));
}
};
let bytes_returned: usize = chunks.iter().map(|c| c.len()).sum();
let mut events: Vec<Bytes> = Vec::with_capacity(chunks.len() + 2);
for chunk in chunks {
events.push(Bytes::from(encode_select_event("Records", &chunk)));
}
let stats_payload = build_stats_xml(0, bytes_returned);
events.push(Bytes::from(encode_select_event("Stats", stats_payload.as_bytes())));
events.push(Bytes::from(encode_select_event("End", b"")));
let stream = stream::iter(events.into_iter().map(Ok::<Bytes, std::io::Error>));
let body = Body::from_stream(stream);
let mut response = (StatusCode::OK, body).into_response();
response.headers_mut().insert(
HeaderName::from_static("content-type"),
"application/octet-stream".parse().unwrap(),
);
response.headers_mut().insert(
HeaderName::from_static("x-amz-request-charged"),
"requester".parse().unwrap(),
);
response
}
/// A parsed `SelectObjectContentRequest` document.
#[derive(Clone)]
struct SelectRequest {
/// Text of the `Expression` element (never empty once parsed).
expression: String,
/// How the stored object should be read.
input_format: InputFormat,
/// How result rows should be serialized.
output_format: OutputFormat,
}
/// Input serialization chosen by the `InputSerialization` element.
#[derive(Clone)]
enum InputFormat {
/// `CSV` child element, with its reader options.
Csv(CsvInputConfig),
/// `JSON` child element, with its reader options.
Json(JsonInputConfig),
/// `Parquet` child element; carries no options.
Parquet,
}
/// Options for reading the object as CSV.
#[derive(Clone)]
struct CsvInputConfig {
/// Upper-cased `FileHeaderInfo` value; defaults to `NONE` when absent.
file_header_info: String,
/// `FieldDelimiter` value; defaults to `,` when absent.
field_delimiter: String,
/// `QuoteCharacter` value; defaults to `"` when absent.
quote_character: String,
}
/// Options for reading the object as JSON.
#[derive(Clone)]
struct JsonInputConfig {
/// Upper-cased `Type` value; defaults to `DOCUMENT` when absent.
json_type: String,
}
/// Output serialization chosen by the `OutputSerialization` element.
#[derive(Clone)]
enum OutputFormat {
/// `CSV` child element, with its writer options.
Csv(CsvOutputConfig),
/// `JSON` child element, with its writer options.
Json(JsonOutputConfig),
}
/// Options for writing result rows as CSV.
#[derive(Clone)]
struct CsvOutputConfig {
/// `FieldDelimiter` value placed between fields; defaults to `,`.
field_delimiter: String,
/// `RecordDelimiter` value appended after each row; defaults to `\n`.
record_delimiter: String,
/// `QuoteCharacter` value used to quote fields; defaults to `"`.
quote_character: String,
}
/// Options for writing result rows as JSON.
#[derive(Clone)]
struct JsonOutputConfig {
/// `RecordDelimiter` value appended after each record; defaults to `\n`.
record_delimiter: String,
}
/// Parses the XML body of a `SelectObjectContent` request.
///
/// The payload is decoded as UTF-8 lossily before XML parsing.
///
/// # Errors
/// * `MalformedXML` — the payload is not well-formed XML, or its root element
///   is not `SelectObjectContentRequest`.
/// * `InvalidRequest` — `Expression` is missing or empty, `ExpressionType` is
///   present but not `SQL` (ASCII case-insensitive), or `InputSerialization` /
///   `OutputSerialization` is missing.
/// * Any error from `parse_input_format` / `parse_output_format`.
fn parse_select_request(payload: &[u8]) -> Result<SelectRequest, S3Error> {
    let invalid = |message: &str| S3Error::new(S3ErrorCode::InvalidRequest, message);

    let xml = String::from_utf8_lossy(payload);
    let doc = match roxmltree::Document::parse(&xml) {
        Ok(parsed) => parsed,
        Err(_) => {
            return Err(S3Error::new(
                S3ErrorCode::MalformedXML,
                "Unable to parse XML document",
            ))
        }
    };
    let root = doc.root_element();
    if root.tag_name().name() != "SelectObjectContentRequest" {
        return Err(S3Error::new(
            S3ErrorCode::MalformedXML,
            "Root element must be SelectObjectContentRequest",
        ));
    }

    let expression = match child_text(&root, "Expression") {
        Some(text) if !text.is_empty() => text,
        _ => return Err(invalid("Expression is required")),
    };

    // A missing ExpressionType is treated as SQL.
    let is_sql = child_text(&root, "ExpressionType")
        .map_or(true, |kind| kind.eq_ignore_ascii_case("SQL"));
    if !is_sql {
        return Err(invalid("Only SQL expression type is supported"));
    }

    let Some(input_node) = child(&root, "InputSerialization") else {
        return Err(invalid("InputSerialization is required"));
    };
    let Some(output_node) = child(&root, "OutputSerialization") else {
        return Err(invalid("OutputSerialization is required"));
    };

    let input_format = parse_input_format(&input_node)?;
    let output_format = parse_output_format(&output_node)?;
    Ok(SelectRequest {
        expression,
        input_format,
        output_format,
    })
}
/// Reads the input format from an `InputSerialization` element.
///
/// The first matching child wins, checked in the order `CSV`, `JSON`,
/// `Parquet`. Missing CSV options default to `FileHeaderInfo = NONE`,
/// `FieldDelimiter = ,` and `QuoteCharacter = "`; a missing JSON `Type`
/// defaults to `DOCUMENT`. `FileHeaderInfo` and `Type` are upper-cased.
///
/// # Errors
/// `InvalidRequest` when none of the three child elements is present.
fn parse_input_format(node: &roxmltree::Node<'_, '_>) -> Result<InputFormat, S3Error> {
    if let Some(csv_node) = child(node, "CSV") {
        let option = |tag: &str, fallback: &str| {
            child_text(&csv_node, tag).unwrap_or_else(|| fallback.to_string())
        };
        let config = CsvInputConfig {
            file_header_info: option("FileHeaderInfo", "NONE").to_ascii_uppercase(),
            field_delimiter: option("FieldDelimiter", ","),
            quote_character: option("QuoteCharacter", "\""),
        };
        Ok(InputFormat::Csv(config))
    } else if let Some(json_node) = child(node, "JSON") {
        let json_type = child_text(&json_node, "Type")
            .unwrap_or_else(|| "DOCUMENT".to_string())
            .to_ascii_uppercase();
        Ok(InputFormat::Json(JsonInputConfig { json_type }))
    } else if child(node, "Parquet").is_some() {
        Ok(InputFormat::Parquet)
    } else {
        Err(S3Error::new(
            S3ErrorCode::InvalidRequest,
            "InputSerialization must specify CSV, JSON, or Parquet",
        ))
    }
}
/// Reads the output format from an `OutputSerialization` element.
///
/// The first matching child wins, checked in the order `CSV`, `JSON`.
/// Missing options default to `FieldDelimiter = ,`, `RecordDelimiter = \n`
/// and `QuoteCharacter = "`.
///
/// # Errors
/// `InvalidRequest` when neither child element is present.
fn parse_output_format(node: &roxmltree::Node<'_, '_>) -> Result<OutputFormat, S3Error> {
    if let Some(csv_node) = child(node, "CSV") {
        let option = |tag: &str, fallback: &str| {
            child_text(&csv_node, tag).unwrap_or_else(|| fallback.to_string())
        };
        let config = CsvOutputConfig {
            field_delimiter: option("FieldDelimiter", ","),
            record_delimiter: option("RecordDelimiter", "\n"),
            quote_character: option("QuoteCharacter", "\""),
        };
        Ok(OutputFormat::Csv(config))
    } else if let Some(json_node) = child(node, "JSON") {
        let record_delimiter =
            child_text(&json_node, "RecordDelimiter").unwrap_or_else(|| "\n".to_string());
        Ok(OutputFormat::Json(JsonOutputConfig { record_delimiter }))
    } else {
        Err(S3Error::new(
            S3ErrorCode::InvalidRequest,
            "OutputSerialization must specify CSV or JSON",
        ))
    }
}
/// Returns the first direct child element of `node` whose local tag name
/// equals `name`, or `None` if there is none. Non-element children are
/// skipped.
fn child<'a, 'input>(node: &'a roxmltree::Node<'a, 'input>, name: &str) -> Option<roxmltree::Node<'a, 'input>> {
    node.children()
        .filter(|candidate| candidate.is_element())
        .find(|element| element.tag_name().name() == name)
}
/// Returns the text of the first direct child element named `name` as an
/// owned `String`, or `None` when the element is missing or `text()` yields
/// nothing for it.
fn child_text(node: &roxmltree::Node<'_, '_>, name: &str) -> Option<String> {
    let element = child(node, name)?;
    let text = element.text()?;
    Some(text.to_string())
}
/// Runs a select expression against one stored object and returns the
/// serialized result as a list of byte chunks.
///
/// The object at `path` is loaded into an in-memory DuckDB table named
/// `data`; occurrences of `s3object` / `S3Object` in the expression are then
/// rewritten to `data` before the statement is prepared and executed.
///
/// Because the expression comes from the request, DuckDB's external access is
/// switched off once the object has been loaded, so the expression cannot
/// reach other files on the server (e.g. via `read_csv`, `read_parquet` or
/// `COPY`).
///
/// NOTE(review): the table-name rewrite is a plain, case-sensitive substring
/// replacement — other spellings (e.g. `S3OBJECT`) are not rewritten, and
/// matches inside string literals or longer identifiers are.
///
/// # Errors
/// Returns a human-readable message when the connection cannot be opened,
/// the object cannot be loaded, the connection cannot be locked down, or the
/// statement fails to prepare or execute.
fn execute_select_query(path: PathBuf, request: SelectRequest) -> Result<Vec<Vec<u8>>, String> {
    let conn = Connection::open_in_memory().map_err(|e| format!("DuckDB connection error: {}", e))?;
    load_input_table(&conn, &path, &request.input_format)?;

    // The `data` table is fully materialized at this point, so the query no
    // longer needs file-system access. Disable it before running
    // caller-supplied SQL.
    conn.execute_batch("SET enable_external_access=false")
        .map_err(|e| format!("DuckDB configuration error: {}", e))?;

    let expression = request
        .expression
        .replace("s3object", "data")
        .replace("S3Object", "data");
    let mut stmt = conn
        .prepare(&expression)
        .map_err(|e| format!("SQL execution error: {}", e))?;
    let mut rows = stmt
        .query([])
        .map_err(|e| format!("SQL execution error: {}", e))?;

    // Column metadata is read from the executed statement behind `rows`.
    let stmt_ref = rows
        .as_ref()
        .ok_or_else(|| "SQL execution error: statement metadata unavailable".to_string())?;
    let col_count = stmt_ref.column_count();
    let columns: Vec<String> = (0..col_count)
        .map(|i| {
            stmt_ref
                .column_name(i)
                .map(|s| s.to_string())
                // Fall back to a positional name when the name is unavailable.
                .unwrap_or_else(|_| format!("_{}", i))
        })
        .collect();

    match request.output_format {
        OutputFormat::Csv(cfg) => collect_csv_chunks(&mut rows, col_count, cfg),
        OutputFormat::Json(cfg) => collect_json_chunks(&mut rows, col_count, &columns, cfg),
    }
}
/// Loads the file at `path` into a new DuckDB table named `data`.
///
/// Backslashes in the path are converted to forward slashes, and every value
/// interpolated into the SQL text is passed through `sql_escape`.
///
/// * CSV — `read_csv`, with `header=true` when `FileHeaderInfo` is `USE` or
///   `IGNORE`; delimiter and quote are reduced to their first character.
/// * JSON — `read_json_auto`, with `format='newline_delimited'` when the type
///   is `LINES` and `format='array'` otherwise.
/// * Parquet — `read_parquet`.
///
/// # Errors
/// Returns `"Failed loading <CSV|JSON|Parquet> data: …"` when DuckDB rejects
/// the statement.
fn load_input_table(conn: &Connection, path: &Path, input: &InputFormat) -> Result<(), String> {
    let normalized_path = path.to_string_lossy().replace('\\', "/");
    let source = sql_escape(&normalized_path);

    let (statement, kind) = match input {
        InputFormat::Csv(cfg) => {
            let has_header = matches!(cfg.file_header_info.as_str(), "USE" | "IGNORE");
            let delimiter = normalize_single_char(&cfg.field_delimiter, ',');
            let quote = normalize_single_char(&cfg.quote_character, '"');
            let statement = format!(
                "CREATE TABLE data AS SELECT * FROM read_csv('{}', header={}, delim='{}', quote='{}')",
                source,
                has_header,
                sql_escape(&delimiter),
                sql_escape(&quote)
            );
            (statement, "CSV")
        }
        InputFormat::Json(cfg) => {
            let layout = match cfg.json_type.as_str() {
                "LINES" => "newline_delimited",
                _ => "array",
            };
            let statement = format!(
                "CREATE TABLE data AS SELECT * FROM read_json_auto('{}', format='{}')",
                source, layout
            );
            (statement, "JSON")
        }
        InputFormat::Parquet => {
            let statement = format!(
                "CREATE TABLE data AS SELECT * FROM read_parquet('{}')",
                source
            );
            (statement, "Parquet")
        }
    };

    conn.execute_batch(&statement)
        .map_err(|e| format!("Failed loading {} data: {}", kind, e))
}
/// Escapes `value` for use inside a single-quoted SQL string literal by
/// doubling every single quote.
fn sql_escape(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch == '\'' {
            escaped.push_str("''");
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
/// Returns the first character of `value` as a one-character string, or
/// `default_char` when `value` is empty. Any further characters are dropped.
fn normalize_single_char(value: &str, default_char: char) -> String {
    let chosen = match value.chars().next() {
        Some(first) => first,
        None => default_char,
    };
    String::from(chosen)
}
/// Serializes every remaining row of `rows` as CSV and returns the output
/// split into chunks of at most `CHUNK_SIZE` bytes.
///
/// * `NULL` values become empty fields.
/// * A field is quoted when it contains the field delimiter, the quote
///   string or the record delimiter; embedded quote strings are doubled.
/// * Every row, including the last, is terminated by the record delimiter.
///
/// Chunks are always cut on a UTF-8 character boundary, so a chunk may be up
/// to three bytes shorter than `CHUNK_SIZE`; concatenating the chunks yields
/// the complete output.
///
/// # Errors
/// Returns `"SQL execution error: …"` when fetching a row or reading a column
/// fails.
fn collect_csv_chunks(
    rows: &mut duckdb::Rows<'_>,
    col_count: usize,
    cfg: CsvOutputConfig,
) -> Result<Vec<Vec<u8>>, String> {
    let delimiter = cfg.field_delimiter;
    let record_delimiter = cfg.record_delimiter;
    let quote = cfg.quote_character;
    // Loop-invariant: the escaped form of one quote string.
    let doubled_quote = quote.repeat(2);
    let mut chunks: Vec<Vec<u8>> = Vec::new();
    let mut buffer = String::new();
    while let Some(row) = rows.next().map_err(|e| format!("SQL execution error: {}", e))? {
        let mut fields: Vec<String> = Vec::with_capacity(col_count);
        for i in 0..col_count {
            let value = row
                .get_ref(i)
                .map_err(|e| format!("SQL execution error: {}", e))?;
            if matches!(value, ValueRef::Null) {
                fields.push(String::new());
                continue;
            }
            let mut text = value_ref_to_string(value);
            if text.contains(&delimiter) || text.contains(&quote) || text.contains(&record_delimiter) {
                text = text.replace(&quote, &doubled_quote);
                text = format!("{}{}{}", quote, text, quote);
            }
            fields.push(text);
        }
        buffer.push_str(&fields.join(&delimiter));
        buffer.push_str(&record_delimiter);
        while buffer.len() >= CHUNK_SIZE {
            // `String::split_off` panics when the index falls inside a
            // multi-byte character, so back up to the nearest boundary.
            // A UTF-8 character is at most 4 bytes, so this moves at most
            // 3 bytes and never reaches 0 for the 64 KiB chunk size.
            let mut cut = CHUNK_SIZE;
            while !buffer.is_char_boundary(cut) {
                cut -= 1;
            }
            let rest = buffer.split_off(cut);
            chunks.push(std::mem::replace(&mut buffer, rest).into_bytes());
        }
    }
    if !buffer.is_empty() {
        chunks.push(buffer.into_bytes());
    }
    Ok(chunks)
}
/// Serializes every remaining row of `rows` as one JSON object per record and
/// returns the output split into chunks of at most `CHUNK_SIZE` bytes.
///
/// Object keys come from `columns`; a column without a name entry is keyed
/// `_<index>`. Every record, including the last, is followed by the record
/// delimiter.
///
/// Chunks are always cut on a UTF-8 character boundary, so a chunk may be up
/// to three bytes shorter than `CHUNK_SIZE`; concatenating the chunks yields
/// the complete output.
///
/// NOTE(review): records are built in a `HashMap`, so the key order inside
/// each JSON object is unspecified and columns sharing a name collapse into
/// one entry.
///
/// # Errors
/// Returns `"SQL execution error: …"` when fetching a row or reading a column
/// fails, and `"JSON output encoding failed: …"` when a record cannot be
/// serialized.
fn collect_json_chunks(
    rows: &mut duckdb::Rows<'_>,
    col_count: usize,
    columns: &[String],
    cfg: JsonOutputConfig,
) -> Result<Vec<Vec<u8>>, String> {
    let record_delimiter = cfg.record_delimiter;
    let mut chunks: Vec<Vec<u8>> = Vec::new();
    let mut buffer = String::new();
    while let Some(row) = rows.next().map_err(|e| format!("SQL execution error: {}", e))? {
        let mut record: HashMap<String, serde_json::Value> = HashMap::with_capacity(col_count);
        for i in 0..col_count {
            let value = row
                .get_ref(i)
                .map_err(|e| format!("SQL execution error: {}", e))?;
            let key = columns
                .get(i)
                .cloned()
                .unwrap_or_else(|| format!("_{}", i));
            record.insert(key, value_ref_to_json(value));
        }
        let line = serde_json::to_string(&record)
            .map_err(|e| format!("JSON output encoding failed: {}", e))?;
        buffer.push_str(&line);
        buffer.push_str(&record_delimiter);
        while buffer.len() >= CHUNK_SIZE {
            // `String::split_off` panics when the index falls inside a
            // multi-byte character, so back up to the nearest boundary.
            // A UTF-8 character is at most 4 bytes, so this moves at most
            // 3 bytes and never reaches 0 for the 64 KiB chunk size.
            let mut cut = CHUNK_SIZE;
            while !buffer.is_char_boundary(cut) {
                cut -= 1;
            }
            let rest = buffer.split_off(cut);
            chunks.push(std::mem::replace(&mut buffer, rest).into_bytes());
        }
    }
    if !buffer.is_empty() {
        chunks.push(buffer.into_bytes());
    }
    Ok(chunks)
}
/// Renders a single column value as text.
///
/// NULL becomes the empty string, booleans and numbers use their `to_string`
/// form, text is decoded as UTF-8 (lossily), blobs are base64-encoded with the
/// standard alphabet, and any other variant falls back to its `Debug` output.
fn value_ref_to_string(value: ValueRef<'_>) -> String {
    match value {
        ValueRef::Null => String::new(),
        ValueRef::Boolean(flag) => flag.to_string(),
        // Signed integers.
        ValueRef::TinyInt(n) => n.to_string(),
        ValueRef::SmallInt(n) => n.to_string(),
        ValueRef::Int(n) => n.to_string(),
        ValueRef::BigInt(n) => n.to_string(),
        // Unsigned integers.
        ValueRef::UTinyInt(n) => n.to_string(),
        ValueRef::USmallInt(n) => n.to_string(),
        ValueRef::UInt(n) => n.to_string(),
        ValueRef::UBigInt(n) => n.to_string(),
        // Floating point and decimal.
        ValueRef::Float(x) => x.to_string(),
        ValueRef::Double(x) => x.to_string(),
        ValueRef::Decimal(d) => d.to_string(),
        ValueRef::Text(bytes) => String::from_utf8_lossy(bytes).into_owned(),
        ValueRef::Blob(bytes) => base64::engine::general_purpose::STANDARD.encode(bytes),
        other => format!("{:?}", other),
    }
}
fn value_ref_to_json(value: ValueRef<'_>) -> serde_json::Value {
match value {
ValueRef::Null => serde_json::Value::Null,
ValueRef::Boolean(v) => serde_json::Value::Bool(v),
ValueRef::TinyInt(v) => serde_json::json!(v),
ValueRef::SmallInt(v) => serde_json::json!(v),
ValueRef::Int(v) => serde_json::json!(v),
ValueRef::BigInt(v) => serde_json::json!(v),
ValueRef::UTinyInt(v) => serde_json::json!(v),
ValueRef::USmallInt(v) => serde_json::json!(v),
ValueRef::UInt(v) => serde_json::json!(v),
ValueRef::UBigInt(v) => serde_json::json!(v),
ValueRef::Float(v) => serde_json::json!(v),
ValueRef::Double(v) => serde_json::json!(v),
ValueRef::Decimal(v) => serde_json::Value::String(v.to_string()),
ValueRef::Text(v) => serde_json::Value::String(String::from_utf8_lossy(v).into_owned()),
ValueRef::Blob(v) => serde_json::Value::String(base64::engine::general_purpose::STANDARD.encode(v)),
_ => serde_json::Value::String(format!("{:?}", value)),
}
}
/// Validates the request `Content-Type` for XML bodies.
///
/// Returns `None` when the header is absent, unreadable as text, blank, or
/// starts (case-insensitively) with `application/xml` or `text/xml`.
/// Otherwise returns an `InvalidRequest` error response.
fn require_xml_content_type(headers: &HeaderMap) -> Option<Response> {
    let content_type = headers
        .get("content-type")
        .and_then(|raw| raw.to_str().ok())
        .map(str::trim)
        .unwrap_or("");
    let normalized = content_type.to_ascii_lowercase();
    let acceptable = content_type.is_empty()
        || ["application/xml", "text/xml"]
            .iter()
            .any(|prefix| normalized.starts_with(prefix));
    if acceptable {
        None
    } else {
        Some(s3_error_response(S3Error::new(
            S3ErrorCode::InvalidRequest,
            "Content-Type must be application/xml or text/xml",
        )))
    }
}
/// Turns an `S3Error` into an XML HTTP response.
///
/// The status comes from `err.http_status()` (500 if it is not a valid status
/// code), an empty resource is replaced by `/`, and a fresh UUID is attached
/// as the request id.
fn s3_error_response(err: S3Error) -> Response {
    let status = match StatusCode::from_u16(err.http_status()) {
        Ok(code) => code,
        Err(_) => StatusCode::INTERNAL_SERVER_ERROR,
    };
    let resource = match err.resource.is_empty() {
        true => "/".to_string(),
        false => err.resource.clone(),
    };
    let request_id = uuid::Uuid::new_v4().simple().to_string();
    let xml = err
        .with_resource(resource)
        .with_request_id(request_id)
        .to_xml();
    let headers = [("content-type", "application/xml")];
    (status, headers, xml).into_response()
}
/// Builds the `<Stats>` XML payload for a select response.
///
/// `BytesProcessed` is reported with the same value as `BytesScanned`.
fn build_stats_xml(bytes_scanned: usize, bytes_returned: usize) -> String {
    let counters = [
        ("BytesScanned", bytes_scanned),
        ("BytesProcessed", bytes_scanned),
        ("BytesReturned", bytes_returned),
    ];
    let mut xml = String::from("<Stats>");
    for (tag, count) in counters.iter() {
        xml.push_str(&format!("<{0}>{1}</{0}>", tag, count));
    }
    xml.push_str("</Stats>");
    xml
}
/// Frames `payload` as one event message for a select response.
///
/// Layout produced, in order: total length (u32 BE), headers length (u32 BE),
/// CRC of those 8 bytes (u32 BE), header block, payload, CRC of everything
/// before it (u32 BE). The header block carries `:event-type`, a
/// `:content-type` for `Records` and `Stats` events only, and
/// `:message-type` = `event`.
fn encode_select_event(event_type: &str, payload: &[u8]) -> Vec<u8> {
    let content_type = match event_type {
        "Records" => Some("application/octet-stream"),
        "Stats" => Some("text/xml"),
        _ => None,
    };
    let mut header_block = encode_select_header(":event-type", event_type);
    if let Some(content_type) = content_type {
        header_block.extend(encode_select_header(":content-type", content_type));
    }
    header_block.extend(encode_select_header(":message-type", "event"));

    // Two u32 length fields + prelude CRC + headers + payload + message CRC.
    let total_len = 4 + 4 + 4 + header_block.len() + payload.len() + 4;
    let mut message = Vec::with_capacity(total_len);
    message.extend((total_len as u32).to_be_bytes());
    message.extend((header_block.len() as u32).to_be_bytes());
    // At this point `message` holds exactly the 8-byte prelude.
    let prelude_crc = crc32(&message);
    message.extend(prelude_crc.to_be_bytes());
    message.extend(header_block);
    message.extend(payload);
    let message_crc = crc32(&message);
    message.extend(message_crc.to_be_bytes());
    message
}
/// Encodes one event-message header with a string value.
///
/// Layout: name length (1 byte), name bytes, value type byte `7`, value
/// length (u16 BE), value bytes. Lengths are cast with `as`, so names longer
/// than 255 bytes or values longer than 65535 bytes are truncated in the
/// length field.
fn encode_select_header(name: &str, value: &str) -> Vec<u8> {
    const VALUE_TYPE: u8 = 7;
    let name_raw = name.as_bytes();
    let value_raw = value.as_bytes();
    let mut encoded = Vec::with_capacity(1 + name_raw.len() + 1 + 2 + value_raw.len());
    encoded.push(name_raw.len() as u8);
    encoded.extend_from_slice(name_raw);
    encoded.push(VALUE_TYPE);
    encoded.extend_from_slice(&(value_raw.len() as u16).to_be_bytes());
    encoded.extend_from_slice(value_raw);
    encoded
}
/// Feeds `data` through a fresh `Hasher` and returns the finalized 32-bit
/// checksum.
// NOTE(review): `Hasher` is presumably `crc32fast::Hasher` — confirm against the file's imports.
fn crc32(data: &[u8]) -> u32 {
    let mut digest = Hasher::new();
    digest.update(data);
    digest.finalize()
}

View File

@@ -0,0 +1,73 @@
pub mod config;
pub mod handlers;
pub mod middleware;
pub mod services;
pub mod state;
use axum::Router;
/// Value placed in the `server` response header: `MyFSIO-Rust/` followed by
/// the crate version taken from `CARGO_PKG_VERSION` at compile time.
pub const SERVER_HEADER: &str = concat!("MyFSIO-Rust/", env!("CARGO_PKG_VERSION"));
pub fn create_router(state: state::AppState) -> Router {
let mut router = Router::new()
.route("/", axum::routing::get(handlers::list_buckets))
.route(
"/{bucket}",
axum::routing::put(handlers::create_bucket)
.get(handlers::get_bucket)
.delete(handlers::delete_bucket)
.head(handlers::head_bucket)
.post(handlers::post_bucket),
)
.route(
"/{bucket}/{*key}",
axum::routing::put(handlers::put_object)
.get(handlers::get_object)
.delete(handlers::delete_object)
.head(handlers::head_object)
.post(handlers::post_object),
);
if state.config.kms_enabled {
router = router
.route("/kms/keys", axum::routing::get(handlers::kms::list_keys).post(handlers::kms::create_key))
.route("/kms/keys/{key_id}", axum::routing::get(handlers::kms::get_key).delete(handlers::kms::delete_key))
.route("/kms/keys/{key_id}/enable", axum::routing::post(handlers::kms::enable_key))
.route("/kms/keys/{key_id}/disable", axum::routing::post(handlers::kms::disable_key))
.route("/kms/encrypt", axum::routing::post(handlers::kms::encrypt))
.route("/kms/decrypt", axum::routing::post(handlers::kms::decrypt))
.route("/kms/generate-data-key", axum::routing::post(handlers::kms::generate_data_key));
}
router = router
.route("/admin/site/local", axum::routing::get(handlers::admin::get_local_site).put(handlers::admin::update_local_site))
.route("/admin/site/all", axum::routing::get(handlers::admin::list_all_sites))
.route("/admin/site/peers", axum::routing::post(handlers::admin::register_peer_site))
.route("/admin/site/peers/{site_id}", axum::routing::get(handlers::admin::get_peer_site).put(handlers::admin::update_peer_site).delete(handlers::admin::delete_peer_site))
.route("/admin/site/peers/{site_id}/health", axum::routing::post(handlers::admin::check_peer_health))
.route("/admin/site/topology", axum::routing::get(handlers::admin::get_topology))
.route("/admin/site/peers/{site_id}/bidirectional-status", axum::routing::get(handlers::admin::check_bidirectional_status))
.route("/admin/iam/users", axum::routing::get(handlers::admin::iam_list_users))
.route("/admin/iam/users/{identifier}", axum::routing::get(handlers::admin::iam_get_user))
.route("/admin/iam/users/{identifier}/policies", axum::routing::get(handlers::admin::iam_get_user_policies))
.route("/admin/iam/users/{identifier}/access-keys", axum::routing::post(handlers::admin::iam_create_access_key))
.route("/admin/iam/users/{identifier}/access-keys/{access_key}", axum::routing::delete(handlers::admin::iam_delete_access_key))
.route("/admin/iam/users/{identifier}/disable", axum::routing::post(handlers::admin::iam_disable_user))
.route("/admin/iam/users/{identifier}/enable", axum::routing::post(handlers::admin::iam_enable_user))
.route("/admin/website-domains", axum::routing::get(handlers::admin::list_website_domains).post(handlers::admin::create_website_domain))
.route("/admin/website-domains/{domain}", axum::routing::get(handlers::admin::get_website_domain).put(handlers::admin::update_website_domain).delete(handlers::admin::delete_website_domain))
.route("/admin/gc/status", axum::routing::get(handlers::admin::gc_status))
.route("/admin/gc/run", axum::routing::post(handlers::admin::gc_run))
.route("/admin/gc/history", axum::routing::get(handlers::admin::gc_history))
.route("/admin/integrity/status", axum::routing::get(handlers::admin::integrity_status))
.route("/admin/integrity/run", axum::routing::post(handlers::admin::integrity_run))
.route("/admin/integrity/history", axum::routing::get(handlers::admin::integrity_history));
router
.layer(axum::middleware::from_fn_with_state(
state.clone(),
middleware::auth_layer,
))
.layer(axum::middleware::from_fn(middleware::server_header))
.with_state(state)
}

View File

@@ -0,0 +1,97 @@
use myfsio_server::config::ServerConfig;
use myfsio_server::state::AppState;
#[tokio::main]
async fn main() {
tracing_subscriber::fmt::init();
let config = ServerConfig::from_env();
let bind_addr = config.bind_addr;
tracing::info!("MyFSIO Rust Engine starting on {}", bind_addr);
tracing::info!("Storage root: {}", config.storage_root.display());
tracing::info!("Region: {}", config.region);
tracing::info!(
"Encryption: {}, KMS: {}, GC: {}, Lifecycle: {}, Integrity: {}, Metrics: {}",
config.encryption_enabled,
config.kms_enabled,
config.gc_enabled,
config.lifecycle_enabled,
config.integrity_enabled,
config.metrics_enabled
);
let state = if config.encryption_enabled || config.kms_enabled {
AppState::new_with_encryption(config.clone()).await
} else {
AppState::new(config.clone())
};
let mut bg_handles: Vec<tokio::task::JoinHandle<()>> = Vec::new();
if let Some(ref gc) = state.gc {
bg_handles.push(gc.clone().start_background());
tracing::info!("GC background service started");
}
if let Some(ref integrity) = state.integrity {
bg_handles.push(integrity.clone().start_background());
tracing::info!("Integrity checker background service started");
}
if let Some(ref metrics) = state.metrics {
bg_handles.push(metrics.clone().start_background());
tracing::info!("Metrics collector background service started");
}
if config.lifecycle_enabled {
let lifecycle = std::sync::Arc::new(
myfsio_server::services::lifecycle::LifecycleService::new(
state.storage.clone(),
myfsio_server::services::lifecycle::LifecycleConfig::default(),
),
);
bg_handles.push(lifecycle.start_background());
tracing::info!("Lifecycle manager background service started");
}
let app = myfsio_server::create_router(state);
let listener = match tokio::net::TcpListener::bind(bind_addr).await {
Ok(listener) => listener,
Err(err) => {
if err.kind() == std::io::ErrorKind::AddrInUse {
tracing::error!("Port already in use: {}", bind_addr);
} else {
tracing::error!("Failed to bind {}: {}", bind_addr, err);
}
for handle in bg_handles {
handle.abort();
}
std::process::exit(1);
}
};
tracing::info!("Listening on {}", bind_addr);
if let Err(err) = axum::serve(listener, app)
.with_graceful_shutdown(shutdown_signal())
.await
{
tracing::error!("Server exited with error: {}", err);
for handle in bg_handles {
handle.abort();
}
std::process::exit(1);
}
for handle in bg_handles {
handle.abort();
}
}
/// Resolves when the process is asked to stop.
///
/// Waits for Ctrl+C (SIGINT) and, on Unix, also for SIGTERM, which is the
/// signal sent by `docker stop` and most service managers. Without the
/// SIGTERM branch a containerised server never starts its graceful shutdown.
///
/// # Panics
/// Panics if a signal handler cannot be installed.
async fn shutdown_signal() {
    let ctrl_c = async {
        tokio::signal::ctrl_c()
            .await
            .expect("Failed to listen for Ctrl+C");
    };

    #[cfg(unix)]
    let terminate = async {
        tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())
            .expect("Failed to listen for SIGTERM")
            .recv()
            .await;
    };

    // No SIGTERM on non-Unix targets: this branch simply never completes.
    #[cfg(not(unix))]
    let terminate = std::future::pending::<()>();

    tokio::select! {
        _ = ctrl_c => {},
        _ = terminate => {},
    }
    tracing::info!("Shutdown signal received");
}

View File

@@ -0,0 +1,569 @@
use axum::extract::{Request, State};
use axum::http::{Method, StatusCode};
use axum::middleware::Next;
use axum::response::{IntoResponse, Response};
use chrono::{NaiveDateTime, Utc};
use myfsio_auth::sigv4;
use myfsio_common::error::{S3Error, S3ErrorCode};
use myfsio_common::types::Principal;
use crate::state::AppState;
/// Authentication and authorization middleware.
///
/// Authenticates the request with `try_auth`, then checks it with
/// `authorize_request`. On success the `Principal` is stored in the request
/// extensions and the request is passed on. Missing credentials, failed
/// authentication and failed authorization all end the request with an XML
/// error response whose resource is the request path.
pub async fn auth_layer(
    State(state): State<AppState>,
    mut req: Request,
    next: Next,
) -> Response {
    let path = req.uri().path().to_owned();

    let principal = match try_auth(&state, &req) {
        AuthResult::Ok(principal) => principal,
        AuthResult::Denied(err) => return error_response(err, &path),
        AuthResult::NoAuth => {
            let missing = S3Error::new(S3ErrorCode::AccessDenied, "Missing credentials");
            return error_response(missing, &path);
        }
    };

    if let Err(err) = authorize_request(&state, &principal, &req) {
        return error_response(err, &path);
    }

    req.extensions_mut().insert(principal);
    next.run(req).await
}
/// Outcome of attempting to authenticate a request (see `try_auth`).
enum AuthResult {
    /// Credentials were presented and verified; carries the resolved principal.
    Ok(Principal),
    /// Credentials were presented but rejected; carries the error to return.
    Denied(S3Error),
    /// The request carried no recognised credentials at all.
    NoAuth,
}
fn authorize_request(state: &AppState, principal: &Principal, req: &Request) -> Result<(), S3Error> {
let path = req.uri().path();
if path == "/" {
if state.iam.authorize(principal, None, "list", None) {
return Ok(());
}
return Err(S3Error::new(S3ErrorCode::AccessDenied, "Access denied"));
}
if path.starts_with("/admin/") || path.starts_with("/kms/") {
return Ok(());
}
let mut segments = path.trim_start_matches('/').split('/').filter(|s| !s.is_empty());
let bucket = match segments.next() {
Some(b) => b,
None => {
return Err(S3Error::new(S3ErrorCode::AccessDenied, "Access denied"));
}
};
let remaining: Vec<&str> = segments.collect();
let query = req.uri().query().unwrap_or("");
if remaining.is_empty() {
let action = resolve_bucket_action(req.method(), query);
if state.iam.authorize(principal, Some(bucket), action, None) {
return Ok(());
}
return Err(S3Error::new(S3ErrorCode::AccessDenied, "Access denied"));
}
let object_key = remaining.join("/");
if req.method() == Method::PUT {
if let Some(copy_source) = req
.headers()
.get("x-amz-copy-source")
.and_then(|v| v.to_str().ok())
{
let source = copy_source.strip_prefix('/').unwrap_or(copy_source);
if let Some((src_bucket, src_key)) = source.split_once('/') {
let source_allowed =
state.iam.authorize(principal, Some(src_bucket), "read", Some(src_key));
let dest_allowed =
state.iam.authorize(principal, Some(bucket), "write", Some(&object_key));
if source_allowed && dest_allowed {
return Ok(());
}
return Err(S3Error::new(S3ErrorCode::AccessDenied, "Access denied"));
}
}
}
let action = resolve_object_action(req.method(), query);
if state
.iam
.authorize(principal, Some(bucket), action, Some(&object_key))
{
return Ok(());
}
Err(S3Error::new(S3ErrorCode::AccessDenied, "Access denied"))
}
/// Maps a bucket-level request to the IAM action name it requires.
///
/// Sub-resource query keys take precedence and are checked in the order
/// listed in the table; the first one present decides the action. When none
/// is present the HTTP method decides.
fn resolve_bucket_action(method: &Method, query: &str) -> &'static str {
    // (query key, action) pairs, in precedence order.
    const SUBRESOURCE_ACTIONS: &[(&str, &str)] = &[
        ("versioning", "versioning"),
        ("tagging", "tagging"),
        ("cors", "cors"),
        ("location", "list"),
        ("encryption", "encryption"),
        ("lifecycle", "lifecycle"),
        ("acl", "share"),
        ("policy", "policy"),
        ("policyStatus", "policy"),
        ("replication", "replication"),
        ("quota", "quota"),
        ("website", "website"),
        ("object-lock", "object_lock"),
        ("notification", "notification"),
        ("logging", "logging"),
        ("versions", "list"),
        ("uploads", "list"),
        ("delete", "delete"),
    ];

    for &(key, action) in SUBRESOURCE_ACTIONS {
        if has_query_key(query, key) {
            return action;
        }
    }

    match *method {
        Method::HEAD => "read",
        Method::PUT => "create_bucket",
        Method::DELETE => "delete_bucket",
        Method::POST => "write",
        // GET and any other method are treated as a listing.
        _ => "list",
    }
}
/// Maps an object-level request to the IAM action name it requires.
///
/// Sub-resource query keys are checked first, in a fixed order (`tagging`,
/// `acl`, `retention`/`legal-hold`, `attributes`, `uploads`/`uploadId`,
/// `select`); otherwise the HTTP method decides.
fn resolve_object_action(method: &Method, query: &str) -> &'static str {
    let has = |key: &str| has_query_key(query, key);
    // Sub-resources that are readable with GET and writable with anything else.
    let get_reads_else_writes = if *method == Method::GET { "read" } else { "write" };

    if has("tagging") {
        return get_reads_else_writes;
    }
    if has("acl") {
        return get_reads_else_writes;
    }
    if has("retention") || has("legal-hold") {
        return "object_lock";
    }
    if has("attributes") {
        return "read";
    }
    if has("uploads") || has("uploadId") {
        return get_reads_else_writes;
    }
    if has("select") {
        return "read";
    }

    match *method {
        Method::PUT | Method::POST => "write",
        Method::DELETE => "delete",
        // GET, HEAD and any other method.
        _ => "read",
    }
}
/// Reports whether the raw query string contains `key`, either bare (`key`)
/// or with a value (`key=...`). Matching is exact and case-sensitive; no
/// percent-decoding is applied.
fn has_query_key(query: &str, key: &str) -> bool {
    query.split('&').any(|part| {
        if part.is_empty() {
            return false;
        }
        match part.strip_prefix(key) {
            // Either nothing follows the key, or its value does.
            Some(rest) => rest.is_empty() || rest.starts_with('='),
            None => false,
        }
    })
}
/// Attempts to authenticate the request using, in order:
///
/// 1. an `Authorization` header starting with `AWS4-HMAC-SHA256 `,
/// 2. a query string containing `X-Amz-Algorithm=AWS4-HMAC-SHA256`,
/// 3. the `x-access-key` / `x-secret-key` header pair.
///
/// Returns `AuthResult::NoAuth` when none of these is present.
fn try_auth(state: &AppState, req: &Request) -> AuthResult {
    let headers = req.headers();

    let sigv4_header = headers
        .get("authorization")
        .and_then(|raw| raw.to_str().ok())
        .filter(|text| text.starts_with("AWS4-HMAC-SHA256 "));
    if let Some(auth_str) = sigv4_header {
        return verify_sigv4_header(state, req, auth_str);
    }

    let is_presigned = req
        .uri()
        .query()
        .unwrap_or("")
        .contains("X-Amz-Algorithm=AWS4-HMAC-SHA256");
    if is_presigned {
        return verify_sigv4_query(state, req);
    }

    let access_key = headers.get("x-access-key").and_then(|v| v.to_str().ok());
    let secret_key = headers.get("x-secret-key").and_then(|v| v.to_str().ok());
    match (access_key, secret_key) {
        (Some(ak), Some(sk)) => match state.iam.authenticate(ak, sk) {
            Some(principal) => AuthResult::Ok(principal),
            None => AuthResult::Denied(S3Error::from_code(S3ErrorCode::SignatureDoesNotMatch)),
        },
        _ => AuthResult::NoAuth,
    }
}
/// Verifies a request signed with a SigV4 `Authorization` header.
///
/// `auth_str` is the full header value. The three components after the
/// algorithm name (`Credential=`, `SignedHeaders=`, `Signature=`) are expected
/// in that order and are separated by commas; whitespace around each
/// component is ignored, so both `a, b, c` and `a,b,c` are accepted.
///
/// The request timestamp is taken from `x-amz-date`, falling back to `date`,
/// and must pass `check_timestamp_freshness`. The payload hash is taken from
/// `x-amz-content-sha256`, defaulting to `UNSIGNED-PAYLOAD`. Signed headers
/// missing from the request contribute an empty value.
///
/// Returns `AuthResult::Ok` with the principal for the access key, or
/// `AuthResult::Denied` describing the first failure.
fn verify_sigv4_header(state: &AppState, req: &Request, auth_str: &str) -> AuthResult {
    let malformed_header = || {
        AuthResult::Denied(S3Error::new(
            S3ErrorCode::InvalidArgument,
            "Malformed Authorization header",
        ))
    };
    // The caller has already checked the prefix; fail cleanly instead of
    // panicking if this is ever called without it.
    let components = match auth_str.strip_prefix("AWS4-HMAC-SHA256 ") {
        Some(rest) => rest,
        None => return malformed_header(),
    };
    // Component values never contain commas (credential uses '/', signed
    // headers ';', signature is hex), so a plain comma split is safe.
    let parts: Vec<&str> = components.split(',').map(str::trim).collect();
    if parts.len() != 3 {
        return malformed_header();
    }
    let credential = parts[0].strip_prefix("Credential=").unwrap_or("");
    let signed_headers_str = parts[1].strip_prefix("SignedHeaders=").unwrap_or("");
    let provided_signature = parts[2].strip_prefix("Signature=").unwrap_or("");
    // Credential scope: <access key>/<date>/<region>/<service>/<terminator>.
    let cred_parts: Vec<&str> = credential.split('/').collect();
    if cred_parts.len() != 5 {
        return AuthResult::Denied(
            S3Error::new(S3ErrorCode::InvalidArgument, "Malformed credential"),
        );
    }
    let access_key = cred_parts[0];
    let date_stamp = cred_parts[1];
    let region = cred_parts[2];
    let service = cred_parts[3];
    let amz_date = req
        .headers()
        .get("x-amz-date")
        .or_else(|| req.headers().get("date"))
        .and_then(|v| v.to_str().ok())
        .unwrap_or("");
    if amz_date.is_empty() {
        return AuthResult::Denied(
            S3Error::new(S3ErrorCode::AccessDenied, "Missing Date header"),
        );
    }
    if let Some(err) = check_timestamp_freshness(amz_date, state.config.sigv4_timestamp_tolerance_secs) {
        return AuthResult::Denied(err);
    }
    let secret_key = match state.iam.get_secret_key(access_key) {
        Some(sk) => sk,
        None => {
            return AuthResult::Denied(
                S3Error::from_code(S3ErrorCode::InvalidAccessKeyId),
            );
        }
    };
    let method = req.method().as_str();
    let canonical_uri = req.uri().path();
    let query_params = parse_query_params(req.uri().query().unwrap_or(""));
    let payload_hash = req
        .headers()
        .get("x-amz-content-sha256")
        .and_then(|v| v.to_str().ok())
        .unwrap_or("UNSIGNED-PAYLOAD");
    // Collect the value of every signed header, in the order the client listed them.
    let header_values: Vec<(String, String)> = signed_headers_str
        .split(';')
        .map(|name| {
            let value = req
                .headers()
                .get(name)
                .and_then(|v| v.to_str().ok())
                .unwrap_or("");
            (name.to_string(), value.to_string())
        })
        .collect();
    let verified = sigv4::verify_sigv4_signature(
        method,
        canonical_uri,
        &query_params,
        signed_headers_str,
        &header_values,
        payload_hash,
        amz_date,
        date_stamp,
        region,
        service,
        &secret_key,
        provided_signature,
    );
    if !verified {
        return AuthResult::Denied(
            S3Error::from_code(S3ErrorCode::SignatureDoesNotMatch),
        );
    }
    match state.iam.get_principal(access_key) {
        Some(p) => AuthResult::Ok(p),
        None => AuthResult::Denied(
            S3Error::from_code(S3ErrorCode::InvalidAccessKeyId),
        ),
    }
}
fn verify_sigv4_query(state: &AppState, req: &Request) -> AuthResult {
let query = req.uri().query().unwrap_or("");
let params = parse_query_params(query);
let param_map: std::collections::HashMap<&str, &str> = params
.iter()
.map(|(k, v)| (k.as_str(), v.as_str()))
.collect();
let credential = match param_map.get("X-Amz-Credential") {
Some(c) => *c,
None => {
return AuthResult::Denied(
S3Error::new(S3ErrorCode::InvalidArgument, "Missing X-Amz-Credential"),
);
}
};
let signed_headers_str = param_map
.get("X-Amz-SignedHeaders")
.copied()
.unwrap_or("host");
let provided_signature = match param_map.get("X-Amz-Signature") {
Some(s) => *s,
None => {
return AuthResult::Denied(
S3Error::new(S3ErrorCode::InvalidArgument, "Missing X-Amz-Signature"),
);
}
};
let amz_date = match param_map.get("X-Amz-Date") {
Some(d) => *d,
None => {
return AuthResult::Denied(
S3Error::new(S3ErrorCode::InvalidArgument, "Missing X-Amz-Date"),
);
}
};
let expires_str = match param_map.get("X-Amz-Expires") {
Some(e) => *e,
None => {
return AuthResult::Denied(
S3Error::new(S3ErrorCode::InvalidArgument, "Missing X-Amz-Expires"),
);
}
};
let cred_parts: Vec<&str> = credential.split('/').collect();
if cred_parts.len() != 5 {
return AuthResult::Denied(
S3Error::new(S3ErrorCode::InvalidArgument, "Malformed credential"),
);
}
let access_key = cred_parts[0];
let date_stamp = cred_parts[1];
let region = cred_parts[2];
let service = cred_parts[3];
let expires: u64 = match expires_str.parse() {
Ok(e) => e,
Err(_) => {
return AuthResult::Denied(
S3Error::new(S3ErrorCode::InvalidArgument, "Invalid X-Amz-Expires"),
);
}
};
if expires < state.config.presigned_url_min_expiry
|| expires > state.config.presigned_url_max_expiry
{
return AuthResult::Denied(
S3Error::new(S3ErrorCode::InvalidArgument, "X-Amz-Expires out of range"),
);
}
if let Ok(request_time) =
NaiveDateTime::parse_from_str(amz_date, "%Y%m%dT%H%M%SZ")
{
let request_utc = request_time.and_utc();
let now = Utc::now();
let elapsed = (now - request_utc).num_seconds();
if elapsed > expires as i64 {
return AuthResult::Denied(
S3Error::new(S3ErrorCode::AccessDenied, "Request has expired"),
);
}
if elapsed < -(state.config.sigv4_timestamp_tolerance_secs as i64) {
return AuthResult::Denied(
S3Error::new(S3ErrorCode::AccessDenied, "Request is too far in the future"),
);
}
}
let secret_key = match state.iam.get_secret_key(access_key) {
Some(sk) => sk,
None => {
return AuthResult::Denied(
S3Error::from_code(S3ErrorCode::InvalidAccessKeyId),
);
}
};
let method = req.method().as_str();
let canonical_uri = req.uri().path();
let query_params_no_sig: Vec<(String, String)> = params
.iter()
.filter(|(k, _)| k != "X-Amz-Signature")
.cloned()
.collect();
let payload_hash = "UNSIGNED-PAYLOAD";
let signed_headers: Vec<&str> = signed_headers_str.split(';').collect();
let header_values: Vec<(String, String)> = signed_headers
.iter()
.map(|&name| {
let value = req
.headers()
.get(name)
.and_then(|v| v.to_str().ok())
.unwrap_or("");
(name.to_string(), value.to_string())
})
.collect();
let verified = sigv4::verify_sigv4_signature(
method,
canonical_uri,
&query_params_no_sig,
signed_headers_str,
&header_values,
payload_hash,
amz_date,
date_stamp,
region,
service,
&secret_key,
provided_signature,
);
if !verified {
return AuthResult::Denied(
S3Error::from_code(S3ErrorCode::SignatureDoesNotMatch),
);
}
match state.iam.get_principal(access_key) {
Some(p) => AuthResult::Ok(p),
None => AuthResult::Denied(
S3Error::from_code(S3ErrorCode::InvalidAccessKeyId),
),
}
}
/// Checks that a request timestamp is within `tolerance_secs` of the current
/// time, in either direction.
///
/// `amz_date` must be in `YYYYMMDDTHHMMSSZ` form. Returns `None` when the
/// timestamp is acceptable and `Some(AccessDenied)` when it is outside the
/// window or cannot be parsed. An unparseable value is rejected rather than
/// ignored, so a malformed timestamp cannot bypass the replay window.
fn check_timestamp_freshness(amz_date: &str, tolerance_secs: u64) -> Option<S3Error> {
    let request_utc = match NaiveDateTime::parse_from_str(amz_date, "%Y%m%dT%H%M%SZ") {
        Ok(request_time) => request_time.and_utc(),
        Err(_) => {
            return Some(S3Error::new(
                S3ErrorCode::AccessDenied,
                "Request timestamp is not in YYYYMMDD'T'HHMMSS'Z' format",
            ));
        }
    };
    let skew_secs = (Utc::now() - request_utc).num_seconds().unsigned_abs();
    if skew_secs > tolerance_secs {
        return Some(S3Error::new(
            S3ErrorCode::AccessDenied,
            "Request timestamp too old or too far in the future",
        ));
    }
    None
}
/// Splits a raw query string into percent-decoded `(key, value)` pairs,
/// preserving their order.
///
/// Pairs are separated by `&`; each is split at its first `=`, and a pair
/// without `=` gets an empty value. An empty query yields an empty list.
fn parse_query_params(query: &str) -> Vec<(String, String)> {
    let mut params = Vec::new();
    if query.is_empty() {
        return params;
    }
    for pair in query.split('&') {
        let (key, value) = match pair.split_once('=') {
            Some(key_value) => key_value,
            None => (pair, ""),
        };
        params.push((urlencoding_decode(key), urlencoding_decode(value)));
    }
    params
}
/// Percent-decodes `s`, replacing any invalid UTF-8 in the result with the
/// Unicode replacement character.
fn urlencoding_decode(s: &str) -> String {
    let decoded = percent_encoding::percent_decode_str(s).decode_utf8_lossy();
    decoded.into_owned()
}
/// Renders `err` as an XML error response for `resource`, tagged with a
/// freshly generated request id. The status comes from `err.http_status()`
/// (500 if it is not a valid status code).
fn error_response(err: S3Error, resource: &str) -> Response {
    let status = match StatusCode::from_u16(err.http_status()) {
        Ok(code) => code,
        Err(_) => StatusCode::INTERNAL_SERVER_ERROR,
    };
    let xml = err
        .with_resource(resource.to_string())
        .with_request_id(uuid::Uuid::new_v4().simple().to_string())
        .to_xml();
    let headers = [("content-type", "application/xml")];
    (status, headers, xml).into_response()
}

View File

@@ -0,0 +1,16 @@
mod auth;
pub use auth::auth_layer;
use axum::extract::Request;
use axum::middleware::Next;
use axum::response::Response;
/// Middleware that runs the inner service and then sets the `server`
/// response header to `crate::SERVER_HEADER`, replacing any existing value.
pub async fn server_header(req: Request, next: Next) -> Response {
    let mut response = next.run(req).await;
    let header_value = crate::SERVER_HEADER.parse().unwrap();
    response.headers_mut().insert("server", header_value);
    response
}

View File

@@ -0,0 +1,263 @@
use serde_json::{json, Value};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;
use tokio::sync::RwLock;
/// Tunables for the garbage-collection service.
pub struct GcConfig {
    /// Hours between background GC cycles.
    pub interval_hours: f64,
    /// Entries in the system `tmp` directory older than this many hours are removed.
    pub temp_file_max_age_hours: f64,
    /// Multipart upload directories older than this many days are removed.
    pub multipart_max_age_days: u64,
    /// Per-bucket lock files older than this many hours are removed.
    pub lock_file_max_age_hours: f64,
    /// When set, every run only counts what it would delete.
    pub dry_run: bool,
}
impl Default for GcConfig {
    /// Defaults: run every 6 hours; expire temp files after 24 hours,
    /// multipart uploads after 7 days and lock files after 1 hour; dry-run off.
    fn default() -> Self {
        Self {
            interval_hours: 6.0,
            temp_file_max_age_hours: 24.0,
            multipart_max_age_days: 7,
            lock_file_max_age_hours: 1.0,
            dry_run: false,
        }
    }
}
/// Garbage collector for stale temp files, multipart uploads and lock files
/// under the storage root's `.myfsio.sys` directory.
pub struct GcService {
    /// Root directory of the storage tree.
    storage_root: PathBuf,
    /// Age limits, interval and dry-run default.
    config: GcConfig,
    /// `true` while a GC run is in progress; prevents overlapping runs.
    running: Arc<RwLock<bool>>,
    /// Records of past executions (trimmed to the most recent 50 after each run).
    history: Arc<RwLock<Vec<Value>>>,
    /// JSON file the history is loaded from and saved to.
    history_path: PathBuf,
}
impl GcService {
/// Creates the service and loads any previous execution history from
/// `<storage_root>/.myfsio.sys/config/gc_history.json`.
///
/// A missing, unreadable or malformed history file results in an empty
/// history.
pub fn new(storage_root: PathBuf, config: GcConfig) -> Self {
    let history_path = storage_root
        .join(".myfsio.sys")
        .join("config")
        .join("gc_history.json");

    let load_history = || -> Option<Vec<Value>> {
        if !history_path.exists() {
            return None;
        }
        let raw = std::fs::read_to_string(&history_path).ok()?;
        let document: Value = serde_json::from_str(&raw).ok()?;
        document.get("executions")?.as_array().cloned()
    };
    let history = load_history().unwrap_or_default();

    Self {
        storage_root,
        config,
        running: Arc::new(RwLock::new(false)),
        history: Arc::new(RwLock::new(history)),
        history_path,
    }
}
/// Returns the current configuration and whether a run is in progress, as a
/// JSON object. `enabled` is always `true`.
pub async fn status(&self) -> Value {
    let is_running = *self.running.read().await;
    let cfg = &self.config;
    json!({
        "enabled": true,
        "running": is_running,
        "interval_hours": cfg.interval_hours,
        "temp_file_max_age_hours": cfg.temp_file_max_age_hours,
        "multipart_max_age_days": cfg.multipart_max_age_days,
        "lock_file_max_age_hours": cfg.lock_file_max_age_hours,
        "dry_run": cfg.dry_run,
    })
}
/// Returns the recorded executions as `{ "executions": [...] }`.
pub async fn history(&self) -> Value {
    let executions = self.history.read().await;
    json!({ "executions": *executions })
}
/// Runs one GC pass immediately.
///
/// The pass is a dry run when either `dry_run` or the configured `dry_run`
/// is set. The outcome (with `execution_time_seconds` added) is appended to
/// the history, which is trimmed to the latest 50 entries and saved.
///
/// Returns the raw result of the pass, without the timing field.
///
/// # Errors
/// Returns `"GC already running"` if another run is in progress.
pub async fn run_now(&self, dry_run: bool) -> Result<Value, String> {
    // Atomically claim the running flag.
    let already_running = std::mem::replace(&mut *self.running.write().await, true);
    if already_running {
        return Err("GC already running".to_string());
    }

    let effective_dry_run = dry_run || self.config.dry_run;
    let started = Instant::now();
    let result = self.execute_gc(effective_dry_run).await;
    let elapsed = started.elapsed().as_secs_f64();
    *self.running.write().await = false;

    let mut timed_result = result.clone();
    if let Some(fields) = timed_result.as_object_mut() {
        fields.insert("execution_time_seconds".to_string(), json!(elapsed));
    }
    let record = json!({
        "timestamp": chrono::Utc::now().timestamp_millis() as f64 / 1000.0,
        "dry_run": effective_dry_run,
        "result": timed_result,
    });

    {
        let mut history = self.history.write().await;
        history.push(record);
        // Keep only the 50 most recent executions.
        let overflow = history.len().saturating_sub(50);
        history.drain(..overflow);
    }
    self.save_history().await;
    Ok(result)
}
/// Performs one GC pass over `<storage_root>/.myfsio.sys` and returns a JSON
/// summary.
///
/// Steps, in order:
/// 1. remove entries in `tmp` older than `temp_file_max_age_hours`;
/// 2. remove upload directories in `multipart/<bucket>/` older than
///    `multipart_max_age_days`;
/// 3. remove files in `buckets/<bucket>/locks/` older than
///    `lock_file_max_age_hours`;
/// 4. (real runs only) remove empty directories directly under `tmp` and
///    `multipart`.
///
/// Age is measured from the entry's modification time; entries whose metadata
/// cannot be read, or whose mtime lies in the future, are left alone. In a dry
/// run nothing is deleted and the counters report what would have been
/// removed. In a real run an item is counted only when its removal succeeded;
/// failures are appended to `errors` instead of being silently dropped.
async fn execute_gc(&self, dry_run: bool) -> Value {
    use std::time::{Duration, SystemTime};

    let mut temp_files_deleted = 0u64;
    let mut temp_bytes_freed = 0u64;
    let mut multipart_uploads_deleted = 0u64;
    let mut lock_files_deleted = 0u64;
    let mut empty_dirs_removed = 0u64;
    let mut errors: Vec<String> = Vec::new();

    let now = SystemTime::now();
    let temp_max_age = Duration::from_secs_f64(self.config.temp_file_max_age_hours * 3600.0);
    let multipart_max_age = Duration::from_secs(self.config.multipart_max_age_days * 86400);
    let lock_max_age = Duration::from_secs_f64(self.config.lock_file_max_age_hours * 3600.0);

    // Yields the entry's metadata when its mtime is older than `max_age`.
    let expired = |entry: &std::fs::DirEntry, max_age: Duration| -> Option<std::fs::Metadata> {
        let metadata = entry.metadata().ok()?;
        let age = now.duration_since(metadata.modified().ok()?).ok()?;
        if age > max_age {
            Some(metadata)
        } else {
            None
        }
    };

    // 1. Stale temp files.
    let tmp_dir = self.storage_root.join(".myfsio.sys").join("tmp");
    if tmp_dir.exists() {
        match std::fs::read_dir(&tmp_dir) {
            Ok(entries) => {
                for entry in entries.flatten() {
                    let metadata = match expired(&entry, temp_max_age) {
                        Some(metadata) => metadata,
                        None => continue,
                    };
                    if !dry_run {
                        if let Err(e) = std::fs::remove_file(entry.path()) {
                            errors.push(format!("Failed to remove temp file: {}", e));
                            continue;
                        }
                    }
                    temp_files_deleted += 1;
                    temp_bytes_freed += metadata.len();
                }
            }
            Err(e) => errors.push(format!("Failed to read tmp dir: {}", e)),
        }
    }

    // 2. Abandoned multipart uploads.
    let multipart_dir = self.storage_root.join(".myfsio.sys").join("multipart");
    if multipart_dir.exists() {
        match std::fs::read_dir(&multipart_dir) {
            Ok(bucket_dirs) => {
                for bucket_entry in bucket_dirs.flatten() {
                    // Entries that cannot be listed (e.g. stray files) are skipped.
                    let uploads = match std::fs::read_dir(bucket_entry.path()) {
                        Ok(uploads) => uploads,
                        Err(_) => continue,
                    };
                    for upload in uploads.flatten() {
                        if expired(&upload, multipart_max_age).is_none() {
                            continue;
                        }
                        if !dry_run {
                            if let Err(e) = std::fs::remove_dir_all(upload.path()) {
                                errors.push(format!("Failed to remove multipart upload: {}", e));
                                continue;
                            }
                        }
                        multipart_uploads_deleted += 1;
                    }
                }
            }
            Err(e) => errors.push(format!("Failed to read multipart dir: {}", e)),
        }
    }

    // 3. Stale lock files.
    let buckets_dir = self.storage_root.join(".myfsio.sys").join("buckets");
    if buckets_dir.exists() {
        match std::fs::read_dir(&buckets_dir) {
            Ok(bucket_dirs) => {
                for bucket_entry in bucket_dirs.flatten() {
                    let locks_dir = bucket_entry.path().join("locks");
                    if !locks_dir.exists() {
                        continue;
                    }
                    let locks = match std::fs::read_dir(&locks_dir) {
                        Ok(locks) => locks,
                        Err(_) => continue,
                    };
                    for lock in locks.flatten() {
                        if expired(&lock, lock_max_age).is_none() {
                            continue;
                        }
                        if !dry_run {
                            if let Err(e) = std::fs::remove_file(lock.path()) {
                                errors.push(format!("Failed to remove lock file: {}", e));
                                continue;
                            }
                        }
                        lock_files_deleted += 1;
                    }
                }
            }
            Err(e) => errors.push(format!("Failed to read buckets dir: {}", e)),
        }
    }

    // 4. Empty directories left behind (never touched in a dry run).
    if !dry_run {
        for dir in [&tmp_dir, &multipart_dir] {
            if !dir.exists() {
                continue;
            }
            let entries = match std::fs::read_dir(dir) {
                Ok(entries) => entries,
                Err(_) => continue,
            };
            for entry in entries.flatten() {
                let path = entry.path();
                if !path.is_dir() {
                    continue;
                }
                let is_empty = match std::fs::read_dir(&path) {
                    Ok(mut contents) => contents.next().is_none(),
                    Err(_) => false,
                };
                if !is_empty {
                    continue;
                }
                match std::fs::remove_dir(&path) {
                    Ok(()) => empty_dirs_removed += 1,
                    Err(e) => errors.push(format!("Failed to remove empty dir: {}", e)),
                }
            }
        }
    }

    json!({
        "temp_files_deleted": temp_files_deleted,
        "temp_bytes_freed": temp_bytes_freed,
        "multipart_uploads_deleted": multipart_uploads_deleted,
        "lock_files_deleted": lock_files_deleted,
        "empty_dirs_removed": empty_dirs_removed,
        "errors": errors,
    })
}
async fn save_history(&self) {
let history = self.history.read().await;
let data = json!({ "executions": *history });
if let Some(parent) = self.history_path.parent() {
let _ = std::fs::create_dir_all(parent);
}
let _ = std::fs::write(&self.history_path, serde_json::to_string_pretty(&data).unwrap_or_default());
}
/// Spawns a task that runs a GC cycle every `interval_hours` and returns its
/// join handle. The timer is ticked once before the loop, so a cycle only
/// starts after the timer's second tick. The task loops forever; abort the
/// handle to stop it.
pub fn start_background(self: Arc<Self>) -> tokio::task::JoinHandle<()> {
    let period = std::time::Duration::from_secs_f64(self.config.interval_hours * 3600.0);
    let service = self;
    tokio::spawn(async move {
        let mut ticker = tokio::time::interval(period);
        // Discard the first tick so no cycle runs on it.
        ticker.tick().await;
        loop {
            ticker.tick().await;
            tracing::info!("GC cycle starting");
            let outcome = service.run_now(false).await;
            match outcome {
                Err(e) => tracing::warn!("GC cycle failed: {}", e),
                Ok(result) => tracing::info!("GC cycle complete: {:?}", result),
            }
        }
    })
}
}

View File

@@ -0,0 +1,204 @@
use myfsio_storage::fs_backend::FsStorageBackend;
use myfsio_storage::traits::StorageEngine;
use serde_json::{json, Value};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;
use tokio::sync::RwLock;
/// Tunables for the integrity-check service. All four values are reported by
/// `IntegrityService::status`.
pub struct IntegrityConfig {
    /// NOTE(review): presumably hours between background integrity scans — the scheduler is not visible here; confirm.
    pub interval_hours: f64,
    /// NOTE(review): presumably the maximum number of objects examined per scan — confirm against `check_integrity`.
    pub batch_size: usize,
    /// NOTE(review): presumably whether detected problems are repaired automatically — confirm against `check_integrity`.
    pub auto_heal: bool,
    /// NOTE(review): presumably report-only mode — confirm against `check_integrity`.
    pub dry_run: bool,
}
impl Default for IntegrityConfig {
    /// Defaults: 24-hour interval, batch size 1000, auto-heal off, dry-run off.
    fn default() -> Self {
        Self {
            interval_hours: 24.0,
            batch_size: 1000,
            auto_heal: false,
            dry_run: false,
        }
    }
}
/// Integrity checker that keeps a persisted history of its executions.
pub struct IntegrityService {
    /// Storage backend handle.
    storage: Arc<FsStorageBackend>,
    /// Scan settings, reported by `status`.
    config: IntegrityConfig,
    /// `true` while a check is in progress; prevents overlapping runs.
    running: Arc<RwLock<bool>>,
    /// Records of past executions (trimmed to the most recent 50 after each run).
    history: Arc<RwLock<Vec<Value>>>,
    /// JSON file the history is loaded from.
    history_path: PathBuf,
}
impl IntegrityService {
pub fn new(
storage: Arc<FsStorageBackend>,
storage_root: &std::path::Path,
config: IntegrityConfig,
) -> Self {
let history_path = storage_root
.join(".myfsio.sys")
.join("config")
.join("integrity_history.json");
let history = if history_path.exists() {
std::fs::read_to_string(&history_path)
.ok()
.and_then(|s| serde_json::from_str::<Value>(&s).ok())
.and_then(|v| v.get("executions").and_then(|e| e.as_array().cloned()))
.unwrap_or_default()
} else {
Vec::new()
};
Self {
storage,
config,
running: Arc::new(RwLock::new(false)),
history: Arc::new(RwLock::new(history)),
history_path,
}
}
pub async fn status(&self) -> Value {
let running = *self.running.read().await;
json!({
"enabled": true,
"running": running,
"interval_hours": self.config.interval_hours,
"batch_size": self.config.batch_size,
"auto_heal": self.config.auto_heal,
"dry_run": self.config.dry_run,
})
}
pub async fn history(&self) -> Value {
let history = self.history.read().await;
json!({ "executions": *history })
}
pub async fn run_now(&self, dry_run: bool, auto_heal: bool) -> Result<Value, String> {
{
let mut running = self.running.write().await;
if *running {
return Err("Integrity check already running".to_string());
}
*running = true;
}
let start = Instant::now();
let result = self.check_integrity(dry_run, auto_heal).await;
let elapsed = start.elapsed().as_secs_f64();
*self.running.write().await = false;
let mut result_json = result.clone();
if let Some(obj) = result_json.as_object_mut() {
obj.insert("execution_time_seconds".to_string(), json!(elapsed));
}
let record = json!({
"timestamp": chrono::Utc::now().timestamp_millis() as f64 / 1000.0,
"dry_run": dry_run,
"auto_heal": auto_heal,
"result": result_json,
});
{
let mut history = self.history.write().await;
history.push(record);
if history.len() > 50 {
let excess = history.len() - 50;
history.drain(..excess);
}
}
self.save_history().await;
Ok(result)
}
async fn check_integrity(&self, _dry_run: bool, _auto_heal: bool) -> Value {
let buckets = match self.storage.list_buckets().await {
Ok(b) => b,
Err(e) => return json!({"error": e.to_string()}),
};
let mut objects_scanned = 0u64;
let mut corrupted = 0u64;
let mut phantom_metadata = 0u64;
let mut errors: Vec<String> = Vec::new();
for bucket in &buckets {
let params = myfsio_common::types::ListParams {
max_keys: self.config.batch_size,
..Default::default()
};
let objects = match self.storage.list_objects(&bucket.name, &params).await {
Ok(r) => r.objects,
Err(e) => {
errors.push(format!("{}: {}", bucket.name, e));
continue;
}
};
for obj in &objects {
objects_scanned += 1;
match self.storage.get_object_path(&bucket.name, &obj.key).await {
Ok(path) => {
if !path.exists() {
phantom_metadata += 1;
} else if let Some(ref expected_etag) = obj.etag {
match myfsio_crypto::hashing::md5_file(&path) {
Ok(actual_etag) => {
if &actual_etag != expected_etag {
corrupted += 1;
}
}
Err(e) => errors.push(format!("{}:{}: {}", bucket.name, obj.key, e)),
}
}
}
Err(e) => errors.push(format!("{}:{}: {}", bucket.name, obj.key, e)),
}
}
}
json!({
"objects_scanned": objects_scanned,
"buckets_scanned": buckets.len(),
"corrupted_objects": corrupted,
"phantom_metadata": phantom_metadata,
"errors": errors,
})
}
async fn save_history(&self) {
let history = self.history.read().await;
let data = json!({ "executions": *history });
if let Some(parent) = self.history_path.parent() {
let _ = std::fs::create_dir_all(parent);
}
let _ = std::fs::write(
&self.history_path,
serde_json::to_string_pretty(&data).unwrap_or_default(),
);
}
pub fn start_background(self: Arc<Self>) -> tokio::task::JoinHandle<()> {
let interval = std::time::Duration::from_secs_f64(self.config.interval_hours * 3600.0);
tokio::spawn(async move {
let mut timer = tokio::time::interval(interval);
timer.tick().await;
loop {
timer.tick().await;
tracing::info!("Integrity check starting");
match self.run_now(false, false).await {
Ok(result) => tracing::info!("Integrity check complete: {:?}", result),
Err(e) => tracing::warn!("Integrity check failed: {}", e),
}
}
})
}
}

View File

@@ -0,0 +1,153 @@
use myfsio_storage::fs_backend::FsStorageBackend;
use myfsio_storage::traits::StorageEngine;
use serde_json::{json, Value};
use std::sync::Arc;
use tokio::sync::RwLock;
/// Tunables for the lifecycle evaluator.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LifecycleConfig {
    /// Seconds between background lifecycle evaluations.
    pub interval_seconds: u64,
}

impl Default for LifecycleConfig {
    /// Evaluates lifecycle rules once per hour.
    fn default() -> Self {
        Self {
            interval_seconds: 3600,
        }
    }
}
/// Applies bucket lifecycle rules (object expiration and multipart-upload aborts).
pub struct LifecycleService {
/// Backend used to read bucket configs, list objects/uploads and delete them.
storage: Arc<FsStorageBackend>,
/// Evaluation interval.
config: LifecycleConfig,
/// `true` while an evaluation cycle is in progress.
running: Arc<RwLock<bool>>,
}
impl LifecycleService {
    /// Upper bound applied to rule day counts (about 100 years).
    ///
    /// Without it, `days as i64` wraps very large values into a negative duration, which
    /// places the cutoff in the future and expires everything, and
    /// `chrono::Duration::days` can panic on out-of-range input.
    const MAX_RULE_DAYS: u64 = 36_500;

    /// Creates an idle service for `storage`.
    pub fn new(storage: Arc<FsStorageBackend>, config: LifecycleConfig) -> Self {
        Self {
            storage,
            config,
            running: Arc::new(RwLock::new(false)),
        }
    }

    /// Returns the instant `days` days before now, with `days` clamped to
    /// [`Self::MAX_RULE_DAYS`].
    fn cutoff_for_days(days: u64) -> chrono::DateTime<chrono::Utc> {
        // The clamp keeps the value well inside `i64`, so the cast cannot wrap.
        let clamped = days.min(Self::MAX_RULE_DAYS) as i64;
        chrono::Utc::now() - chrono::Duration::days(clamped)
    }

    /// Runs one evaluation cycle over all buckets.
    ///
    /// # Errors
    /// Returns `Err` when a cycle is already in progress.
    pub async fn run_cycle(&self) -> Result<Value, String> {
        {
            let mut running = self.running.write().await;
            if *running {
                return Err("Lifecycle already running".to_string());
            }
            *running = true;
        }
        let result = self.evaluate_rules().await;
        *self.running.write().await = false;
        Ok(result)
    }

    /// Evaluates every enabled rule of every bucket that has a lifecycle configuration.
    ///
    /// Buckets whose config cannot be read, has no lifecycle, or whose lifecycle is not a
    /// JSON string containing a `Rules` array are skipped silently. For each rule:
    /// - `Expiration.Days`: deletes listed objects last modified before the cutoff. Only a
    ///   single listing of at most 1000 keys is requested per rule and cycle.
    /// - `AbortIncompleteMultipartUpload.DaysAfterInitiation`: aborts uploads initiated
    ///   before the cutoff.
    ///
    /// Returns counters plus the list of per-object/per-upload error messages.
    async fn evaluate_rules(&self) -> Value {
        let buckets = match self.storage.list_buckets().await {
            Ok(b) => b,
            Err(e) => return json!({"error": e.to_string()}),
        };
        let mut total_expired = 0u64;
        let mut total_multipart_aborted = 0u64;
        let mut errors: Vec<String> = Vec::new();
        for bucket in &buckets {
            let config = match self.storage.get_bucket_config(&bucket.name).await {
                Ok(c) => c,
                Err(_) => continue,
            };
            let lifecycle = match &config.lifecycle {
                Some(lc) => lc,
                None => continue,
            };
            let rules = match lifecycle
                .as_str()
                .and_then(|s| serde_json::from_str::<Value>(s).ok())
            {
                Some(v) => v,
                None => continue,
            };
            // Borrow the array in place; `rules` outlives the loop below.
            let rules_arr = match rules.get("Rules").and_then(|r| r.as_array()) {
                Some(a) => a,
                None => continue,
            };
            for rule in rules_arr {
                if rule.get("Status").and_then(|s| s.as_str()) != Some("Enabled") {
                    continue;
                }
                // `Filter.Prefix` takes precedence over the top-level `Prefix`.
                let prefix = rule
                    .get("Filter")
                    .and_then(|f| f.get("Prefix"))
                    .and_then(|p| p.as_str())
                    .or_else(|| rule.get("Prefix").and_then(|p| p.as_str()))
                    .unwrap_or("");
                if let Some(exp) = rule.get("Expiration") {
                    if let Some(days) = exp.get("Days").and_then(|d| d.as_u64()) {
                        let cutoff = Self::cutoff_for_days(days);
                        let params = myfsio_common::types::ListParams {
                            max_keys: 1000,
                            prefix: if prefix.is_empty() {
                                None
                            } else {
                                Some(prefix.to_string())
                            },
                            ..Default::default()
                        };
                        if let Ok(result) = self.storage.list_objects(&bucket.name, &params).await
                        {
                            for obj in &result.objects {
                                if obj.last_modified < cutoff {
                                    match self.storage.delete_object(&bucket.name, &obj.key).await
                                    {
                                        Ok(()) => total_expired += 1,
                                        Err(e) => errors.push(format!(
                                            "{}:{}: {}",
                                            bucket.name, obj.key, e
                                        )),
                                    }
                                }
                            }
                        }
                    }
                }
                if let Some(abort) = rule.get("AbortIncompleteMultipartUpload") {
                    if let Some(days) = abort.get("DaysAfterInitiation").and_then(|d| d.as_u64())
                    {
                        let cutoff = Self::cutoff_for_days(days);
                        if let Ok(uploads) =
                            self.storage.list_multipart_uploads(&bucket.name).await
                        {
                            for upload in &uploads {
                                if upload.initiated < cutoff {
                                    match self
                                        .storage
                                        .abort_multipart(&bucket.name, &upload.upload_id)
                                        .await
                                    {
                                        Ok(()) => total_multipart_aborted += 1,
                                        Err(e) => errors
                                            .push(format!("abort {}: {}", upload.upload_id, e)),
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        json!({
            "objects_expired": total_expired,
            "multipart_aborted": total_multipart_aborted,
            "buckets_evaluated": buckets.len(),
            "errors": errors,
        })
    }

    /// Spawns a detached task that runs a lifecycle cycle once per configured interval.
    ///
    /// The interval is clamped to at least one second because `tokio::time::interval`
    /// panics on a zero period.
    pub fn start_background(self: Arc<Self>) -> tokio::task::JoinHandle<()> {
        let interval = std::time::Duration::from_secs(self.config.interval_seconds.max(1));
        tokio::spawn(async move {
            let mut timer = tokio::time::interval(interval);
            // The first tick completes immediately; consume it so the first cycle runs
            // one full interval after start-up.
            timer.tick().await;
            loop {
                timer.tick().await;
                tracing::info!("Lifecycle evaluation starting");
                match self.run_cycle().await {
                    Ok(result) => tracing::info!("Lifecycle cycle complete: {:?}", result),
                    Err(e) => tracing::warn!("Lifecycle cycle failed: {}", e),
                }
            }
        })
    }
}

View File

@@ -0,0 +1,219 @@
use serde_json::{json, Value};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;
use tokio::sync::RwLock;
/// Tunables for the operation-metrics collector.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MetricsConfig {
    /// Length of one aggregation window, in minutes.
    pub interval_minutes: u64,
    /// How many hours of window snapshots are retained.
    pub retention_hours: u64,
}

impl Default for MetricsConfig {
    /// Five-minute windows retained for 24 hours.
    fn default() -> Self {
        Self {
            interval_minutes: 5,
            retention_hours: 24,
        }
    }
}
/// Counters and raw latency samples accumulated for one method key during a window.
struct MethodStats {
    count: u64,
    success_count: u64,
    error_count: u64,
    bytes_in: u64,
    bytes_out: u64,
    /// One latency sample per recorded request, in insertion order.
    latencies: Vec<f64>,
}

impl MethodStats {
    /// Creates an empty accumulator.
    fn new() -> Self {
        Self {
            count: 0,
            success_count: 0,
            error_count: 0,
            bytes_in: 0,
            bytes_out: 0,
            latencies: Vec::new(),
        }
    }

    /// Renders the counters plus latency min/max/avg and p50/p95/p99 as JSON.
    ///
    /// All latency figures are `0.0` when no samples were recorded. Percentiles are picked
    /// by index from the sorted samples (`len * q`, capped at the last element).
    fn to_json(&self) -> Value {
        let (min, max, avg, p50, p95, p99) = if self.latencies.is_empty() {
            (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        } else {
            let mut sorted = self.latencies.clone();
            // `total_cmp` is a total order even when NaN is present. A comparator built on
            // `partial_cmp(..).unwrap_or(Equal)` is not, and the standard sort may panic
            // when handed an inconsistent ordering.
            sorted.sort_unstable_by(f64::total_cmp);
            let len = sorted.len();
            let sum: f64 = sorted.iter().sum();
            (
                sorted[0],
                sorted[len - 1],
                sum / len as f64,
                sorted[len / 2],
                sorted[((len as f64 * 0.95) as usize).min(len - 1)],
                sorted[((len as f64 * 0.99) as usize).min(len - 1)],
            )
        };
        json!({
            "count": self.count,
            "success_count": self.success_count,
            "error_count": self.error_count,
            "bytes_in": self.bytes_in,
            "bytes_out": self.bytes_out,
            "latency_min_ms": min,
            "latency_max_ms": max,
            "latency_avg_ms": avg,
            "latency_p50_ms": p50,
            "latency_p95_ms": p95,
            "latency_p99_ms": p99,
        })
    }
}
/// Metrics gathered since the last flush.
struct CurrentWindow {
    /// Per-method accumulators, keyed by the method string passed to `record`.
    by_method: HashMap<String, MethodStats>,
    /// Request counts keyed by status class.
    by_status_class: HashMap<String, u64>,
    /// When this window was opened or last reset.
    start_time: Instant,
}

impl CurrentWindow {
    /// Opens an empty window starting now.
    fn new() -> Self {
        let by_method = HashMap::new();
        let by_status_class = HashMap::new();
        CurrentWindow {
            by_method,
            by_status_class,
            start_time: Instant::now(),
        }
    }

    /// Empties both maps and restarts the window clock.
    fn reset(&mut self) {
        let Self {
            by_method,
            by_status_class,
            start_time,
        } = self;
        by_method.clear();
        by_status_class.clear();
        *start_time = Instant::now();
    }
}
/// Collects per-request metrics into a rolling window and keeps periodic snapshots.
pub struct MetricsService {
/// Window length and snapshot retention.
config: MetricsConfig,
/// Metrics accumulated since the last flush.
current: Arc<RwLock<CurrentWindow>>,
/// Flushed window summaries (JSON objects), oldest first.
snapshots: Arc<RwLock<Vec<Value>>>,
/// File the snapshots are persisted to.
snapshots_path: PathBuf,
}
impl MetricsService {
pub fn new(storage_root: &std::path::Path, config: MetricsConfig) -> Self {
let snapshots_path = storage_root
.join(".myfsio.sys")
.join("config")
.join("operation_metrics.json");
let snapshots = if snapshots_path.exists() {
std::fs::read_to_string(&snapshots_path)
.ok()
.and_then(|s| serde_json::from_str::<Value>(&s).ok())
.and_then(|v| v.get("snapshots").and_then(|s| s.as_array().cloned()))
.unwrap_or_default()
} else {
Vec::new()
};
Self {
config,
current: Arc::new(RwLock::new(CurrentWindow::new())),
snapshots: Arc::new(RwLock::new(snapshots)),
snapshots_path,
}
}
pub async fn record(&self, method: &str, status: u16, latency_ms: f64, bytes_in: u64, bytes_out: u64) {
let mut window = self.current.write().await;
let stats = window.by_method.entry(method.to_string()).or_insert_with(MethodStats::new);
stats.count += 1;
if status < 400 {
stats.success_count += 1;
} else {
stats.error_count += 1;
}
stats.bytes_in += bytes_in;
stats.bytes_out += bytes_out;
stats.latencies.push(latency_ms);
let class = format!("{}xx", status / 100);
*window.by_status_class.entry(class).or_insert(0) += 1;
}
pub async fn snapshot(&self) -> Value {
let window = self.current.read().await;
let mut by_method = serde_json::Map::new();
for (method, stats) in &window.by_method {
by_method.insert(method.clone(), stats.to_json());
}
let snapshots = self.snapshots.read().await;
json!({
"enabled": true,
"current_window": {
"by_method": by_method,
"by_status_class": window.by_status_class,
"window_start_elapsed_secs": window.start_time.elapsed().as_secs_f64(),
},
"snapshots": *snapshots,
})
}
async fn flush_window(&self) {
let snap = {
let mut window = self.current.write().await;
let mut by_method = serde_json::Map::new();
for (method, stats) in &window.by_method {
by_method.insert(method.clone(), stats.to_json());
}
let snap = json!({
"timestamp": chrono::Utc::now().to_rfc3339(),
"window_seconds": self.config.interval_minutes * 60,
"by_method": by_method,
"by_status_class": window.by_status_class,
});
window.reset();
snap
};
let max_snapshots = (self.config.retention_hours * 60 / self.config.interval_minutes) as usize;
{
let mut snapshots = self.snapshots.write().await;
snapshots.push(snap);
if snapshots.len() > max_snapshots {
let excess = snapshots.len() - max_snapshots;
snapshots.drain(..excess);
}
}
self.save_snapshots().await;
}
async fn save_snapshots(&self) {
let snapshots = self.snapshots.read().await;
let data = json!({ "snapshots": *snapshots });
if let Some(parent) = self.snapshots_path.parent() {
let _ = std::fs::create_dir_all(parent);
}
let _ = std::fs::write(
&self.snapshots_path,
serde_json::to_string_pretty(&data).unwrap_or_default(),
);
}
pub fn start_background(self: Arc<Self>) -> tokio::task::JoinHandle<()> {
let interval = std::time::Duration::from_secs(self.config.interval_minutes * 60);
tokio::spawn(async move {
let mut timer = tokio::time::interval(interval);
timer.tick().await;
loop {
timer.tick().await;
self.flush_window().await;
}
})
}
}

View File

@@ -0,0 +1,6 @@
pub mod gc;
pub mod lifecycle;
pub mod integrity;
pub mod metrics;
pub mod site_registry;
pub mod website_domains;

View File

@@ -0,0 +1,143 @@
use chrono::Utc;
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Arc;
/// Description of the local site as stored in the site registry file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SiteInfo {
/// Identifier of the site.
pub site_id: String,
/// Endpoint of the site (stored as given; not validated here).
pub endpoint: String,
/// Region name; `"us-east-1"` when absent from the serialized form.
#[serde(default = "default_region")]
pub region: String,
/// Priority value; `100` when absent from the serialized form.
#[serde(default = "default_priority")]
pub priority: i32,
/// Display name; empty when absent.
#[serde(default)]
pub display_name: String,
/// Creation timestamp as a string, if recorded.
#[serde(default)]
pub created_at: Option<String>,
}
/// Serde fallback for a missing `region` field.
fn default_region() -> String {
    String::from("us-east-1")
}
/// Serde fallback for a missing `priority` field.
fn default_priority() -> i32 {
    const DEFAULT_PRIORITY: i32 = 100;
    DEFAULT_PRIORITY
}
/// A remote site known to the registry, including its last recorded health state.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PeerSite {
/// Identifier used to look the peer up, update it and delete it.
pub site_id: String,
/// Endpoint of the peer (stored as given; not validated here).
pub endpoint: String,
/// Region name; `"us-east-1"` when absent from the serialized form.
#[serde(default = "default_region")]
pub region: String,
/// Priority value; `100` when absent from the serialized form.
#[serde(default = "default_priority")]
pub priority: i32,
/// Display name; empty when absent.
#[serde(default)]
pub display_name: String,
/// Optional connection identifier associated with this peer.
#[serde(default)]
pub connection_id: Option<String>,
/// Creation timestamp as a string, if recorded.
#[serde(default)]
pub created_at: Option<String>,
/// Last health result written by `SiteRegistry::update_health`; `false` when absent.
#[serde(default)]
pub is_healthy: bool,
/// RFC 3339 timestamp of the last `update_health` call for this peer, if any.
#[serde(default)]
pub last_health_check: Option<String>,
}
/// On-disk shape of the registry file: the local site plus the known peers.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
struct RegistryData {
/// The local site, once configured.
#[serde(default)]
local: Option<SiteInfo>,
/// Known peer sites, in insertion order.
#[serde(default)]
peers: Vec<PeerSite>,
}
/// In-memory site registry backed by a JSON file.
pub struct SiteRegistry {
/// Location of the registry file.
path: PathBuf,
/// Current registry contents, guarded by a synchronous `parking_lot` lock.
data: Arc<RwLock<RegistryData>>,
}
impl SiteRegistry {
pub fn new(storage_root: &std::path::Path) -> Self {
let path = storage_root
.join(".myfsio.sys")
.join("config")
.join("site_registry.json");
let data = if path.exists() {
std::fs::read_to_string(&path)
.ok()
.and_then(|s| serde_json::from_str(&s).ok())
.unwrap_or_default()
} else {
RegistryData::default()
};
Self {
path,
data: Arc::new(RwLock::new(data)),
}
}
fn save(&self) {
let data = self.data.read();
if let Some(parent) = self.path.parent() {
let _ = std::fs::create_dir_all(parent);
}
if let Ok(json) = serde_json::to_string_pretty(&*data) {
let _ = std::fs::write(&self.path, json);
}
}
pub fn get_local_site(&self) -> Option<SiteInfo> {
self.data.read().local.clone()
}
pub fn set_local_site(&self, site: SiteInfo) {
self.data.write().local = Some(site);
self.save();
}
pub fn list_peers(&self) -> Vec<PeerSite> {
self.data.read().peers.clone()
}
pub fn get_peer(&self, site_id: &str) -> Option<PeerSite> {
self.data.read().peers.iter().find(|p| p.site_id == site_id).cloned()
}
pub fn add_peer(&self, peer: PeerSite) {
self.data.write().peers.push(peer);
self.save();
}
pub fn update_peer(&self, peer: PeerSite) {
let mut data = self.data.write();
if let Some(existing) = data.peers.iter_mut().find(|p| p.site_id == peer.site_id) {
*existing = peer;
}
drop(data);
self.save();
}
pub fn delete_peer(&self, site_id: &str) -> bool {
let mut data = self.data.write();
let len_before = data.peers.len();
data.peers.retain(|p| p.site_id != site_id);
let removed = data.peers.len() < len_before;
drop(data);
if removed {
self.save();
}
removed
}
pub fn update_health(&self, site_id: &str, is_healthy: bool) {
let mut data = self.data.write();
if let Some(peer) = data.peers.iter_mut().find(|p| p.site_id == site_id) {
peer.is_healthy = is_healthy;
peer.last_health_check = Some(Utc::now().to_rfc3339());
}
drop(data);
self.save();
}
}

View File

@@ -0,0 +1,104 @@
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
/// On-disk shape of the website-domain file.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
struct DomainData {
/// Domain name -> bucket name.
#[serde(default)]
mappings: HashMap<String, String>,
}
/// In-memory domain-to-bucket mapping backed by a JSON file.
pub struct WebsiteDomainStore {
/// Location of the mapping file.
path: PathBuf,
/// Current mappings, guarded by a synchronous `parking_lot` lock.
data: Arc<RwLock<DomainData>>,
}
impl WebsiteDomainStore {
pub fn new(storage_root: &std::path::Path) -> Self {
let path = storage_root
.join(".myfsio.sys")
.join("config")
.join("website_domains.json");
let data = if path.exists() {
std::fs::read_to_string(&path)
.ok()
.and_then(|s| serde_json::from_str(&s).ok())
.unwrap_or_default()
} else {
DomainData::default()
};
Self {
path,
data: Arc::new(RwLock::new(data)),
}
}
fn save(&self) {
let data = self.data.read();
if let Some(parent) = self.path.parent() {
let _ = std::fs::create_dir_all(parent);
}
if let Ok(json) = serde_json::to_string_pretty(&*data) {
let _ = std::fs::write(&self.path, json);
}
}
pub fn list_all(&self) -> Vec<serde_json::Value> {
self.data
.read()
.mappings
.iter()
.map(|(domain, bucket)| {
serde_json::json!({
"domain": domain,
"bucket": bucket,
})
})
.collect()
}
pub fn get_bucket(&self, domain: &str) -> Option<String> {
self.data.read().mappings.get(domain).cloned()
}
pub fn set_mapping(&self, domain: &str, bucket: &str) {
self.data.write().mappings.insert(domain.to_string(), bucket.to_string());
self.save();
}
pub fn delete_mapping(&self, domain: &str) -> bool {
let removed = self.data.write().mappings.remove(domain).is_some();
if removed {
self.save();
}
removed
}
}
/// Strips surrounding whitespace and lowercases ASCII letters.
///
/// Non-ASCII characters are left unchanged.
pub fn normalize_domain(domain: &str) -> String {
    let trimmed = domain.trim();
    let mut normalized = String::with_capacity(trimmed.len());
    normalized.extend(trimmed.chars().map(|c| c.to_ascii_lowercase()));
    normalized
}
/// Checks whether `domain` is a syntactically acceptable host name.
///
/// Accepted names are 1..=253 bytes long and consist of at least two dot-separated
/// labels. Each label is 1..=63 bytes of ASCII letters, digits or `-`, and neither starts
/// nor ends with `-`.
pub fn is_valid_domain(domain: &str) -> bool {
    fn label_ok(label: &str) -> bool {
        (1..=63).contains(&label.len())
            && label.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'-')
            && !label.starts_with('-')
            && !label.ends_with('-')
    }

    if !(1..=253).contains(&domain.len()) {
        return false;
    }
    let mut label_count = 0usize;
    for label in domain.split('.') {
        if !label_ok(label) {
            return false;
        }
        label_count += 1;
    }
    label_count >= 2
}

View File

@@ -0,0 +1,121 @@
use std::sync::Arc;
use crate::config::ServerConfig;
use crate::services::gc::GcService;
use crate::services::integrity::IntegrityService;
use crate::services::metrics::MetricsService;
use crate::services::site_registry::SiteRegistry;
use crate::services::website_domains::WebsiteDomainStore;
use myfsio_auth::iam::IamService;
use myfsio_crypto::encryption::EncryptionService;
use myfsio_crypto::kms::KmsService;
use myfsio_storage::fs_backend::FsStorageBackend;
/// Shared application state; cloning copies the config and bumps the `Arc` counts.
#[derive(Clone)]
pub struct AppState {
/// Server configuration the state was built from.
pub config: ServerConfig,
/// Filesystem storage backend.
pub storage: Arc<FsStorageBackend>,
/// IAM service.
pub iam: Arc<IamService>,
/// Encryption service; `None` unless set up by `new_with_encryption`.
pub encryption: Option<Arc<EncryptionService>>,
/// KMS service; `None` unless set up by `new_with_encryption`.
pub kms: Option<Arc<KmsService>>,
/// Present when `config.gc_enabled` is set.
pub gc: Option<Arc<GcService>>,
/// Present when `config.integrity_enabled` is set.
pub integrity: Option<Arc<IntegrityService>>,
/// Present when `config.metrics_enabled` is set.
pub metrics: Option<Arc<MetricsService>>,
/// Always populated by `AppState::new`.
pub site_registry: Option<Arc<SiteRegistry>>,
/// Present when `config.website_hosting_enabled` is set.
pub website_domains: Option<Arc<WebsiteDomainStore>>,
}
impl AppState {
pub fn new(config: ServerConfig) -> Self {
let storage = Arc::new(FsStorageBackend::new(config.storage_root.clone()));
let iam = Arc::new(IamService::new_with_secret(
config.iam_config_path.clone(),
config.secret_key.clone(),
));
let gc = if config.gc_enabled {
Some(Arc::new(GcService::new(
config.storage_root.clone(),
crate::services::gc::GcConfig::default(),
)))
} else {
None
};
let integrity = if config.integrity_enabled {
Some(Arc::new(IntegrityService::new(
storage.clone(),
&config.storage_root,
crate::services::integrity::IntegrityConfig::default(),
)))
} else {
None
};
let metrics = if config.metrics_enabled {
Some(Arc::new(MetricsService::new(
&config.storage_root,
crate::services::metrics::MetricsConfig::default(),
)))
} else {
None
};
let site_registry = Some(Arc::new(SiteRegistry::new(&config.storage_root)));
let website_domains = if config.website_hosting_enabled {
Some(Arc::new(WebsiteDomainStore::new(&config.storage_root)))
} else {
None
};
Self {
config,
storage,
iam,
encryption: None,
kms: None,
gc,
integrity,
metrics,
site_registry,
website_domains,
}
}
pub async fn new_with_encryption(config: ServerConfig) -> Self {
let mut state = Self::new(config.clone());
let keys_dir = config.storage_root.join(".myfsio.sys").join("keys");
let kms = if config.kms_enabled {
match KmsService::new(&keys_dir).await {
Ok(k) => Some(Arc::new(k)),
Err(e) => {
tracing::error!("Failed to initialize KMS: {}", e);
None
}
}
} else {
None
};
let encryption = if config.encryption_enabled {
match myfsio_crypto::kms::load_or_create_master_key(&keys_dir).await {
Ok(master_key) => {
Some(Arc::new(EncryptionService::new(master_key, kms.clone())))
}
Err(e) => {
tracing::error!("Failed to initialize encryption: {}", e);
None
}
}
} else {
None
};
state.encryption = encryption;
state.kms = kms;
state
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,26 @@
[package]
name = "myfsio-storage"
version = "0.1.0"
edition = "2021"
[dependencies]
myfsio-common = { path = "../myfsio-common" }
myfsio-crypto = { path = "../myfsio-crypto" }
serde = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
dashmap = { workspace = true }
parking_lot = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
regex = { workspace = true }
unicode-normalization = { workspace = true }
md-5 = { workspace = true }
sha2 = { workspace = true }
hex = { workspace = true }
[dev-dependencies]
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
tempfile = "3"

View File

@@ -0,0 +1,59 @@
use myfsio_common::error::{S3Error, S3ErrorCode};
use thiserror::Error;
/// Errors produced by the storage layer.
///
/// `std::io::Error` and `serde_json::Error` convert into this type automatically via the
/// `#[from]` attributes on `Io` and `Json`.
#[derive(Debug, Error)]
pub enum StorageError {
/// The named bucket does not exist.
#[error("Bucket not found: {0}")]
BucketNotFound(String),
/// A bucket with this name already exists.
#[error("Bucket already exists: {0}")]
BucketAlreadyExists(String),
/// The bucket still contains data.
#[error("Bucket not empty: {0}")]
BucketNotEmpty(String),
/// No object exists under `key` in `bucket`.
#[error("Object not found: {bucket}/{key}")]
ObjectNotFound { bucket: String, key: String },
/// The bucket name failed validation; the payload is the reason.
#[error("Invalid bucket name: {0}")]
InvalidBucketName(String),
/// The object key failed validation; the payload is the reason.
#[error("Invalid object key: {0}")]
InvalidObjectKey(String),
/// No multipart upload exists with this id.
#[error("Upload not found: {0}")]
UploadNotFound(String),
/// A quota was exceeded; the payload is the reason.
#[error("Quota exceeded: {0}")]
QuotaExceeded(String),
/// Underlying filesystem or I/O failure.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// JSON (de)serialization failure.
#[error("JSON error: {0}")]
Json(#[from] serde_json::Error),
/// Any other internal failure.
#[error("Internal error: {0}")]
Internal(String),
}
impl From<StorageError> for S3Error {
fn from(err: StorageError) -> Self {
match err {
StorageError::BucketNotFound(name) => {
S3Error::from_code(S3ErrorCode::NoSuchBucket).with_resource(format!("/{}", name))
}
StorageError::BucketAlreadyExists(name) => {
S3Error::from_code(S3ErrorCode::BucketAlreadyExists)
.with_resource(format!("/{}", name))
}
StorageError::BucketNotEmpty(name) => {
S3Error::from_code(S3ErrorCode::BucketNotEmpty)
.with_resource(format!("/{}", name))
}
StorageError::ObjectNotFound { bucket, key } => {
S3Error::from_code(S3ErrorCode::NoSuchKey)
.with_resource(format!("/{}/{}", bucket, key))
}
StorageError::InvalidBucketName(msg) => S3Error::new(S3ErrorCode::InvalidBucketName, msg),
StorageError::InvalidObjectKey(msg) => S3Error::new(S3ErrorCode::InvalidKey, msg),
StorageError::UploadNotFound(id) => {
S3Error::new(S3ErrorCode::NoSuchUpload, format!("Upload {} not found", id))
}
StorageError::QuotaExceeded(msg) => S3Error::new(S3ErrorCode::QuotaExceeded, msg),
StorageError::Io(e) => S3Error::new(S3ErrorCode::InternalError, e.to_string()),
StorageError::Json(e) => S3Error::new(S3ErrorCode::InternalError, e.to_string()),
StorageError::Internal(msg) => S3Error::new(S3ErrorCode::InternalError, msg),
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,4 @@
pub mod validation;
pub mod traits;
pub mod error;
pub mod fs_backend;

View File

@@ -0,0 +1,125 @@
use crate::error::StorageError;
use myfsio_common::types::*;
use std::collections::HashMap;
use std::path::PathBuf;
use std::pin::Pin;
use tokio::io::AsyncRead;
/// Result alias used by every storage operation.
pub type StorageResult<T> = Result<T, StorageError>;
/// Boxed, pinned, sendable byte stream used for object and part bodies.
pub type AsyncReadStream = Pin<Box<dyn AsyncRead + Send>>;
/// Asynchronous storage interface covering buckets, objects, multipart uploads,
/// bucket configuration, versioning and object tags.
///
/// `async fn` in a public trait triggers the `async_fn_in_trait` lint, which is
/// silenced here.
#[allow(async_fn_in_trait)]
pub trait StorageEngine: Send + Sync {
// --- Buckets ---
async fn list_buckets(&self) -> StorageResult<Vec<BucketMeta>>;
async fn create_bucket(&self, name: &str) -> StorageResult<()>;
async fn delete_bucket(&self, name: &str) -> StorageResult<()>;
async fn bucket_exists(&self, name: &str) -> StorageResult<bool>;
async fn bucket_stats(&self, name: &str) -> StorageResult<BucketStats>;
// --- Objects ---
/// Stores the bytes read from `stream` under `key`, with optional metadata.
async fn put_object(
&self,
bucket: &str,
key: &str,
stream: AsyncReadStream,
metadata: Option<HashMap<String, String>>,
) -> StorageResult<ObjectMeta>;
/// Returns the object's metadata together with a stream of its content.
async fn get_object(&self, bucket: &str, key: &str) -> StorageResult<(ObjectMeta, AsyncReadStream)>;
/// Returns the filesystem path associated with the object.
async fn get_object_path(&self, bucket: &str, key: &str) -> StorageResult<PathBuf>;
async fn head_object(&self, bucket: &str, key: &str) -> StorageResult<ObjectMeta>;
async fn delete_object(&self, bucket: &str, key: &str) -> StorageResult<()>;
async fn copy_object(
&self,
src_bucket: &str,
src_key: &str,
dst_bucket: &str,
dst_key: &str,
) -> StorageResult<ObjectMeta>;
async fn get_object_metadata(
&self,
bucket: &str,
key: &str,
) -> StorageResult<HashMap<String, String>>;
async fn put_object_metadata(
&self,
bucket: &str,
key: &str,
metadata: &HashMap<String, String>,
) -> StorageResult<()>;
// --- Listing ---
async fn list_objects(&self, bucket: &str, params: &ListParams) -> StorageResult<ListObjectsResult>;
async fn list_objects_shallow(
&self,
bucket: &str,
params: &ShallowListParams,
) -> StorageResult<ShallowListResult>;
// --- Multipart uploads ---
/// Starts a multipart upload; the returned string is presumably the upload id
/// accepted by the other multipart methods — confirm against the implementation.
async fn initiate_multipart(
&self,
bucket: &str,
key: &str,
metadata: Option<HashMap<String, String>>,
) -> StorageResult<String>;
/// Stores one part; the returned string is presumably the part's ETag — confirm.
async fn upload_part(
&self,
bucket: &str,
upload_id: &str,
part_number: u32,
stream: AsyncReadStream,
) -> StorageResult<String>;
async fn complete_multipart(
&self,
bucket: &str,
upload_id: &str,
parts: &[PartInfo],
) -> StorageResult<ObjectMeta>;
async fn abort_multipart(&self, bucket: &str, upload_id: &str) -> StorageResult<()>;
async fn list_parts(&self, bucket: &str, upload_id: &str) -> StorageResult<Vec<PartMeta>>;
async fn list_multipart_uploads(
&self,
bucket: &str,
) -> StorageResult<Vec<MultipartUploadInfo>>;
// --- Bucket configuration and versioning ---
async fn get_bucket_config(&self, bucket: &str) -> StorageResult<BucketConfig>;
async fn set_bucket_config(&self, bucket: &str, config: &BucketConfig) -> StorageResult<()>;
async fn is_versioning_enabled(&self, bucket: &str) -> StorageResult<bool>;
async fn set_versioning(&self, bucket: &str, enabled: bool) -> StorageResult<()>;
async fn list_object_versions(
&self,
bucket: &str,
key: &str,
) -> StorageResult<Vec<VersionInfo>>;
// --- Object tags ---
async fn get_object_tags(
&self,
bucket: &str,
key: &str,
) -> StorageResult<Vec<Tag>>;
async fn set_object_tags(
&self,
bucket: &str,
key: &str,
tags: &[Tag],
) -> StorageResult<()>;
async fn delete_object_tags(
&self,
bucket: &str,
key: &str,
) -> StorageResult<()>;
}

View File

@@ -0,0 +1,194 @@
use std::sync::LazyLock;
use unicode_normalization::UnicodeNormalization;
// Upper-case segment names rejected by `validate_object_key` when Windows rules apply.
const WINDOWS_RESERVED: &[&str] = &[
"CON", "PRN", "AUX", "NUL", "COM0", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7",
"COM8", "COM9", "LPT0", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8",
"LPT9",
];
// Characters rejected inside a key segment when Windows rules apply.
const WINDOWS_ILLEGAL_CHARS: &[char] = &['<', '>', ':', '"', '/', '\\', '|', '?', '*'];
// Top-level key segments that are always rejected as reserved prefixes.
const INTERNAL_FOLDERS: &[&str] = &[".meta", ".versions", ".multipart"];
// Name of the system directory; also rejected as a top-level key segment.
const SYSTEM_ROOT: &str = ".myfsio.sys";
// Matches four dot-separated groups of 1-3 digits; the numeric range is not checked.
// Compiled once on first use.
static IP_REGEX: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").unwrap());
/// Validates an object key.
///
/// Returns `None` when the key is acceptable, otherwise `Some(message)` describing the
/// first violation found. Checks run in this order: empty key, NUL bytes, length of the
/// NFC-normalized key in bytes, leading slash/backslash, per-segment checks, and finally
/// reserved top-level prefixes.
///
/// * `max_length_bytes` - maximum byte length of the NFC-normalized key.
/// * `is_windows` - enables the Windows-specific segment rules.
/// * `reserved_prefixes` - extra top-level segments to reject, in addition to the
///   built-in internal folders and the system root.
pub fn validate_object_key(
object_key: &str,
max_length_bytes: usize,
is_windows: bool,
reserved_prefixes: Option<&[&str]>,
) -> Option<String> {
if object_key.is_empty() {
return Some("Object key required".to_string());
}
if object_key.contains('\0') {
return Some("Object key contains null bytes".to_string());
}
// All remaining checks operate on the NFC-normalized form of the key.
let normalized: String = object_key.nfc().collect();
if normalized.len() > max_length_bytes {
return Some(format!(
"Object key exceeds maximum length of {} bytes",
max_length_bytes
));
}
if normalized.starts_with('/') || normalized.starts_with('\\') {
return Some("Object key cannot start with a slash".to_string());
}
// Backslash counts as a separator when compiled for Windows or when Windows rules
// were requested; otherwise only '/' separates segments.
let parts: Vec<&str> = if cfg!(windows) || is_windows {
normalized.split(['/', '\\']).collect()
} else {
normalized.split('/').collect()
};
for part in &parts {
// Empty segments (e.g. from "a//b" or a trailing slash) are tolerated.
if part.is_empty() {
continue;
}
if *part == ".." {
return Some("Object key contains parent directory references".to_string());
}
if *part == "." {
return Some("Object key contains invalid segments".to_string());
}
if part.chars().any(|c| (c as u32) < 32) {
return Some("Object key contains control characters".to_string());
}
if is_windows {
if part.chars().any(|c| WINDOWS_ILLEGAL_CHARS.contains(&c)) {
return Some(
"Object key contains characters not supported on Windows filesystems"
.to_string(),
);
}
if part.ends_with(' ') || part.ends_with('.') {
return Some(
"Object key segments cannot end with spaces or periods on Windows".to_string(),
);
}
// Segments ending in '.' or ' ' were already rejected above, so the trim is a
// no-op here; the comparison is effectively a case-insensitive exact match.
let trimmed = part.trim_end_matches(['.', ' ']).to_uppercase();
if WINDOWS_RESERVED.contains(&trimmed.as_str()) {
return Some(format!("Invalid filename segment: {}", part));
}
}
}
// Only the first non-empty segment is compared against the reserved names.
let non_empty_parts: Vec<&str> = parts.iter().filter(|p| !p.is_empty()).copied().collect();
if let Some(top) = non_empty_parts.first() {
if INTERNAL_FOLDERS.contains(top) || *top == SYSTEM_ROOT {
return Some("Object key uses a reserved prefix".to_string());
}
if let Some(prefixes) = reserved_prefixes {
for prefix in prefixes {
if *top == *prefix {
return Some("Object key uses a reserved prefix".to_string());
}
}
}
}
None
}
/// Validates a bucket name.
///
/// Returns `None` when the name is acceptable, otherwise `Some(message)` for the first
/// violated rule: 3..=63 bytes long; starts and ends with a lowercase ASCII letter or
/// digit; contains only lowercase letters, digits, `.` and `-`; has no `..`; and is not
/// shaped like a dotted-quad IP address.
pub fn validate_bucket_name(bucket_name: &str) -> Option<String> {
    let name_bytes = bucket_name.as_bytes();
    if !(3..=63).contains(&name_bytes.len()) {
        return Some("Bucket name must be between 3 and 63 characters".to_string());
    }

    let is_lower_alnum = |b: u8| b.is_ascii_lowercase() || b.is_ascii_digit();
    let edges_ok = match (name_bytes.first(), name_bytes.last()) {
        (Some(&first), Some(&last)) => is_lower_alnum(first) && is_lower_alnum(last),
        _ => false,
    };
    if !edges_ok {
        return Some(
            "Bucket name must start and end with a lowercase letter or digit".to_string(),
        );
    }

    let has_illegal_byte = name_bytes
        .iter()
        .any(|&b| !(is_lower_alnum(b) || b == b'.' || b == b'-'));
    if has_illegal_byte {
        return Some(
            "Bucket name can only contain lowercase letters, digits, dots, and hyphens"
                .to_string(),
        );
    }

    if bucket_name.contains("..") {
        return Some("Bucket name must not contain consecutive periods".to_string());
    }
    if IP_REGEX.is_match(bucket_name) {
        return Some("Bucket name must not be formatted as an IP address".to_string());
    }
    None
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the non-Windows validation used by most cases below.
    fn check_key(key: &str) -> Option<String> {
        validate_object_key(key, 1024, false, None)
    }

    #[test]
    fn test_valid_bucket_names() {
        for name in ["my-bucket", "test123", "my.bucket.name"] {
            assert!(validate_bucket_name(name).is_none());
        }
    }

    #[test]
    fn test_invalid_bucket_names() {
        let rejected = [
            "ab",
            "My-Bucket",
            "-bucket",
            "bucket-",
            "my..bucket",
            "192.168.1.1",
        ];
        for name in rejected {
            assert!(validate_bucket_name(name).is_some());
        }
    }

    #[test]
    fn test_valid_object_keys() {
        for key in ["file.txt", "path/to/file.txt", "a"] {
            assert!(check_key(key).is_none());
        }
    }

    #[test]
    fn test_invalid_object_keys() {
        let rejected = [
            "",
            "/leading-slash",
            "path/../escape",
            ".myfsio.sys/secret",
            ".meta/data",
        ];
        for key in rejected {
            assert!(check_key(key).is_some());
        }
    }

    #[test]
    fn test_object_key_max_length() {
        let long_key = "a".repeat(1025);
        assert!(check_key(&long_key).is_some());
        let ok_key = "a".repeat(1024);
        assert!(check_key(&ok_key).is_none());
    }

    #[test]
    fn test_windows_validation() {
        for key in ["CON", "file<name", "file.txt "] {
            assert!(validate_object_key(key, 1024, true, None).is_some());
        }
    }
}

View File

@@ -0,0 +1,10 @@
[package]
name = "myfsio-xml"
version = "0.1.0"
edition = "2021"
[dependencies]
myfsio-common = { path = "../myfsio-common" }
quick-xml = { workspace = true }
serde = { workspace = true }
chrono = { workspace = true }

View File

@@ -0,0 +1,14 @@
pub mod response;
pub mod request;
use quick_xml::Writer;
use std::io::Cursor;
/// Renders a single `<tag>text</tag>` element into a freshly allocated string.
///
/// The element is written into an in-memory buffer, so the `unwrap` calls can
/// only fire on a writer failure or on non-UTF-8 output from the XML writer.
pub fn write_xml_element(tag: &str, text: &str) -> String {
    let sink = Cursor::new(Vec::new());
    let mut xml = Writer::new(sink);
    let content = quick_xml::events::BytesText::new(text);
    xml.create_element(tag).write_text_content(content).unwrap();
    let bytes = xml.into_inner().into_inner();
    String::from_utf8(bytes).unwrap()
}

View File

@@ -0,0 +1,159 @@
use quick_xml::events::Event;
use quick_xml::Reader;
/// Parsed form of a multi-object delete request body, as produced by
/// `parse_delete_objects`.
#[derive(Debug, Default)]
pub struct DeleteObjectsRequest {
/// One entry per `<Object>` element that carried a `<Key>`.
pub objects: Vec<ObjectIdentifier>,
/// `true` only when the `<Quiet>` element's trimmed text is exactly `true`.
pub quiet: bool,
}
/// A single object named inside a delete request.
#[derive(Debug)]
pub struct ObjectIdentifier {
/// Trimmed text of the `<Key>` element.
pub key: String,
/// Trimmed text of the `<VersionId>` element, when one was supplied.
pub version_id: Option<String>,
}
/// Parsed form of a `CompleteMultipartUpload` request body, as produced by
/// `parse_complete_multipart_upload`.
#[derive(Debug, Default)]
pub struct CompleteMultipartUpload {
/// Parts listed in the request; the parser sorts them by ascending part number.
pub parts: Vec<CompletedPart>,
}
/// One `<Part>` entry of a `CompleteMultipartUpload` request.
#[derive(Debug)]
pub struct CompletedPart {
/// Value of the `<PartNumber>` element.
pub part_number: u32,
/// Value of the `<ETag>` element with surrounding double quotes stripped.
pub etag: String,
}
/// Parses the XML body of a `CompleteMultipartUpload` request.
///
/// Every `<Part>` element that contains both a `<PartNumber>` and an `<ETag>`
/// becomes a [`CompletedPart`]; a `<Part>` missing either child is skipped.
/// ETags are stored without their surrounding double quotes and the returned
/// parts are sorted by ascending part number.
///
/// # Errors
///
/// Returns a message string when the XML is malformed, when text cannot be
/// unescaped, or when a `<PartNumber>` is not a valid `u32`.
pub fn parse_complete_multipart_upload(xml: &str) -> Result<CompleteMultipartUpload, String> {
    let mut reader = Reader::from_str(xml);
    let mut result = CompleteMultipartUpload::default();
    let mut buf = Vec::new();
    // Name of the element whose start tag was seen most recently and which has
    // not been closed yet; empty while between elements.
    let mut current_tag = String::new();
    let mut part_number: Option<u32> = None;
    let mut etag: Option<String> = None;
    let mut in_part = false;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                current_tag = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                if current_tag == "Part" {
                    in_part = true;
                    part_number = None;
                    etag = None;
                }
            }
            Ok(Event::Text(ref e)) => {
                if in_part {
                    let text = e.unescape().map_err(|e| e.to_string())?;
                    match current_tag.as_str() {
                        "PartNumber" => {
                            let number = text
                                .trim()
                                .parse()
                                .map_err(|e: std::num::ParseIntError| e.to_string())?;
                            part_number = Some(number);
                        }
                        "ETag" => {
                            etag = Some(text.trim().trim_matches('"').to_string());
                        }
                        _ => {}
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                if name == "Part" && in_part {
                    if let (Some(pn), Some(et)) = (part_number.take(), etag.take()) {
                        result.parts.push(CompletedPart {
                            part_number: pn,
                            etag: et,
                        });
                    }
                    in_part = false;
                }
                // Forget the element that just closed. Without this, the
                // whitespace that pretty-printed bodies place after
                // `</PartNumber>` or `</ETag>` would be treated as that
                // element's text: an empty part number fails to parse and an
                // empty string overwrites the ETag.
                current_tag.clear();
            }
            Ok(Event::Eof) => break,
            Err(e) => return Err(format!("XML parse error: {}", e)),
            _ => {}
        }
        buf.clear();
    }
    result.parts.sort_by_key(|p| p.part_number);
    Ok(result)
}
/// Parses the XML body of a multi-object delete request.
///
/// Every `<Object>` element that contains a `<Key>` becomes an
/// [`ObjectIdentifier`] (with its optional `<VersionId>`); an `<Object>`
/// without a key is skipped. `quiet` is set from the `<Quiet>` element and is
/// `true` only when its trimmed text is exactly `true`.
///
/// # Errors
///
/// Returns a message string when the XML is malformed or text cannot be
/// unescaped.
pub fn parse_delete_objects(xml: &str) -> Result<DeleteObjectsRequest, String> {
    let mut reader = Reader::from_str(xml);
    let mut result = DeleteObjectsRequest::default();
    let mut buf = Vec::new();
    // Name of the element whose start tag was seen most recently and which has
    // not been closed yet; empty while between elements.
    let mut current_tag = String::new();
    let mut current_key: Option<String> = None;
    let mut current_version_id: Option<String> = None;
    let mut in_object = false;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                current_tag = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                if current_tag == "Object" {
                    in_object = true;
                    current_key = None;
                    current_version_id = None;
                }
            }
            Ok(Event::Text(ref e)) => {
                let text = e.unescape().map_err(|e| e.to_string())?;
                match current_tag.as_str() {
                    "Key" if in_object => {
                        current_key = Some(text.trim().to_string());
                    }
                    "VersionId" if in_object => {
                        current_version_id = Some(text.trim().to_string());
                    }
                    "Quiet" => {
                        result.quiet = text.trim() == "true";
                    }
                    _ => {}
                }
            }
            Ok(Event::End(ref e)) => {
                let name = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                if name == "Object" && in_object {
                    if let Some(key) = current_key.take() {
                        result.objects.push(ObjectIdentifier {
                            key,
                            version_id: current_version_id.take(),
                        });
                    }
                    in_object = false;
                }
                // Forget the element that just closed. Without this, the
                // whitespace that pretty-printed bodies place after `</Key>`,
                // `</VersionId>` or `</Quiet>` would be treated as that
                // element's text, replacing the key with an empty string or
                // flipping `quiet` back to `false`.
                current_tag.clear();
            }
            Ok(Event::Eof) => break,
            Err(e) => return Err(format!("XML parse error: {}", e)),
            _ => {}
        }
        buf.clear();
    }
    Ok(result)
}
// Unit tests for the request parsers.
#[cfg(test)]
mod tests {
use super::*;
// Parts are supplied out of order and with quoted ETags; the parser must
// return them sorted by part number with the quotes removed.
#[test]
fn test_parse_complete_multipart() {
let xml = r#"<CompleteMultipartUpload>
<Part><PartNumber>2</PartNumber><ETag>"etag2"</ETag></Part>
<Part><PartNumber>1</PartNumber><ETag>"etag1"</ETag></Part>
</CompleteMultipartUpload>"#;
let result = parse_complete_multipart_upload(xml).unwrap();
assert_eq!(result.parts.len(), 2);
assert_eq!(result.parts[0].part_number, 1);
assert_eq!(result.parts[0].etag, "etag1");
assert_eq!(result.parts[1].part_number, 2);
assert_eq!(result.parts[1].etag, "etag2");
}
}

View File

@@ -0,0 +1,363 @@
use chrono::{DateTime, Utc};
use myfsio_common::types::{BucketMeta, ObjectMeta};
use quick_xml::events::{BytesDecl, BytesEnd, BytesStart, BytesText, Event};
use quick_xml::Writer;
use std::io::Cursor;
/// Formats a UTC timestamp as `YYYY-MM-DDTHH:MM:SS.mmmZ` (millisecond
/// precision), the form used for timestamps in the XML responses below.
pub fn format_s3_datetime(dt: &DateTime<Utc>) -> String {
    const S3_TIMESTAMP_FORMAT: &str = "%Y-%m-%dT%H:%M:%S%.3fZ";
    format!("{}", dt.format(S3_TIMESTAMP_FORMAT))
}
/// Builds the `ListAllMyBucketsResult` document: an `Owner` section followed
/// by one `Bucket` entry (name and creation date) per element of `buckets`.
pub fn list_buckets_xml(owner_id: &str, owner_name: &str, buckets: &[BucketMeta]) -> String {
    let mut xml = Writer::new(Cursor::new(Vec::new()));
    let declaration = BytesDecl::new("1.0", Some("UTF-8"), None);
    xml.write_event(Event::Decl(declaration)).unwrap();

    let root = BytesStart::new("ListAllMyBucketsResult")
        .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]);
    xml.write_event(Event::Start(root)).unwrap();

    // Owner section.
    xml.write_event(Event::Start(BytesStart::new("Owner"))).unwrap();
    for (tag, value) in [("ID", owner_id), ("DisplayName", owner_name)] {
        write_text_element(&mut xml, tag, value);
    }
    xml.write_event(Event::End(BytesEnd::new("Owner"))).unwrap();

    // Bucket listing.
    xml.write_event(Event::Start(BytesStart::new("Buckets"))).unwrap();
    buckets.iter().for_each(|entry| {
        let created = format_s3_datetime(&entry.creation_date);
        xml.write_event(Event::Start(BytesStart::new("Bucket"))).unwrap();
        write_text_element(&mut xml, "Name", &entry.name);
        write_text_element(&mut xml, "CreationDate", &created);
        xml.write_event(Event::End(BytesEnd::new("Bucket"))).unwrap();
    });
    xml.write_event(Event::End(BytesEnd::new("Buckets"))).unwrap();

    xml.write_event(Event::End(BytesEnd::new("ListAllMyBucketsResult"))).unwrap();
    let raw = xml.into_inner().into_inner();
    String::from_utf8(raw).unwrap()
}
/// Builds the `ListBucketResult` document for a ListObjectsV2 response.
///
/// `Delimiter` is emitted only when non-empty, and the two continuation-token
/// elements only when the corresponding argument is `Some`. Each object
/// becomes a `Contents` entry (its `ETag` is written quoted and only when
/// present; `StorageClass` falls back to `STANDARD`), and each common prefix
/// becomes a `CommonPrefixes` entry.
pub fn list_objects_v2_xml(
    bucket_name: &str,
    prefix: &str,
    delimiter: &str,
    max_keys: usize,
    objects: &[ObjectMeta],
    common_prefixes: &[String],
    is_truncated: bool,
    continuation_token: Option<&str>,
    next_continuation_token: Option<&str>,
    key_count: usize,
) -> String {
    let mut xml = Writer::new(Cursor::new(Vec::new()));
    let declaration = BytesDecl::new("1.0", Some("UTF-8"), None);
    xml.write_event(Event::Decl(declaration)).unwrap();
    let root = BytesStart::new("ListBucketResult")
        .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]);
    xml.write_event(Event::Start(root)).unwrap();

    // Listing parameters echoed back to the caller.
    write_text_element(&mut xml, "Name", bucket_name);
    write_text_element(&mut xml, "Prefix", prefix);
    if !delimiter.is_empty() {
        write_text_element(&mut xml, "Delimiter", delimiter);
    }
    let max_keys_text = max_keys.to_string();
    let key_count_text = key_count.to_string();
    let truncated_text = is_truncated.to_string();
    write_text_element(&mut xml, "MaxKeys", &max_keys_text);
    write_text_element(&mut xml, "KeyCount", &key_count_text);
    write_text_element(&mut xml, "IsTruncated", &truncated_text);
    let tokens = [
        ("ContinuationToken", continuation_token),
        ("NextContinuationToken", next_continuation_token),
    ];
    for (tag, token) in tokens {
        if let Some(value) = token {
            write_text_element(&mut xml, tag, value);
        }
    }

    // One `Contents` entry per object.
    for object in objects {
        xml.write_event(Event::Start(BytesStart::new("Contents"))).unwrap();
        write_text_element(&mut xml, "Key", &object.key);
        let modified = format_s3_datetime(&object.last_modified);
        write_text_element(&mut xml, "LastModified", &modified);
        if let Some(ref tag_value) = object.etag {
            let quoted = format!("\"{}\"", tag_value);
            write_text_element(&mut xml, "ETag", &quoted);
        }
        write_text_element(&mut xml, "Size", &object.size.to_string());
        let class = object.storage_class.as_deref().unwrap_or("STANDARD");
        write_text_element(&mut xml, "StorageClass", class);
        xml.write_event(Event::End(BytesEnd::new("Contents"))).unwrap();
    }

    // One `CommonPrefixes` entry per rolled-up prefix.
    for common in common_prefixes {
        xml.write_event(Event::Start(BytesStart::new("CommonPrefixes"))).unwrap();
        write_text_element(&mut xml, "Prefix", common);
        xml.write_event(Event::End(BytesEnd::new("CommonPrefixes"))).unwrap();
    }

    xml.write_event(Event::End(BytesEnd::new("ListBucketResult"))).unwrap();
    let raw = xml.into_inner().into_inner();
    String::from_utf8(raw).unwrap()
}
/// Builds the `ListBucketResult` document for a ListObjects (V1) response.
///
/// `Marker` is always emitted, even when empty. `Delimiter` is emitted only
/// when non-empty, and `NextMarker` only when a delimiter is in use, the
/// listing is truncated and a non-empty `next_marker` was supplied. Each
/// object becomes a `Contents` entry: its `ETag` is written quoted and only
/// when present, and `StorageClass` falls back to `STANDARD`, matching what
/// `list_objects_v2_xml` emits for the same object.
pub fn list_objects_v1_xml(
    bucket_name: &str,
    prefix: &str,
    marker: &str,
    delimiter: &str,
    max_keys: usize,
    objects: &[ObjectMeta],
    common_prefixes: &[String],
    is_truncated: bool,
    next_marker: Option<&str>,
) -> String {
    let mut writer = Writer::new(Cursor::new(Vec::new()));
    writer
        .write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), None)))
        .unwrap();
    let start = BytesStart::new("ListBucketResult")
        .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]);
    writer.write_event(Event::Start(start)).unwrap();
    write_text_element(&mut writer, "Name", bucket_name);
    write_text_element(&mut writer, "Prefix", prefix);
    write_text_element(&mut writer, "Marker", marker);
    write_text_element(&mut writer, "MaxKeys", &max_keys.to_string());
    write_text_element(&mut writer, "IsTruncated", &is_truncated.to_string());
    if !delimiter.is_empty() {
        write_text_element(&mut writer, "Delimiter", delimiter);
        if is_truncated {
            if let Some(nm) = next_marker.filter(|nm| !nm.is_empty()) {
                write_text_element(&mut writer, "NextMarker", nm);
            }
        }
    }
    for obj in objects {
        writer
            .write_event(Event::Start(BytesStart::new("Contents")))
            .unwrap();
        write_text_element(&mut writer, "Key", &obj.key);
        write_text_element(&mut writer, "LastModified", &format_s3_datetime(&obj.last_modified));
        if let Some(ref etag) = obj.etag {
            write_text_element(&mut writer, "ETag", &format!("\"{}\"", etag));
        }
        write_text_element(&mut writer, "Size", &obj.size.to_string());
        // The V1 `Contents` element carries `StorageClass` just like V2 does.
        write_text_element(
            &mut writer,
            "StorageClass",
            obj.storage_class.as_deref().unwrap_or("STANDARD"),
        );
        writer
            .write_event(Event::End(BytesEnd::new("Contents")))
            .unwrap();
    }
    for cp in common_prefixes {
        writer
            .write_event(Event::Start(BytesStart::new("CommonPrefixes")))
            .unwrap();
        write_text_element(&mut writer, "Prefix", cp);
        writer
            .write_event(Event::End(BytesEnd::new("CommonPrefixes")))
            .unwrap();
    }
    writer
        .write_event(Event::End(BytesEnd::new("ListBucketResult")))
        .unwrap();
    String::from_utf8(writer.into_inner().into_inner()).unwrap()
}
/// Appends `<tag>text</tag>` to `writer` as a start, text and end event, in
/// that order. Panics if the writer reports an error.
fn write_text_element(writer: &mut Writer<Cursor<Vec<u8>>>, tag: &str, text: &str) {
    let events = [
        Event::Start(BytesStart::new(tag)),
        Event::Text(BytesText::new(text)),
        Event::End(BytesEnd::new(tag)),
    ];
    for event in events {
        writer.write_event(event).unwrap();
    }
}
/// Builds the `InitiateMultipartUploadResult` document containing the bucket,
/// key and newly assigned upload id.
pub fn initiate_multipart_upload_xml(bucket: &str, key: &str, upload_id: &str) -> String {
    let mut xml = Writer::new(Cursor::new(Vec::new()));
    let declaration = BytesDecl::new("1.0", Some("UTF-8"), None);
    xml.write_event(Event::Decl(declaration)).unwrap();
    let root = BytesStart::new("InitiateMultipartUploadResult")
        .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]);
    xml.write_event(Event::Start(root)).unwrap();
    for (tag, value) in [("Bucket", bucket), ("Key", key), ("UploadId", upload_id)] {
        write_text_element(&mut xml, tag, value);
    }
    xml.write_event(Event::End(BytesEnd::new("InitiateMultipartUploadResult"))).unwrap();
    let raw = xml.into_inner().into_inner();
    String::from_utf8(raw).unwrap()
}
/// Builds the `CompleteMultipartUploadResult` document. The ETag is wrapped
/// in double quotes before being written.
pub fn complete_multipart_upload_xml(
    bucket: &str,
    key: &str,
    etag: &str,
    location: &str,
) -> String {
    let mut xml = Writer::new(Cursor::new(Vec::new()));
    let declaration = BytesDecl::new("1.0", Some("UTF-8"), None);
    xml.write_event(Event::Decl(declaration)).unwrap();
    let root = BytesStart::new("CompleteMultipartUploadResult")
        .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]);
    xml.write_event(Event::Start(root)).unwrap();
    let quoted_etag = format!("\"{}\"", etag);
    let fields = [
        ("Location", location),
        ("Bucket", bucket),
        ("Key", key),
        ("ETag", quoted_etag.as_str()),
    ];
    for (tag, value) in fields {
        write_text_element(&mut xml, tag, value);
    }
    xml.write_event(Event::End(BytesEnd::new("CompleteMultipartUploadResult"))).unwrap();
    let raw = xml.into_inner().into_inner();
    String::from_utf8(raw).unwrap()
}
/// Builds the `CopyObjectResult` document from an ETag (written quoted) and
/// an already formatted last-modified timestamp.
pub fn copy_object_result_xml(etag: &str, last_modified: &str) -> String {
    let mut xml = Writer::new(Cursor::new(Vec::new()));
    let declaration = BytesDecl::new("1.0", Some("UTF-8"), None);
    xml.write_event(Event::Decl(declaration)).unwrap();
    let root = BytesStart::new("CopyObjectResult")
        .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]);
    xml.write_event(Event::Start(root)).unwrap();
    let quoted_etag = format!("\"{}\"", etag);
    write_text_element(&mut xml, "ETag", &quoted_etag);
    write_text_element(&mut xml, "LastModified", last_modified);
    xml.write_event(Event::End(BytesEnd::new("CopyObjectResult"))).unwrap();
    let raw = xml.into_inner().into_inner();
    String::from_utf8(raw).unwrap()
}
pub fn delete_result_xml(
deleted: &[(String, Option<String>)],
errors: &[(String, String, String)],
quiet: bool,
) -> String {
let mut writer = Writer::new(Cursor::new(Vec::new()));
writer.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), None))).unwrap();
let start = BytesStart::new("DeleteResult")
.with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]);
writer.write_event(Event::Start(start)).unwrap();
if !quiet {
for (key, version_id) in deleted {
writer.write_event(Event::Start(BytesStart::new("Deleted"))).unwrap();
write_text_element(&mut writer, "Key", key);
if let Some(vid) = version_id {
write_text_element(&mut writer, "VersionId", vid);
}
writer.write_event(Event::End(BytesEnd::new("Deleted"))).unwrap();
}
}
for (key, code, message) in errors {
writer.write_event(Event::Start(BytesStart::new("Error"))).unwrap();
write_text_element(&mut writer, "Key", key);
write_text_element(&mut writer, "Code", code);
write_text_element(&mut writer, "Message", message);
writer.write_event(Event::End(BytesEnd::new("Error"))).unwrap();
}
writer.write_event(Event::End(BytesEnd::new("DeleteResult"))).unwrap();
String::from_utf8(writer.into_inner().into_inner()).unwrap()
}
/// Builds the `ListMultipartUploadsResult` document: the bucket name followed
/// by one `Upload` entry (key, upload id, initiation time) per upload.
pub fn list_multipart_uploads_xml(
    bucket: &str,
    uploads: &[myfsio_common::types::MultipartUploadInfo],
) -> String {
    let mut xml = Writer::new(Cursor::new(Vec::new()));
    let declaration = BytesDecl::new("1.0", Some("UTF-8"), None);
    xml.write_event(Event::Decl(declaration)).unwrap();
    let root = BytesStart::new("ListMultipartUploadsResult")
        .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]);
    xml.write_event(Event::Start(root)).unwrap();
    write_text_element(&mut xml, "Bucket", bucket);
    uploads.iter().for_each(|pending| {
        let started = format_s3_datetime(&pending.initiated);
        xml.write_event(Event::Start(BytesStart::new("Upload"))).unwrap();
        write_text_element(&mut xml, "Key", &pending.key);
        write_text_element(&mut xml, "UploadId", &pending.upload_id);
        write_text_element(&mut xml, "Initiated", &started);
        xml.write_event(Event::End(BytesEnd::new("Upload"))).unwrap();
    });
    xml.write_event(Event::End(BytesEnd::new("ListMultipartUploadsResult"))).unwrap();
    let raw = xml.into_inner().into_inner();
    String::from_utf8(raw).unwrap()
}
/// Builds the `ListPartsResult` document: bucket, key and upload id followed
/// by one `Part` entry per uploaded part. A part's `LastModified` is written
/// only when the part carries one.
pub fn list_parts_xml(
    bucket: &str,
    key: &str,
    upload_id: &str,
    parts: &[myfsio_common::types::PartMeta],
) -> String {
    let mut xml = Writer::new(Cursor::new(Vec::new()));
    let declaration = BytesDecl::new("1.0", Some("UTF-8"), None);
    xml.write_event(Event::Decl(declaration)).unwrap();
    let root = BytesStart::new("ListPartsResult")
        .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]);
    xml.write_event(Event::Start(root)).unwrap();
    for (tag, value) in [("Bucket", bucket), ("Key", key), ("UploadId", upload_id)] {
        write_text_element(&mut xml, tag, value);
    }
    for piece in parts {
        xml.write_event(Event::Start(BytesStart::new("Part"))).unwrap();
        let number_text = piece.part_number.to_string();
        let quoted_etag = format!("\"{}\"", piece.etag);
        let size_text = piece.size.to_string();
        write_text_element(&mut xml, "PartNumber", &number_text);
        write_text_element(&mut xml, "ETag", &quoted_etag);
        write_text_element(&mut xml, "Size", &size_text);
        if let Some(ref modified) = piece.last_modified {
            write_text_element(&mut xml, "LastModified", &format_s3_datetime(modified));
        }
        xml.write_event(Event::End(BytesEnd::new("Part"))).unwrap();
    }
    xml.write_event(Event::End(BytesEnd::new("ListPartsResult"))).unwrap();
    let raw = xml.into_inner().into_inner();
    String::from_utf8(raw).unwrap()
}
// Smoke tests for the response builders: each renders a minimal document and
// checks that the expected elements appear in the output string.
#[cfg(test)]
mod tests {
use super::*;
use chrono::Utc;
// A single bucket must show up with its name, the owner id and the root element.
#[test]
fn test_list_buckets_xml() {
let buckets = vec![BucketMeta {
name: "test-bucket".to_string(),
creation_date: Utc::now(),
}];
let xml = list_buckets_xml("owner-id", "owner-name", &buckets);
assert!(xml.contains("<Name>test-bucket</Name>"));
assert!(xml.contains("<ID>owner-id</ID>"));
assert!(xml.contains("ListAllMyBucketsResult"));
}
// A one-object, non-truncated V2 listing reports the key, size and truncation flag.
#[test]
fn test_list_objects_v2_xml() {
let objects = vec![ObjectMeta::new("file.txt".to_string(), 1024, Utc::now())];
let xml = list_objects_v2_xml(
"my-bucket", "", "/", 1000, &objects, &[], false, None, None, 1,
);
assert!(xml.contains("<Key>file.txt</Key>"));
assert!(xml.contains("<Size>1024</Size>"));
assert!(xml.contains("<IsTruncated>false</IsTruncated>"));
}
// The V1 listing always writes `Marker`, even when the marker is empty.
#[test]
fn test_list_objects_v1_xml() {
let objects = vec![ObjectMeta::new("file.txt".to_string(), 1024, Utc::now())];
let xml = list_objects_v1_xml(
"my-bucket",
"",
"",
"/",
1000,
&objects,
&[],
false,
None,
);
assert!(xml.contains("<Key>file.txt</Key>"));
assert!(xml.contains("<Size>1024</Size>"));
assert!(xml.contains("<Marker></Marker>"));
}
}

24
myfsio_core/Cargo.toml Normal file
View File

@@ -0,0 +1,24 @@
[package]
name = "myfsio_core"
version = "0.1.0"
edition = "2021"
[lib]
name = "myfsio_core"
crate-type = ["cdylib"]
[dependencies]
pyo3 = { version = "0.28", features = ["extension-module"] }
hmac = "0.12"
sha2 = "0.10"
md-5 = "0.10"
hex = "0.4"
unicode-normalization = "0.1"
serde_json = "1"
regex = "1"
lru = "0.14"
parking_lot = "0.12"
percent-encoding = "2"
aes-gcm = "0.10"
hkdf = "0.12"
uuid = { version = "1", features = ["v4"] }

View File

@@ -0,0 +1,11 @@
[build-system]
requires = ["maturin>=1.0,<2.0"]
build-backend = "maturin"
[project]
name = "myfsio_core"
version = "0.1.0"
requires-python = ">=3.10"
[tool.maturin]
features = ["pyo3/extension-module"]

192
myfsio_core/src/crypto.rs Normal file
View File

@@ -0,0 +1,192 @@
use aes_gcm::aead::Aead;
use aes_gcm::{Aes256Gcm, KeyInit, Nonce};
use hkdf::Hkdf;
use pyo3::exceptions::{PyIOError, PyValueError};
use pyo3::prelude::*;
use sha2::Sha256;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom, Write};
/// Plaintext bytes read per chunk when the caller passes no chunk size (or 0).
const DEFAULT_CHUNK_SIZE: usize = 65536;
/// Width in bytes of the big-endian `u32` fields in the encrypted file: the
/// leading chunk count and each chunk's length prefix.
const HEADER_SIZE: usize = 4;
/// Reads from `reader` until `buf` is full or the reader reports end of input,
/// and returns how many bytes were stored.
///
/// Reads that fail with `ErrorKind::Interrupted` are retried; any other error
/// is returned to the caller unchanged.
fn read_exact_chunk(reader: &mut impl Read, buf: &mut [u8]) -> std::io::Result<usize> {
    let capacity = buf.len();
    let mut stored = 0;
    while stored < capacity {
        let received = match reader.read(&mut buf[stored..]) {
            Ok(count) => count,
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        };
        if received == 0 {
            // End of input before the buffer filled up.
            break;
        }
        stored += received;
    }
    Ok(stored)
}
/// Derives the 12-byte nonce for one chunk with HKDF-SHA256.
///
/// `base_nonce` is used as the HKDF salt, the literal `chunk_nonce` as the
/// input key material, and the big-endian bytes of `chunk_index` as the
/// `info` parameter. Encryption and decryption must keep these identical.
fn derive_chunk_nonce(base_nonce: &[u8], chunk_index: u32) -> Result<[u8; 12], String> {
    let info = chunk_index.to_be_bytes();
    let mut nonce = [0u8; 12];
    let kdf = Hkdf::<Sha256>::new(Some(base_nonce), b"chunk_nonce");
    let outcome = kdf.expand(&info, &mut nonce);
    match outcome {
        Ok(()) => Ok(nonce),
        Err(err) => Err(format!("HKDF expand failed: {}", err)),
    }
}
/// Encrypts `input_path` into `output_path` with AES-256-GCM, one chunk at a
/// time, and returns the number of chunks written.
///
/// Output layout: a 4-byte big-endian chunk count, then for every chunk a
/// 4-byte big-endian ciphertext length followed by the ciphertext. The chunk
/// count is written as a zero placeholder first and filled in once all chunks
/// are on disk. Each chunk uses a nonce derived from `base_nonce` and its
/// index via `derive_chunk_nonce`.
///
/// `key` must be 32 bytes and `base_nonce` 12 bytes; a `chunk_size` of 0 is
/// replaced by `DEFAULT_CHUNK_SIZE`. The file work runs inside `py.detach`.
///
/// # Errors
///
/// * `ValueError` for a wrong key or nonce length, a nonce-derivation or
///   encryption failure, a ciphertext chunk too large for its 4-byte length
///   prefix, or more chunks than a `u32` can count.
/// * `IOError` for any failure opening, reading, writing or seeking a file.
#[pyfunction]
#[pyo3(signature = (input_path, output_path, key, base_nonce, chunk_size=DEFAULT_CHUNK_SIZE))]
pub fn encrypt_stream_chunked(
    py: Python<'_>,
    input_path: &str,
    output_path: &str,
    key: &[u8],
    base_nonce: &[u8],
    chunk_size: usize,
) -> PyResult<u32> {
    if key.len() != 32 {
        return Err(PyValueError::new_err(format!(
            "Key must be 32 bytes, got {}",
            key.len()
        )));
    }
    if base_nonce.len() != 12 {
        return Err(PyValueError::new_err(format!(
            "Base nonce must be 12 bytes, got {}",
            base_nonce.len()
        )));
    }
    let chunk_size = if chunk_size == 0 {
        DEFAULT_CHUNK_SIZE
    } else {
        chunk_size
    };
    let inp = input_path.to_owned();
    let out = output_path.to_owned();
    let key_arr: [u8; 32] = key.try_into().expect("key length checked above");
    let nonce_arr: [u8; 12] = base_nonce
        .try_into()
        .expect("nonce length checked above");
    py.detach(move || {
        let cipher = Aes256Gcm::new(&key_arr.into());
        let mut infile = File::open(&inp)
            .map_err(|e| PyIOError::new_err(format!("Failed to open input: {}", e)))?;
        let mut outfile = File::create(&out)
            .map_err(|e| PyIOError::new_err(format!("Failed to create output: {}", e)))?;
        // Placeholder for the chunk count; overwritten after the last chunk.
        outfile
            .write_all(&[0u8; 4])
            .map_err(|e| PyIOError::new_err(format!("Failed to write header: {}", e)))?;
        let mut buf = vec![0u8; chunk_size];
        let mut chunk_index: u32 = 0;
        loop {
            let n = read_exact_chunk(&mut infile, &mut buf)
                .map_err(|e| PyIOError::new_err(format!("Failed to read: {}", e)))?;
            if n == 0 {
                break;
            }
            let nonce_bytes = derive_chunk_nonce(&nonce_arr, chunk_index)
                .map_err(|e| PyValueError::new_err(e))?;
            let nonce = Nonce::from_slice(&nonce_bytes);
            let encrypted = cipher
                .encrypt(nonce, &buf[..n])
                .map_err(|e| PyValueError::new_err(format!("Encrypt failed: {}", e)))?;
            // The length prefix is only 4 bytes wide; refuse to truncate it.
            let size = u32::try_from(encrypted.len()).map_err(|_| {
                PyValueError::new_err(format!(
                    "Encrypted chunk of {} bytes does not fit the 4-byte length prefix",
                    encrypted.len()
                ))
            })?;
            outfile
                .write_all(&size.to_be_bytes())
                .map_err(|e| PyIOError::new_err(format!("Failed to write chunk size: {}", e)))?;
            outfile
                .write_all(&encrypted)
                .map_err(|e| PyIOError::new_err(format!("Failed to write chunk: {}", e)))?;
            // A wrapped counter would repeat nonces and record a wrong chunk
            // count, so overflow is reported instead of being allowed.
            chunk_index = chunk_index.checked_add(1).ok_or_else(|| {
                PyValueError::new_err("Too many chunks: chunk counter exceeds u32 range")
            })?;
        }
        outfile
            .seek(SeekFrom::Start(0))
            .map_err(|e| PyIOError::new_err(format!("Failed to seek: {}", e)))?;
        outfile
            .write_all(&chunk_index.to_be_bytes())
            .map_err(|e| PyIOError::new_err(format!("Failed to write chunk count: {}", e)))?;
        Ok(chunk_index)
    })
}
/// Decrypts a file produced by `encrypt_stream_chunked` into `output_path`
/// and returns the chunk count read from the file header.
///
/// The input is expected to start with a 4-byte big-endian chunk count,
/// followed by that many records of a 4-byte big-endian length plus
/// ciphertext. Each chunk is decrypted with AES-256-GCM using the nonce
/// derived from `base_nonce` and the chunk index. `key` must be 32 bytes and
/// `base_nonce` 12 bytes. The file work runs inside `py.detach`.
///
/// # Errors
///
/// * `ValueError` for a wrong key or nonce length, a nonce-derivation failure
///   or a chunk that fails to decrypt.
/// * `IOError` for any failure opening, reading or writing a file.
#[pyfunction]
pub fn decrypt_stream_chunked(
    py: Python<'_>,
    input_path: &str,
    output_path: &str,
    key: &[u8],
    base_nonce: &[u8],
) -> PyResult<u32> {
    let key_len = key.len();
    if key_len != 32 {
        let message = format!("Key must be 32 bytes, got {}", key_len);
        return Err(PyValueError::new_err(message));
    }
    let nonce_len = base_nonce.len();
    if nonce_len != 12 {
        let message = format!("Base nonce must be 12 bytes, got {}", nonce_len);
        return Err(PyValueError::new_err(message));
    }
    let key_arr: [u8; 32] = key.try_into().unwrap();
    let nonce_arr: [u8; 12] = base_nonce.try_into().unwrap();
    let source_path = input_path.to_owned();
    let target_path = output_path.to_owned();
    py.detach(move || {
        let cipher = Aes256Gcm::new(&key_arr.into());
        let mut source = File::open(&source_path)
            .map_err(|err| PyIOError::new_err(format!("Failed to open input: {}", err)))?;
        let mut target = File::create(&target_path)
            .map_err(|err| PyIOError::new_err(format!("Failed to create output: {}", err)))?;

        // The same 4-byte scratch buffer serves the header and every length prefix.
        let mut word = [0u8; HEADER_SIZE];
        source
            .read_exact(&mut word)
            .map_err(|err| PyIOError::new_err(format!("Failed to read header: {}", err)))?;
        let total_chunks = u32::from_be_bytes(word);

        for index in 0..total_chunks {
            if let Err(err) = source.read_exact(&mut word) {
                return Err(PyIOError::new_err(format!(
                    "Failed to read chunk {} size: {}",
                    index, err
                )));
            }
            let ciphertext_len = u32::from_be_bytes(word) as usize;
            let mut ciphertext = vec![0u8; ciphertext_len];
            if let Err(err) = source.read_exact(&mut ciphertext) {
                return Err(PyIOError::new_err(format!(
                    "Failed to read chunk {}: {}",
                    index, err
                )));
            }
            let nonce_bytes = derive_chunk_nonce(&nonce_arr, index)
                .map_err(|err| PyValueError::new_err(err))?;
            let nonce = Nonce::from_slice(&nonce_bytes);
            let plaintext = match cipher.decrypt(nonce, ciphertext.as_ref()) {
                Ok(bytes) => bytes,
                Err(err) => {
                    return Err(PyValueError::new_err(format!(
                        "Decrypt chunk {} failed: {}",
                        index, err
                    )));
                }
            };
            if let Err(err) = target.write_all(&plaintext) {
                return Err(PyIOError::new_err(format!(
                    "Failed to write chunk {}: {}",
                    index, err
                )));
            }
        }
        Ok(total_chunks)
    })
}

View File

@@ -0,0 +1,90 @@
use md5::{Digest, Md5};
use pyo3::exceptions::PyIOError;
use pyo3::prelude::*;
use sha2::Sha256;
use std::fs::File;
use std::io::Read;
/// Size in bytes of the read buffer used by the file-hashing functions.
const CHUNK_SIZE: usize = 65536;
#[pyfunction]
pub fn md5_file(py: Python<'_>, path: &str) -> PyResult<String> {
let path = path.to_owned();
py.detach(move || {
let mut file = File::open(&path)
.map_err(|e| PyIOError::new_err(format!("Failed to open file: {}", e)))?;
let mut hasher = Md5::new();
let mut buf = vec![0u8; CHUNK_SIZE];
loop {
let n = file
.read(&mut buf)
.map_err(|e| PyIOError::new_err(format!("Failed to read file: {}", e)))?;
if n == 0 {
break;
}
hasher.update(&buf[..n]);
}
Ok(format!("{:x}", hasher.finalize()))
})
}
/// Returns the lowercase hex MD5 digest of `data`.
#[pyfunction]
pub fn md5_bytes(data: &[u8]) -> String {
    let digest = Md5::digest(data);
    format!("{:x}", digest)
}
#[pyfunction]
pub fn sha256_file(py: Python<'_>, path: &str) -> PyResult<String> {
let path = path.to_owned();
py.detach(move || {
let mut file = File::open(&path)
.map_err(|e| PyIOError::new_err(format!("Failed to open file: {}", e)))?;
let mut hasher = Sha256::new();
let mut buf = vec![0u8; CHUNK_SIZE];
loop {
let n = file
.read(&mut buf)
.map_err(|e| PyIOError::new_err(format!("Failed to read file: {}", e)))?;
if n == 0 {
break;
}
hasher.update(&buf[..n]);
}
Ok(format!("{:x}", hasher.finalize()))
})
}
/// Returns the lowercase hex SHA-256 digest of `data`.
#[pyfunction]
pub fn sha256_bytes(data: &[u8]) -> String {
    let digest = Sha256::digest(data);
    format!("{:x}", digest)
}
#[pyfunction]
pub fn md5_sha256_file(py: Python<'_>, path: &str) -> PyResult<(String, String)> {
let path = path.to_owned();
py.detach(move || {
let mut file = File::open(&path)
.map_err(|e| PyIOError::new_err(format!("Failed to open file: {}", e)))?;
let mut md5_hasher = Md5::new();
let mut sha_hasher = Sha256::new();
let mut buf = vec![0u8; CHUNK_SIZE];
loop {
let n = file
.read(&mut buf)
.map_err(|e| PyIOError::new_err(format!("Failed to read file: {}", e)))?;
if n == 0 {
break;
}
md5_hasher.update(&buf[..n]);
sha_hasher.update(&buf[..n]);
}
Ok((
format!("{:x}", md5_hasher.finalize()),
format!("{:x}", sha_hasher.finalize()),
))
})
}

51
myfsio_core/src/lib.rs Normal file
View File

@@ -0,0 +1,51 @@
mod crypto;
mod hashing;
mod metadata;
mod sigv4;
mod storage;
mod streaming;
mod validation;
use pyo3::prelude::*;
// Python extension module definition. `init` runs when the module is created
// and registers every exported function, grouped by the Rust submodule that
// implements it.
#[pymodule]
mod myfsio_core {
use super::*;
#[pymodule_init]
fn init(m: &Bound<'_, PyModule>) -> PyResult<()> {
// sigv4: signature verification and its building blocks.
m.add_function(wrap_pyfunction!(sigv4::verify_sigv4_signature, m)?)?;
m.add_function(wrap_pyfunction!(sigv4::derive_signing_key, m)?)?;
m.add_function(wrap_pyfunction!(sigv4::compute_signature, m)?)?;
m.add_function(wrap_pyfunction!(sigv4::build_string_to_sign, m)?)?;
m.add_function(wrap_pyfunction!(sigv4::constant_time_compare, m)?)?;
m.add_function(wrap_pyfunction!(sigv4::clear_signing_key_cache, m)?)?;
// hashing: MD5 / SHA-256 over files and byte strings.
m.add_function(wrap_pyfunction!(hashing::md5_file, m)?)?;
m.add_function(wrap_pyfunction!(hashing::md5_bytes, m)?)?;
m.add_function(wrap_pyfunction!(hashing::sha256_file, m)?)?;
m.add_function(wrap_pyfunction!(hashing::sha256_bytes, m)?)?;
m.add_function(wrap_pyfunction!(hashing::md5_sha256_file, m)?)?;
// validation: object-key and bucket-name checks.
m.add_function(wrap_pyfunction!(validation::validate_object_key, m)?)?;
m.add_function(wrap_pyfunction!(validation::validate_bucket_name, m)?)?;
// metadata: index entry lookup.
m.add_function(wrap_pyfunction!(metadata::read_index_entry, m)?)?;
// storage: index maintenance and bucket scans.
m.add_function(wrap_pyfunction!(storage::write_index_entry, m)?)?;
m.add_function(wrap_pyfunction!(storage::delete_index_entry, m)?)?;
m.add_function(wrap_pyfunction!(storage::check_bucket_contents, m)?)?;
m.add_function(wrap_pyfunction!(storage::shallow_scan, m)?)?;
m.add_function(wrap_pyfunction!(storage::bucket_stats_scan, m)?)?;
m.add_function(wrap_pyfunction!(storage::search_objects_scan, m)?)?;
m.add_function(wrap_pyfunction!(storage::build_object_cache, m)?)?;
// streaming: upload streaming and multipart assembly.
m.add_function(wrap_pyfunction!(streaming::stream_to_file_with_md5, m)?)?;
m.add_function(wrap_pyfunction!(streaming::assemble_parts_with_md5, m)?)?;
// crypto: chunked file encryption and decryption.
m.add_function(wrap_pyfunction!(crypto::encrypt_stream_chunked, m)?)?;
m.add_function(wrap_pyfunction!(crypto::decrypt_stream_chunked, m)?)?;
Ok(())
}
}

View File

@@ -0,0 +1,71 @@
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList, PyString};
use serde_json::Value;
use std::fs;
/// Deepest JSON nesting level `value_to_py` converts before raising `ValueError`.
const MAX_DEPTH: u32 = 64;
/// Recursively converts a `serde_json::Value` into the matching Python object:
/// `None`, `bool`, `int`, `float`, `str`, `list` or `dict`.
///
/// Integers are tried as `i64` first and then as `u64`, so values above
/// `i64::MAX` stay exact Python ints instead of being rounded through `f64`.
/// A number that fits none of the three representations becomes `None`.
///
/// `depth` is the nesting level of `v`; callers start at 0.
///
/// # Errors
///
/// Raises `ValueError` when nesting exceeds `MAX_DEPTH`, and propagates any
/// error from building the Python list or dict.
fn value_to_py(py: Python<'_>, v: &Value, depth: u32) -> PyResult<Py<PyAny>> {
    if depth > MAX_DEPTH {
        return Err(PyValueError::new_err("JSON nesting too deep"));
    }
    match v {
        Value::Null => Ok(py.None()),
        Value::Bool(b) => Ok((*b).into_pyobject(py)?.to_owned().into_any().unbind()),
        Value::Number(n) => {
            if let Some(i) = n.as_i64() {
                Ok(i.into_pyobject(py)?.into_any().unbind())
            } else if let Some(u) = n.as_u64() {
                // Unsigned values beyond i64::MAX: keep them as exact integers.
                Ok(u.into_pyobject(py)?.into_any().unbind())
            } else if let Some(f) = n.as_f64() {
                Ok(f.into_pyobject(py)?.into_any().unbind())
            } else {
                Ok(py.None())
            }
        }
        Value::String(s) => Ok(PyString::new(py, s).into_any().unbind()),
        Value::Array(arr) => {
            let list = PyList::empty(py);
            for item in arr {
                list.append(value_to_py(py, item, depth + 1)?)?;
            }
            Ok(list.into_any().unbind())
        }
        Value::Object(map) => {
            let dict = PyDict::new(py);
            for (k, val) in map {
                dict.set_item(k, value_to_py(py, val, depth + 1)?)?;
            }
            Ok(dict.into_any().unbind())
        }
    }
}
/// Looks up `entry_name` in the JSON object stored in the file at `path` and
/// returns it converted to a Python object.
///
/// Returns `None` when the file cannot be read, is not valid JSON, is not a
/// JSON object at the top level, or has no such entry. File reading and JSON
/// parsing run inside `py.detach`; only the conversion to Python objects
/// happens afterwards.
///
/// # Errors
///
/// Propagates errors from `value_to_py` (for example excessive nesting).
#[pyfunction]
pub fn read_index_entry(
    py: Python<'_>,
    path: &str,
    entry_name: &str,
) -> PyResult<Option<Py<PyAny>>> {
    let index_path = path.to_owned();
    let wanted = entry_name.to_owned();
    let found: Option<Value> = py.detach(move || {
        // Unreadable or unparsable index files are treated as "no entry".
        let text = fs::read_to_string(&index_path).ok()?;
        match serde_json::from_str::<Value>(&text).ok()? {
            Value::Object(mut entries) => entries.remove(&wanted),
            _ => None,
        }
    });
    found
        .map(|value| value_to_py(py, &value, 0))
        .transpose()
}

193
myfsio_core/src/sigv4.rs Normal file
View File

@@ -0,0 +1,193 @@
use hmac::{Hmac, Mac};
use lru::LruCache;
use parking_lot::Mutex;
use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC};
use pyo3::prelude::*;
use sha2::{Digest, Sha256};
use std::num::NonZeroUsize;
use std::sync::LazyLock;
use std::time::Instant;
type HmacSha256 = Hmac<Sha256>;
/// A derived signing key together with the moment it was cached.
struct CacheEntry {
// Derived signing key bytes.
key: Vec<u8>,
// Insertion time; compared against `CACHE_TTL_SECS` on lookup.
created: Instant,
}
/// Process-wide LRU cache (capacity 256) of derived signing keys, keyed by
/// `(secret_key, date_stamp, region, service)` and guarded by a mutex.
static SIGNING_KEY_CACHE: LazyLock<Mutex<LruCache<(String, String, String, String), CacheEntry>>> =
LazyLock::new(|| Mutex::new(LruCache::new(NonZeroUsize::new(256).unwrap())));
/// Age in seconds after which a cached signing key is discarded and re-derived.
const CACHE_TTL_SECS: u64 = 60;
/// Percent-encoding set for canonical query strings: every non-alphanumeric
/// byte is encoded except `-`, `_`, `.` and `~`.
const AWS_ENCODE_SET: &AsciiSet = &NON_ALPHANUMERIC
.remove(b'-')
.remove(b'_')
.remove(b'.')
.remove(b'~');
/// Computes HMAC-SHA256 of `msg` under `key` and returns the raw tag bytes.
fn hmac_sha256(key: &[u8], msg: &[u8]) -> Vec<u8> {
    let mut signer =
        HmacSha256::new_from_slice(key).expect("HMAC key length is always valid");
    signer.update(msg);
    let tag = signer.finalize().into_bytes();
    tag.to_vec()
}
/// Returns the lowercase hex SHA-256 digest of `data`.
fn sha256_hex(data: &[u8]) -> String {
    let digest = Sha256::digest(data);
    hex::encode(digest)
}
/// Percent-encodes `input` using `AWS_ENCODE_SET`.
fn aws_uri_encode(input: &str) -> String {
    let raw = input.as_bytes();
    percent_encode(raw, AWS_ENCODE_SET).collect()
}
/// Returns the SigV4 signing key for the given secret, date, region and
/// service, using `SIGNING_KEY_CACHE` to avoid re-deriving it.
///
/// A cached key younger than `CACHE_TTL_SECS` is returned directly; an older
/// one is evicted. On a miss the key is derived as the HMAC-SHA256 chain
/// `"AWS4" + secret -> date -> region -> service -> "aws4_request"` and stored.
/// The cache lock is not held while the key is being derived.
fn derive_signing_key_cached(
    secret_key: &str,
    date_stamp: &str,
    region: &str,
    service: &str,
) -> Vec<u8> {
    let lookup = (
        secret_key.to_owned(),
        date_stamp.to_owned(),
        region.to_owned(),
        service.to_owned(),
    );
    {
        let mut guard = SIGNING_KEY_CACHE.lock();
        // Outer `Some`: an entry exists. Inner `Some`: it is still fresh.
        let hit = guard.get(&lookup).map(|entry| {
            let fresh = entry.created.elapsed().as_secs() < CACHE_TTL_SECS;
            fresh.then(|| entry.key.clone())
        });
        match hit {
            Some(Some(cached_key)) => return cached_key,
            Some(None) => {
                guard.pop(&lookup);
            }
            None => {}
        }
    }

    let seed = hmac_sha256(
        format!("AWS4{}", secret_key).as_bytes(),
        date_stamp.as_bytes(),
    );
    let steps = [region.as_bytes(), service.as_bytes(), &b"aws4_request"[..]];
    let signing_key = steps
        .into_iter()
        .fold(seed, |key, message| hmac_sha256(&key, message));

    let fresh_entry = CacheEntry {
        key: signing_key.clone(),
        created: Instant::now(),
    };
    SIGNING_KEY_CACHE.lock().put(lookup, fresh_entry);
    signing_key
}
/// Compares two byte slices for equality without stopping at the first
/// differing byte.
///
/// Slices of different length are rejected immediately; otherwise the XOR of
/// every byte pair is accumulated and the slices are equal when the
/// accumulator is still zero.
fn constant_time_compare_inner(a: &[u8], b: &[u8]) -> bool {
    if a.len() != b.len() {
        return false;
    }
    let difference = a
        .iter()
        .zip(b)
        .fold(0u8, |acc, (lhs, rhs)| acc | (lhs ^ rhs));
    difference == 0
}
/// Recomputes the AWS Signature Version 4 signature for a request and checks
/// it against `provided_signature`.
///
/// Steps:
/// 1. Canonical query string: every name and value is URI-encoded with
///    `AWS_ENCODE_SET`, and the encoded pairs are then sorted by name and, for
///    equal names, by value. SigV4 sorts after encoding, which can differ
///    from sorting the raw strings when they contain characters that get
///    percent-encoded.
/// 2. Canonical headers: `header_values` are written in the order given, with
///    lowercased names and whitespace runs collapsed to single spaces. An
///    empty `expect` header is written as `100-continue`.
/// 3. The canonical request is hashed into the string to sign, which is then
///    signed with the (cached) derived signing key.
///
/// Returns `true` when the hex signature matches `provided_signature`; the
/// comparison uses `constant_time_compare_inner`.
#[pyfunction]
pub fn verify_sigv4_signature(
    method: &str,
    canonical_uri: &str,
    query_params: Vec<(String, String)>,
    signed_headers_str: &str,
    header_values: Vec<(String, String)>,
    payload_hash: &str,
    amz_date: &str,
    date_stamp: &str,
    region: &str,
    service: &str,
    secret_key: &str,
    provided_signature: &str,
) -> bool {
    // Encode first, then sort: tuple ordering gives name-then-value order on
    // the encoded forms.
    let mut encoded_params: Vec<(String, String)> = query_params
        .iter()
        .map(|(k, v)| (aws_uri_encode(k), aws_uri_encode(v)))
        .collect();
    encoded_params.sort();
    let canonical_query_string = encoded_params
        .iter()
        .map(|(k, v)| format!("{}={}", k, v))
        .collect::<Vec<_>>()
        .join("&");
    let mut canonical_headers = String::new();
    for (name, value) in &header_values {
        let lower_name = name.to_lowercase();
        let normalized = value.split_whitespace().collect::<Vec<_>>().join(" ");
        let final_value = if lower_name == "expect" && normalized.is_empty() {
            "100-continue"
        } else {
            &normalized
        };
        canonical_headers.push_str(&lower_name);
        canonical_headers.push(':');
        canonical_headers.push_str(final_value);
        canonical_headers.push('\n');
    }
    // `canonical_headers` already ends with a newline, so the separator that
    // follows it produces the blank line the canonical request format needs.
    let canonical_request = format!(
        "{}\n{}\n{}\n{}\n{}\n{}",
        method, canonical_uri, canonical_query_string, canonical_headers, signed_headers_str, payload_hash
    );
    let credential_scope = format!("{}/{}/{}/aws4_request", date_stamp, region, service);
    let cr_hash = sha256_hex(canonical_request.as_bytes());
    let string_to_sign = format!(
        "AWS4-HMAC-SHA256\n{}\n{}\n{}",
        amz_date, credential_scope, cr_hash
    );
    let signing_key = derive_signing_key_cached(secret_key, date_stamp, region, service);
    let calculated = hmac_sha256(&signing_key, string_to_sign.as_bytes());
    let calculated_hex = hex::encode(&calculated);
    constant_time_compare_inner(calculated_hex.as_bytes(), provided_signature.as_bytes())
}
/// Python-exposed wrapper around `derive_signing_key_cached`.
///
/// Returns the signing key bytes for the given secret key, date stamp,
/// region and service, going through the same cache as signature verification.
#[pyfunction]
pub fn derive_signing_key(
secret_key: &str,
date_stamp: &str,
region: &str,
service: &str,
) -> Vec<u8> {
derive_signing_key_cached(secret_key, date_stamp, region, service)
}
/// Signs `string_to_sign` with `signing_key` using HMAC-SHA256 and returns the
/// tag as a hexadecimal string.
#[pyfunction]
pub fn compute_signature(signing_key: &[u8], string_to_sign: &str) -> String {
    hex::encode(hmac_sha256(signing_key, string_to_sign.as_bytes()))
}
/// Builds the SigV4 string-to-sign: the algorithm label, `amz_date`,
/// `credential_scope` and the hex SHA-256 of `canonical_request`, joined by
/// newlines.
#[pyfunction]
pub fn build_string_to_sign(
    amz_date: &str,
    credential_scope: &str,
    canonical_request: &str,
) -> String {
    let request_digest = sha256_hex(canonical_request.as_bytes());
    [
        "AWS4-HMAC-SHA256",
        amz_date,
        credential_scope,
        request_digest.as_str(),
    ]
    .join("\n")
}
/// Python-exposed string comparison that delegates to
/// `constant_time_compare_inner` on the UTF-8 bytes of `a` and `b`.
#[pyfunction]
pub fn constant_time_compare(a: &str, b: &str) -> bool {
constant_time_compare_inner(a.as_bytes(), b.as_bytes())
}
/// Removes every entry from `SIGNING_KEY_CACHE`, so the next derivation for
/// any scope recomputes the key.
#[pyfunction]
pub fn clear_signing_key_cache() {
SIGNING_KEY_CACHE.lock().clear();
}

817
myfsio_core/src/storage.rs Normal file
View File

@@ -0,0 +1,817 @@
use pyo3::exceptions::PyIOError;
use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList, PyString, PyTuple};
use serde_json::Value;
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use std::time::SystemTime;
/// Directory names that the scans in this module skip instead of treating
/// them as object data.
const INTERNAL_FOLDERS: &[&str] = &[".meta", ".versions", ".multipart"];
/// Converts a `SystemTime` to fractional seconds since the Unix epoch.
///
/// Times before the epoch yield `0.0`.
fn system_time_to_epoch(t: SystemTime) -> f64 {
    match t.duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs_f64(),
        Err(_) => 0.0,
    }
}
/// Pulls the `__etag__` string value out of raw metadata bytes without a full
/// JSON parse.
///
/// Finds the first `"__etag__"` marker and returns the text between the next
/// two double quotes. Returns `None` if the marker or either quote is missing,
/// or if the value is not valid UTF-8. Escaped quotes inside the value are not
/// interpreted.
fn extract_etag_from_meta_bytes(content: &[u8]) -> Option<String> {
    let marker: &[u8] = b"\"__etag__\"";
    let marker_at = content
        .windows(marker.len())
        .position(|window| window == marker)?;
    let tail = &content[marker_at + marker.len()..];

    // Pieces: [before opening quote, value, rest after closing quote].
    let mut pieces = tail.splitn(3, |&byte| byte == b'"');
    pieces.next()?;
    let value = pieces.next()?;
    // A third piece only exists when the closing quote was found.
    pieces.next()?;

    String::from_utf8(value.to_vec()).ok()
}
/// Reports whether at least one regular file exists anywhere under `root`.
///
/// Returns `false` when `root` is not a directory. Unreadable directories and
/// entries are skipped, and symlinked directories are not descended into.
fn has_any_file(root: &str) -> bool {
    let root_path = Path::new(root);
    if !root_path.is_dir() {
        return false;
    }
    let mut pending = vec![root_path.to_path_buf()];
    while let Some(dir) = pending.pop() {
        let Ok(listing) = fs::read_dir(&dir) else {
            continue;
        };
        for entry in listing.flatten() {
            let Ok(kind) = entry.file_type() else {
                continue;
            };
            if kind.is_file() {
                return true;
            }
            if kind.is_dir() && !kind.is_symlink() {
                pending.push(entry.path());
            }
        }
    }
    false
}
/// Inserts or replaces `entry_name` in the JSON index file at `path`.
///
/// `entry_data_json` must be valid JSON; otherwise a `PyIOError` is raised.
/// The parent directory is created on a best-effort basis. An index file that
/// is missing or cannot be parsed as a JSON object is treated as empty, so the
/// rewritten file then contains only the new entry. Runs with the
/// interpreter detached.
#[pyfunction]
pub fn write_index_entry(
    py: Python<'_>,
    path: &str,
    entry_name: &str,
    entry_data_json: &str,
) -> PyResult<()> {
    let index_path = path.to_owned();
    let name = entry_name.to_owned();
    let payload = entry_data_json.to_owned();
    py.detach(move || -> PyResult<()> {
        let entry_value = serde_json::from_str::<Value>(&payload)
            .map_err(|e| PyIOError::new_err(format!("Failed to parse entry data: {}", e)))?;

        if let Some(parent) = Path::new(&index_path).parent() {
            // Best effort: a real failure surfaces when the file is written.
            let _ = fs::create_dir_all(parent);
        }

        let mut index: serde_json::Map<String, Value> = fs::read_to_string(&index_path)
            .ok()
            .and_then(|text| serde_json::from_str(&text).ok())
            .unwrap_or_default();
        index.insert(name, entry_value);

        let serialized = serde_json::to_string(&Value::Object(index))
            .map_err(|e| PyIOError::new_err(format!("Failed to serialize index: {}", e)))?;
        fs::write(&index_path, serialized)
            .map_err(|e| PyIOError::new_err(format!("Failed to write index: {}", e)))
    })
}
/// Removes `entry_name` from the JSON index file at `path`.
///
/// Returns `Ok(false)` when the file cannot be read, cannot be parsed as a
/// JSON object, or does not contain the entry. Returns `Ok(true)` after the
/// entry was removed; if that leaves the index empty the file is deleted
/// (best effort) instead of being rewritten. Serialization or write failures
/// raise `PyIOError`.
#[pyfunction]
pub fn delete_index_entry(py: Python<'_>, path: &str, entry_name: &str) -> PyResult<bool> {
    let index_path = path.to_owned();
    let name = entry_name.to_owned();
    py.detach(move || -> PyResult<bool> {
        let Ok(text) = fs::read_to_string(&index_path) else {
            return Ok(false);
        };
        let Ok(mut index) = serde_json::from_str::<serde_json::Map<String, Value>>(&text) else {
            return Ok(false);
        };
        if index.remove(&name).is_none() {
            return Ok(false);
        }
        if index.is_empty() {
            let _ = fs::remove_file(&index_path);
            return Ok(true);
        }
        let serialized = serde_json::to_string(&Value::Object(index))
            .map_err(|e| PyIOError::new_err(format!("Failed to serialize index: {}", e)))?;
        fs::write(&index_path, serialized)
            .map_err(|e| PyIOError::new_err(format!("Failed to write index: {}", e)))?;
        Ok(true)
    })
}
/// Reports whether a bucket still holds objects, archived versions or
/// multipart data.
///
/// Returns `(has_objects, has_versions, has_multipart)`:
/// * `has_objects` — a regular, non-symlink file exists under `bucket_path`,
///   ignoring top-level entries named in `INTERNAL_FOLDERS`;
/// * `has_versions` / `has_multipart` — any of the given roots contains a
///   file, as decided by `has_any_file`.
///
/// Each search stops at the first hit. Runs with the interpreter detached.
#[pyfunction]
pub fn check_bucket_contents(
    py: Python<'_>,
    bucket_path: &str,
    version_roots: Vec<String>,
    multipart_roots: Vec<String>,
) -> PyResult<(bool, bool, bool)> {
    let bucket_root = bucket_path.to_owned();
    py.detach(move || -> PyResult<(bool, bool, bool)> {
        let root = Path::new(&bucket_root);
        let mut has_objects = false;
        if root.is_dir() {
            let mut pending = vec![root.to_path_buf()];
            'walk: while let Some(dir) = pending.pop() {
                let at_root = dir == root;
                let Ok(listing) = fs::read_dir(&dir) else {
                    continue;
                };
                for entry in listing.flatten() {
                    let Ok(kind) = entry.file_type() else {
                        continue;
                    };
                    // Internal folders are only reserved directly under the bucket root.
                    let reserved = at_root
                        && entry
                            .file_name()
                            .to_str()
                            .map_or(false, |name| INTERNAL_FOLDERS.contains(&name));
                    if reserved {
                        continue;
                    }
                    if kind.is_file() && !kind.is_symlink() {
                        has_objects = true;
                        break 'walk;
                    }
                    if kind.is_dir() && !kind.is_symlink() {
                        pending.push(entry.path());
                    }
                }
            }
        }
        let has_versions = version_roots.iter().any(|dir| has_any_file(dir));
        let has_multipart = multipart_roots.iter().any(|dir| has_any_file(dir));
        Ok((has_objects, has_versions, has_multipart))
    })
}
/// Lists the immediate children of `target_dir` (no recursion) as object keys.
///
/// * `prefix` is prepended to every entry name to form the key; directories
///   additionally get a trailing `/`.
/// * `meta_cache_json` is parsed as a JSON object of key -> etag strings; if it
///   fails to parse, an empty map is used and every etag is `None`.
///
/// Entries named in `INTERNAL_FOLDERS`, entries whose name is not valid
/// Unicode, and symlinks are skipped. If `target_dir` cannot be read, all
/// three lists are empty.
///
/// Returns a Python dict with:
/// * `"files"` — list of `(key, size, mtime, etag_or_None)` sorted by key;
/// * `"dirs"` — sorted list of directory prefixes;
/// * `"merged_keys"` — list of `(key, is_dir)` merging both sorted lists.
#[pyfunction]
pub fn shallow_scan(
py: Python<'_>,
target_dir: &str,
prefix: &str,
meta_cache_json: &str,
) -> PyResult<Py<PyAny>> {
let target_owned = target_dir.to_owned();
let prefix_owned = prefix.to_owned();
let cache_owned = meta_cache_json.to_owned();
// The filesystem work runs detached from the interpreter; only plain Rust
// values cross back, and Python objects are built afterwards.
let result: (
Vec<(String, u64, f64, Option<String>)>,
Vec<String>,
Vec<(String, bool)>,
) = py.detach(move || -> PyResult<(
Vec<(String, u64, f64, Option<String>)>,
Vec<String>,
Vec<(String, bool)>,
)> {
let meta_cache: HashMap<String, String> =
serde_json::from_str(&cache_owned).unwrap_or_default();
let mut files: Vec<(String, u64, f64, Option<String>)> = Vec::new();
let mut dirs: Vec<String> = Vec::new();
let entries = match fs::read_dir(&target_owned) {
Ok(e) => e,
Err(_) => return Ok((files, dirs, Vec::new())),
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let name = match entry.file_name().into_string() {
Ok(n) => n,
Err(_) => continue,
};
if INTERNAL_FOLDERS.contains(&name.as_str()) {
continue;
}
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
let cp = format!("{}{}/", prefix_owned, name);
dirs.push(cp);
} else if ft.is_file() && !ft.is_symlink() {
let key = format!("{}{}", prefix_owned, name);
let md = match entry.metadata() {
Ok(m) => m,
Err(_) => continue,
};
let size = md.len();
let mtime = md
.modified()
.map(system_time_to_epoch)
.unwrap_or(0.0);
let etag = meta_cache.get(&key).cloned();
files.push((key, size, mtime, etag));
}
}
files.sort_by(|a, b| a.0.cmp(&b.0));
dirs.sort();
// Two-way merge of the sorted file keys and directory prefixes; on equal
// keys the file entry is emitted first.
let mut merged: Vec<(String, bool)> = Vec::with_capacity(files.len() + dirs.len());
let mut fi = 0;
let mut di = 0;
while fi < files.len() && di < dirs.len() {
if files[fi].0 <= dirs[di] {
merged.push((files[fi].0.clone(), false));
fi += 1;
} else {
merged.push((dirs[di].clone(), true));
di += 1;
}
}
while fi < files.len() {
merged.push((files[fi].0.clone(), false));
fi += 1;
}
while di < dirs.len() {
merged.push((dirs[di].clone(), true));
di += 1;
}
Ok((files, dirs, merged))
})?;
let (files, dirs, merged) = result;
// Convert the collected Rust values into the Python result dict.
let dict = PyDict::new(py);
let files_list = PyList::empty(py);
for (key, size, mtime, etag) in &files {
let etag_py: Py<PyAny> = match etag {
Some(e) => PyString::new(py, e).into_any().unbind(),
None => py.None(),
};
let tuple = PyTuple::new(py, &[
PyString::new(py, key).into_any().unbind(),
size.into_pyobject(py)?.into_any().unbind(),
mtime.into_pyobject(py)?.into_any().unbind(),
etag_py,
])?;
files_list.append(tuple)?;
}
dict.set_item("files", files_list)?;
let dirs_list = PyList::empty(py);
for d in &dirs {
dirs_list.append(PyString::new(py, d))?;
}
dict.set_item("dirs", dirs_list)?;
let merged_list = PyList::empty(py);
for (key, is_dir) in &merged {
let bool_obj: Py<PyAny> = if *is_dir {
true.into_pyobject(py)?.to_owned().into_any().unbind()
} else {
false.into_pyobject(py)?.to_owned().into_any().unbind()
};
let tuple = PyTuple::new(py, &[
PyString::new(py, key).into_any().unbind(),
bool_obj,
])?;
merged_list.append(tuple)?;
}
dict.set_item("merged_keys", merged_list)?;
Ok(dict.into_any().unbind())
}
/// Counts objects and archived versions of a bucket and sums their sizes.
///
/// Returns `(object_count, total_bytes, version_count, version_bytes)`:
/// * objects are the regular, non-symlink files under `bucket_path`, ignoring
///   top-level entries named in `INTERNAL_FOLDERS`;
/// * versions are the regular, non-symlink files under `versions_root` whose
///   name ends with `.bin`.
///
/// A file whose metadata cannot be read is still counted but adds no bytes.
/// Unreadable directories and entries are skipped. Runs with the interpreter
/// detached.
#[pyfunction]
pub fn bucket_stats_scan(
    py: Python<'_>,
    bucket_path: &str,
    versions_root: &str,
) -> PyResult<(u64, u64, u64, u64)> {
    /// Calls `visit` for every regular, non-symlink file below `root`,
    /// optionally skipping `INTERNAL_FOLDERS` entries directly under `root`.
    fn visit_regular_files(
        root: &Path,
        skip_internal_at_root: bool,
        mut visit: impl FnMut(&fs::DirEntry),
    ) {
        if !root.is_dir() {
            return;
        }
        let mut pending = vec![root.to_path_buf()];
        while let Some(dir) = pending.pop() {
            let filter_here = skip_internal_at_root && dir == root;
            let Ok(listing) = fs::read_dir(&dir) else {
                continue;
            };
            for entry in listing.flatten() {
                if filter_here
                    && entry
                        .file_name()
                        .to_str()
                        .map_or(false, |name| INTERNAL_FOLDERS.contains(&name))
                {
                    continue;
                }
                let Ok(kind) = entry.file_type() else {
                    continue;
                };
                if kind.is_symlink() {
                    continue;
                }
                if kind.is_dir() {
                    pending.push(entry.path());
                } else if kind.is_file() {
                    visit(&entry);
                }
            }
        }
    }

    let bucket_root = bucket_path.to_owned();
    let versions_dir = versions_root.to_owned();
    py.detach(move || -> PyResult<(u64, u64, u64, u64)> {
        let (mut object_count, mut total_bytes) = (0u64, 0u64);
        visit_regular_files(Path::new(&bucket_root), true, |entry: &fs::DirEntry| {
            object_count += 1;
            if let Ok(md) = entry.metadata() {
                total_bytes += md.len();
            }
        });

        let (mut version_count, mut version_bytes) = (0u64, 0u64);
        visit_regular_files(Path::new(&versions_dir), false, |entry: &fs::DirEntry| {
            let is_version_blob = entry
                .file_name()
                .to_str()
                .map_or(false, |name| name.ends_with(".bin"));
            if !is_version_blob {
                return;
            }
            version_count += 1;
            if let Ok(md) = entry.metadata() {
                version_bytes += md.len();
            }
        });

        Ok((object_count, total_bytes, version_count, version_bytes))
    })
}
/// Searches object keys under `search_root` for a case-insensitive substring.
///
/// * `bucket_path` — bucket root; keys are the file paths relative to it with
///   `\` replaced by `/`.
/// * `search_root` — directory where the walk starts.
/// * `query` — substring matched against the lowercased key.
/// * `limit` — maximum number of results returned.
///
/// The walk stops once `4 * limit` matches were collected (saturating, so a
/// huge `limit` cannot overflow). Matches are then sorted by key and cut to
/// `limit`. Entries named in `INTERNAL_FOLDERS` directly under the bucket root
/// and all symlinks are skipped. Files whose path cannot be sliced at the
/// bucket-prefix offset (too short, or the offset is not a character
/// boundary) are skipped instead of panicking.
///
/// Returns a Python dict with `"results"` (list of `(key, size, mtime)`) and
/// `"truncated"` (whether more matches were collected than `limit`).
#[pyfunction]
#[pyo3(signature = (bucket_path, search_root, query, limit))]
pub fn search_objects_scan(
    py: Python<'_>,
    bucket_path: &str,
    search_root: &str,
    query: &str,
    limit: usize,
) -> PyResult<Py<PyAny>> {
    let bucket_owned = bucket_path.to_owned();
    let search_owned = search_root.to_owned();
    let query_owned = query.to_owned();
    let result: (Vec<(String, u64, f64)>, bool) = py.detach(
        move || -> PyResult<(Vec<(String, u64, f64)>, bool)> {
            let query_lower = query_owned.to_lowercase();
            // +1 skips the path separator that follows the bucket root.
            let bucket_len = bucket_owned.len() + 1;
            // Over-scan so the sorted, truncated output is more representative;
            // saturate instead of overflowing for very large limits.
            let scan_limit = limit.saturating_mul(4);
            let mut matched: usize = 0;
            let mut results: Vec<(String, u64, f64)> = Vec::new();
            let search_p = Path::new(&search_owned);
            if !search_p.is_dir() {
                return Ok((results, false));
            }
            let bucket_p = Path::new(&bucket_owned);
            let mut stack = vec![search_p.to_path_buf()];
            'scan: while let Some(current) = stack.pop() {
                let is_bucket_root = current == bucket_p;
                let entries = match fs::read_dir(&current) {
                    Ok(e) => e,
                    Err(_) => continue,
                };
                for entry_result in entries {
                    let entry = match entry_result {
                        Ok(e) => e,
                        Err(_) => continue,
                    };
                    if is_bucket_root {
                        if let Some(name) = entry.file_name().to_str() {
                            if INTERNAL_FOLDERS.contains(&name) {
                                continue;
                            }
                        }
                    }
                    let ft = match entry.file_type() {
                        Ok(ft) => ft,
                        Err(_) => continue,
                    };
                    if ft.is_dir() && !ft.is_symlink() {
                        stack.push(entry.path());
                    } else if ft.is_file() && !ft.is_symlink() {
                        let full_path = entry.path();
                        let full_str = full_path.to_string_lossy();
                        // `get` returns None for an out-of-range or
                        // non-char-boundary offset, where indexing would panic.
                        let rel = match full_str.get(bucket_len..) {
                            Some(rel) if !rel.is_empty() => rel,
                            _ => continue,
                        };
                        let key = rel.replace('\\', "/");
                        if key.to_lowercase().contains(&query_lower) {
                            if let Ok(md) = entry.metadata() {
                                let size = md.len();
                                let mtime = md
                                    .modified()
                                    .map(system_time_to_epoch)
                                    .unwrap_or(0.0);
                                results.push((key, size, mtime));
                                matched += 1;
                            }
                        }
                        if matched >= scan_limit {
                            break 'scan;
                        }
                    }
                }
            }
            results.sort_by(|a, b| a.0.cmp(&b.0));
            let truncated = results.len() > limit;
            results.truncate(limit);
            Ok((results, truncated))
        },
    )?;
    let (results, truncated) = result;
    let dict = PyDict::new(py);
    let results_list = PyList::empty(py);
    for (key, size, mtime) in &results {
        let tuple = PyTuple::new(py, &[
            PyString::new(py, key).into_any().unbind(),
            size.into_pyobject(py)?.into_any().unbind(),
            mtime.into_pyobject(py)?.into_any().unbind(),
        ])?;
        results_list.append(tuple)?;
    }
    dict.set_item("results", results_list)?;
    dict.set_item("truncated", truncated)?;
    Ok(dict.into_any().unbind())
}
/// Builds the full object listing of a bucket together with its etag cache.
///
/// Steps, all performed with the interpreter detached:
/// 1. Load the etag index at `etag_index_path` (a JSON object of key -> etag
///    strings) and remember its modification time.
/// 2. Decide whether the cache must be rebuilt: when `meta_root` is a
///    directory and the index has a non-zero mtime, a rebuild is needed if any
///    `*.meta.json` or `_index.json` file under `meta_root` is newer than the
///    index; otherwise a rebuild is needed when the loaded cache is empty.
/// 3. On rebuild, clear the cache and refill it from every `_index.json`
///    (etag taken from `entry["metadata"]["__etag__"]`), then from legacy
///    `*.meta.json` files for keys that are still missing.
/// 4. Walk `bucket_path`, skipping symlinks and anything whose first path
///    component below the bucket is in `INTERNAL_FOLDERS`, and record
///    `(key, size, mtime, etag)` for every file.
///
/// Returns a Python dict with `"etag_cache"` (dict key -> etag), `"objects"`
/// (list of `(key, size, mtime, etag_or_None)`) and `"etag_cache_changed"`
/// (`True` when a rebuild was performed).
#[pyfunction]
pub fn build_object_cache(
py: Python<'_>,
bucket_path: &str,
meta_root: &str,
etag_index_path: &str,
) -> PyResult<Py<PyAny>> {
let bucket_owned = bucket_path.to_owned();
let meta_owned = meta_root.to_owned();
let index_path_owned = etag_index_path.to_owned();
let result: (HashMap<String, String>, Vec<(String, u64, f64, Option<String>)>, bool) =
py.detach(move || -> PyResult<(
HashMap<String, String>,
Vec<(String, u64, f64, Option<String>)>,
bool,
)> {
let mut meta_cache: HashMap<String, String> = HashMap::new();
let mut index_mtime: f64 = 0.0;
let mut etag_cache_changed = false;
// Step 1: load the persisted etag index, if present and parseable.
let index_p = Path::new(&index_path_owned);
if index_p.is_file() {
if let Ok(md) = fs::metadata(&index_path_owned) {
index_mtime = md
.modified()
.map(system_time_to_epoch)
.unwrap_or(0.0);
}
if let Ok(content) = fs::read_to_string(&index_path_owned) {
if let Ok(parsed) = serde_json::from_str::<HashMap<String, String>>(&content) {
meta_cache = parsed;
}
}
}
// Step 2: decide whether the loaded index is stale.
let meta_p = Path::new(&meta_owned);
let mut needs_rebuild = false;
if meta_p.is_dir() && index_mtime > 0.0 {
// Returns true as soon as one metadata file newer than the index is found.
fn check_newer(dir: &Path, index_mtime: f64) -> bool {
let entries = match fs::read_dir(dir) {
Ok(e) => e,
Err(_) => return false,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
if check_newer(&entry.path(), index_mtime) {
return true;
}
} else if ft.is_file() {
if let Some(name) = entry.file_name().to_str() {
if name.ends_with(".meta.json") || name == "_index.json" {
if let Ok(md) = entry.metadata() {
let mt = md
.modified()
.map(system_time_to_epoch)
.unwrap_or(0.0);
if mt > index_mtime {
return true;
}
}
}
}
}
}
false
}
needs_rebuild = check_newer(meta_p, index_mtime);
} else if meta_cache.is_empty() {
needs_rebuild = true;
}
// Step 3: rebuild the etag cache from the metadata tree.
if needs_rebuild && meta_p.is_dir() {
let meta_str = meta_owned.clone();
// +1 skips the path separator that follows the metadata root.
let meta_len = meta_str.len() + 1;
let mut index_files: Vec<String> = Vec::new();
let mut legacy_meta_files: Vec<(String, String)> = Vec::new();
// Recursively gathers `_index.json` paths and (key, path) pairs for
// legacy `*.meta.json` files.
fn collect_meta(
dir: &Path,
meta_len: usize,
index_files: &mut Vec<String>,
legacy_meta_files: &mut Vec<(String, String)>,
) {
let entries = match fs::read_dir(dir) {
Ok(e) => e,
Err(_) => return,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
collect_meta(&entry.path(), meta_len, index_files, legacy_meta_files);
} else if ft.is_file() {
if let Some(name) = entry.file_name().to_str() {
let full = entry.path().to_string_lossy().to_string();
if name == "_index.json" {
index_files.push(full);
} else if name.ends_with(".meta.json") {
if full.len() > meta_len {
let rel = &full[meta_len..];
// 10 == ".meta.json".len(); strip the suffix to get the object key.
let key = if rel.len() > 10 {
rel[..rel.len() - 10].replace('\\', "/")
} else {
continue;
};
legacy_meta_files.push((key, full));
}
}
}
}
}
}
collect_meta(
meta_p,
meta_len,
&mut index_files,
&mut legacy_meta_files,
);
meta_cache.clear();
for idx_path in &index_files {
if let Ok(content) = fs::read_to_string(idx_path) {
if let Ok(idx_data) = serde_json::from_str::<HashMap<String, Value>>(&content) {
let rel_dir = if idx_path.len() > meta_len {
let r = &idx_path[meta_len..];
r.replace('\\', "/")
} else {
String::new()
};
// An index directly in the metadata root has no directory prefix.
let dir_prefix = if rel_dir.ends_with("/_index.json") {
&rel_dir[..rel_dir.len() - "/_index.json".len()]
} else {
""
};
for (entry_name, entry_data) in &idx_data {
let key = if dir_prefix.is_empty() {
entry_name.clone()
} else {
format!("{}/{}", dir_prefix, entry_name)
};
if let Some(meta_obj) = entry_data.get("metadata") {
if let Some(etag) = meta_obj.get("__etag__") {
if let Some(etag_str) = etag.as_str() {
meta_cache.insert(key, etag_str.to_owned());
}
}
}
}
}
}
}
// Legacy per-object files only fill keys the index files did not provide.
for (key, path) in &legacy_meta_files {
if meta_cache.contains_key(key) {
continue;
}
if let Ok(content) = fs::read(path) {
if let Some(etag) = extract_etag_from_meta_bytes(&content) {
meta_cache.insert(key.clone(), etag);
}
}
}
etag_cache_changed = true;
}
// Step 4: walk the bucket and collect every object with its etag.
let bucket_p = Path::new(&bucket_owned);
let bucket_len = bucket_owned.len() + 1;
let mut objects: Vec<(String, u64, f64, Option<String>)> = Vec::new();
if bucket_p.is_dir() {
let mut stack = vec![bucket_p.to_path_buf()];
while let Some(current) = stack.pop() {
let entries = match fs::read_dir(&current) {
Ok(e) => e,
Err(_) => continue,
};
for entry_result in entries {
let entry = match entry_result {
Ok(e) => e,
Err(_) => continue,
};
let ft = match entry.file_type() {
Ok(ft) => ft,
Err(_) => continue,
};
if ft.is_dir() && !ft.is_symlink() {
let full = entry.path();
let full_str = full.to_string_lossy();
if full_str.len() > bucket_len {
// First path component below the bucket root.
let first_part: &str = if let Some(sep_pos) =
full_str[bucket_len..].find(|c: char| c == '\\' || c == '/')
{
&full_str[bucket_len..bucket_len + sep_pos]
} else {
&full_str[bucket_len..]
};
if INTERNAL_FOLDERS.contains(&first_part) {
continue;
}
} else if let Some(name) = entry.file_name().to_str() {
if INTERNAL_FOLDERS.contains(&name) {
continue;
}
}
stack.push(full);
} else if ft.is_file() && !ft.is_symlink() {
let full = entry.path();
let full_str = full.to_string_lossy();
if full_str.len() <= bucket_len {
continue;
}
let rel = &full_str[bucket_len..];
let first_part: &str =
if let Some(sep_pos) = rel.find(|c: char| c == '\\' || c == '/') {
&rel[..sep_pos]
} else {
rel
};
if INTERNAL_FOLDERS.contains(&first_part) {
continue;
}
let key = rel.replace('\\', "/");
if let Ok(md) = entry.metadata() {
let size = md.len();
let mtime = md
.modified()
.map(system_time_to_epoch)
.unwrap_or(0.0);
let etag = meta_cache.get(&key).cloned();
objects.push((key, size, mtime, etag));
}
}
}
}
}
Ok((meta_cache, objects, etag_cache_changed))
})?;
let (meta_cache, objects, etag_cache_changed) = result;
// Convert the collected Rust values into the Python result dict.
let dict = PyDict::new(py);
let cache_dict = PyDict::new(py);
for (k, v) in &meta_cache {
cache_dict.set_item(k, v)?;
}
dict.set_item("etag_cache", cache_dict)?;
let objects_list = PyList::empty(py);
for (key, size, mtime, etag) in &objects {
let etag_py: Py<PyAny> = match etag {
Some(e) => PyString::new(py, e).into_any().unbind(),
None => py.None(),
};
let tuple = PyTuple::new(py, &[
PyString::new(py, key).into_any().unbind(),
size.into_pyobject(py)?.into_any().unbind(),
mtime.into_pyobject(py)?.into_any().unbind(),
etag_py,
])?;
objects_list.append(tuple)?;
}
dict.set_item("objects", objects_list)?;
dict.set_item("etag_cache_changed", etag_cache_changed)?;
Ok(dict.into_any().unbind())
}

View File

@@ -0,0 +1,112 @@
use md5::{Digest, Md5};
use pyo3::exceptions::{PyIOError, PyValueError};
use pyo3::prelude::*;
use std::fs::{self, File};
use std::io::{Read, Write};
use uuid::Uuid;
/// Read size in bytes (256 KiB) used by `stream_to_file_with_md5` when the
/// caller omits `chunk_size` or passes 0.
const DEFAULT_CHUNK_SIZE: usize = 262144;
/// Drains a Python file-like `stream` into a new temporary file while hashing
/// the data with MD5.
///
/// * `stream` — object with a `read(n)` method returning bytes; an empty
///   result ends the copy.
/// * `tmp_dir` — directory for the temp file (created if missing); the file is
///   named `<uuid4>.tmp`.
/// * `chunk_size` — bytes requested per `read`; 0 falls back to
///   `DEFAULT_CHUNK_SIZE`.
///
/// Returns `(tmp_path, md5_hex, total_bytes)`. The file is fsynced before
/// returning. On any error (including a pending Python signal) the temp file
/// is removed and the error is propagated.
#[pyfunction]
#[pyo3(signature = (stream, tmp_dir, chunk_size=DEFAULT_CHUNK_SIZE))]
pub fn stream_to_file_with_md5(
    py: Python<'_>,
    stream: &Bound<'_, PyAny>,
    tmp_dir: &str,
    chunk_size: usize,
) -> PyResult<(String, String, u64)> {
    let chunk_size = match chunk_size {
        0 => DEFAULT_CHUNK_SIZE,
        requested => requested,
    };

    fs::create_dir_all(tmp_dir)
        .map_err(|e| PyIOError::new_err(format!("Failed to create tmp dir: {}", e)))?;
    let tmp_path = std::path::Path::new(tmp_dir)
        .join(format!("{}.tmp", Uuid::new_v4().as_hyphenated()))
        .to_string_lossy()
        .into_owned();
    let mut file = File::create(&tmp_path)
        .map_err(|e| PyIOError::new_err(format!("Failed to create temp file: {}", e)))?;

    let mut hasher = Md5::new();
    let mut total_bytes: u64 = 0;
    let outcome = (|| -> PyResult<()> {
        loop {
            let chunk: Vec<u8> = stream.call_method1("read", (chunk_size,))?.extract()?;
            if chunk.is_empty() {
                break;
            }
            hasher.update(&chunk);
            file.write_all(&chunk)
                .map_err(|e| PyIOError::new_err(format!("Failed to write: {}", e)))?;
            total_bytes += chunk.len() as u64;
            // Let Ctrl-C and other Python signals interrupt long uploads.
            py.check_signals()?;
        }
        file.sync_all()
            .map_err(|e| PyIOError::new_err(format!("Failed to fsync: {}", e)))
    })();

    // Close the handle before either returning the path or deleting the file.
    drop(file);
    match outcome {
        Ok(()) => {
            let md5_hex = format!("{:x}", hasher.finalize());
            Ok((tmp_path, md5_hex, total_bytes))
        }
        Err(err) => {
            let _ = fs::remove_file(&tmp_path);
            Err(err)
        }
    }
}
/// Concatenates the files in `part_paths`, in order, into `dest_path` and
/// returns the MD5 of the combined content as a hex string.
///
/// Raises `ValueError` when `part_paths` is empty and `PyIOError` on any
/// filesystem failure. The destination's parent directory is created if
/// needed and the destination is fsynced before returning. Runs with the
/// interpreter detached.
#[pyfunction]
pub fn assemble_parts_with_md5(
    py: Python<'_>,
    part_paths: Vec<String>,
    dest_path: &str,
) -> PyResult<String> {
    if part_paths.is_empty() {
        return Err(PyValueError::new_err("No parts to assemble"));
    }
    let dest = dest_path.to_owned();
    py.detach(move || -> PyResult<String> {
        if let Some(parent) = std::path::Path::new(&dest).parent() {
            fs::create_dir_all(parent)
                .map_err(|e| PyIOError::new_err(format!("Failed to create dest dir: {}", e)))?;
        }
        let mut target = File::create(&dest)
            .map_err(|e| PyIOError::new_err(format!("Failed to create dest file: {}", e)))?;
        let mut hasher = Md5::new();
        // One 1 MiB buffer reused for every part.
        let mut buf = vec![0u8; 1024 * 1024];
        for part_path in part_paths.iter() {
            let mut part = File::open(part_path).map_err(|e| {
                PyIOError::new_err(format!("Failed to open part {}: {}", part_path, e))
            })?;
            loop {
                let filled = match part.read(&mut buf) {
                    Ok(0) => break,
                    Ok(n) => &buf[..n],
                    Err(e) => {
                        return Err(PyIOError::new_err(format!("Failed to read part: {}", e)))
                    }
                };
                hasher.update(filled);
                target
                    .write_all(filled)
                    .map_err(|e| PyIOError::new_err(format!("Failed to write: {}", e)))?;
            }
        }
        target
            .sync_all()
            .map_err(|e| PyIOError::new_err(format!("Failed to fsync: {}", e)))?;
        Ok(format!("{:x}", hasher.finalize()))
    })
}

View File

@@ -0,0 +1,149 @@
use pyo3::prelude::*;
use std::sync::LazyLock;
use unicode_normalization::UnicodeNormalization;
/// Device names rejected as key segments (compared case-insensitively) when
/// Windows rules are requested.
const WINDOWS_RESERVED: &[&str] = &[
"CON", "PRN", "AUX", "NUL", "COM0", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7",
"COM8", "COM9", "LPT0", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8",
"LPT9",
];
/// Characters rejected inside key segments when Windows rules are requested.
const WINDOWS_ILLEGAL_CHARS: &[char] = &['<', '>', ':', '"', '/', '\\', '|', '?', '*'];
/// Top-level key segments rejected by `validate_object_key` as reserved.
const INTERNAL_FOLDERS: &[&str] = &[".meta", ".versions", ".multipart"];
/// Additional top-level segment rejected by `validate_object_key` as reserved.
const SYSTEM_ROOT: &str = ".myfsio.sys";
/// Matches names made of four dot-separated groups of one to three digits,
/// i.e. strings shaped like an IPv4 address (octet ranges are not checked).
static IP_REGEX: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").unwrap());
/// Validates an object key and returns a human-readable reason when it is
/// rejected, or `None` when it is acceptable.
///
/// Checks, in order: non-empty, no NUL bytes, NFC-normalized length within
/// `max_length_bytes`, no leading slash or backslash, then per path segment:
/// no `..`, no `.`, no control characters (code points below 32) and — when
/// `is_windows` is set — no Windows-illegal characters, no trailing space or
/// period, and no reserved device name. Finally the first non-empty segment
/// must not be an internal folder, the system root, or one of
/// `reserved_prefixes`.
///
/// Segments are split on `/`, and additionally on `\` when compiled for
/// Windows or when `is_windows` is set. Empty segments are ignored.
#[pyfunction]
#[pyo3(signature = (object_key, max_length_bytes=1024, is_windows=false, reserved_prefixes=None))]
pub fn validate_object_key(
    object_key: &str,
    max_length_bytes: usize,
    is_windows: bool,
    reserved_prefixes: Option<Vec<String>>,
) -> PyResult<Option<String>> {
    let reject = |reason: &str| -> PyResult<Option<String>> { Ok(Some(reason.to_string())) };

    if object_key.is_empty() {
        return reject("Object key required");
    }
    if object_key.contains('\0') {
        return reject("Object key contains null bytes");
    }

    let normalized: String = object_key.nfc().collect();
    if normalized.len() > max_length_bytes {
        return Ok(Some(format!(
            "Object key exceeds maximum length of {} bytes",
            max_length_bytes
        )));
    }
    if matches!(normalized.chars().next(), Some('/') | Some('\\')) {
        return reject("Object key cannot start with a slash");
    }

    let parts: Vec<&str> = if cfg!(windows) || is_windows {
        normalized.split(['/', '\\']).collect()
    } else {
        normalized.split('/').collect()
    };

    for segment in parts.iter().copied().filter(|s| !s.is_empty()) {
        match segment {
            ".." => return reject("Object key contains parent directory references"),
            "." => return reject("Object key contains invalid segments"),
            _ => {}
        }
        if segment.chars().any(|c| u32::from(c) < 32) {
            return reject("Object key contains control characters");
        }
        if !is_windows {
            continue;
        }
        if segment.chars().any(|c| WINDOWS_ILLEGAL_CHARS.contains(&c)) {
            return reject("Object key contains characters not supported on Windows filesystems");
        }
        if segment.ends_with(' ') || segment.ends_with('.') {
            return reject("Object key segments cannot end with spaces or periods on Windows");
        }
        let device_name = segment.trim_end_matches(['.', ' ']).to_uppercase();
        if WINDOWS_RESERVED.contains(&device_name.as_str()) {
            return Ok(Some(format!("Invalid filename segment: {}", segment)));
        }
    }

    // Only the first non-empty segment is checked against reserved prefixes.
    if let Some(top) = parts.iter().copied().find(|s| !s.is_empty()) {
        let built_in = INTERNAL_FOLDERS.contains(&top) || top == SYSTEM_ROOT;
        let caller_reserved = reserved_prefixes
            .as_ref()
            .map_or(false, |prefixes| prefixes.iter().any(|p| p.as_str() == top));
        if built_in || caller_reserved {
            return reject("Object key uses a reserved prefix");
        }
    }
    Ok(None)
}
/// Validates a bucket name and returns the reason it is rejected, or `None`
/// when it is acceptable.
///
/// Rules, checked in order: length of 3 to 63 bytes; first and last byte are a
/// lowercase ASCII letter or digit; every byte is a lowercase ASCII letter,
/// digit, `.` or `-`; no `..`; and the name does not match `IP_REGEX`.
#[pyfunction]
pub fn validate_bucket_name(bucket_name: &str) -> Option<String> {
    let bytes = bucket_name.as_bytes();
    if !(3..=63).contains(&bytes.len()) {
        return Some("Bucket name must be between 3 and 63 characters".to_string());
    }

    let is_alnum = |b: u8| b.is_ascii_lowercase() || b.is_ascii_digit();
    // Indexing is safe: the length check above guarantees at least 3 bytes.
    let (first, last) = (bytes[0], bytes[bytes.len() - 1]);
    if !is_alnum(first) || !is_alnum(last) {
        return Some(
            "Bucket name must start and end with a lowercase letter or digit".to_string(),
        );
    }

    let has_bad_byte = bytes
        .iter()
        .any(|&b| !(is_alnum(b) || b == b'.' || b == b'-'));
    if has_bad_byte {
        return Some(
            "Bucket name can only contain lowercase letters, digits, dots, and hyphens"
                .to_string(),
        );
    }

    if bucket_name.contains("..") {
        return Some("Bucket name must not contain consecutive periods".to_string());
    }
    if IP_REGEX.is_match(bucket_name) {
        return Some("Bucket name must not be formatted as an IP address".to_string());
    }
    None
}

5
pytest.ini Normal file
View File

@@ -0,0 +1,5 @@
[pytest]
testpaths = tests
norecursedirs = data .git __pycache__ .venv
markers =
integration: marks tests as integration tests (may require external services)

Some files were not shown because too many files have changed in this diff Show More