Add operation metrics with logging integration and metrics UI

2026-01-18 23:50:47 +08:00
parent 4adfcc4131
commit adb9017580
7 changed files with 1008 additions and 2 deletions

View File

@@ -16,6 +16,7 @@ from flask_wtf.csrf import CSRFError
from werkzeug.middleware.proxy_fix import ProxyFix
from .access_logging import AccessLoggingService
from .operation_metrics import OperationMetricsCollector, classify_endpoint
from .compression import GzipMiddleware
from .acl import AclService
from .bucket_policies import BucketPolicyStore
@@ -187,6 +188,15 @@ def create_app(
app.extensions["notifications"] = notification_service
app.extensions["access_logging"] = access_logging_service
operation_metrics_collector = None
if app.config.get("OPERATION_METRICS_ENABLED", False):
operation_metrics_collector = OperationMetricsCollector(
storage_root,
interval_minutes=app.config.get("OPERATION_METRICS_INTERVAL_MINUTES", 5),
retention_hours=app.config.get("OPERATION_METRICS_RETENTION_HOURS", 24),
)
app.extensions["operation_metrics"] = operation_metrics_collector
@app.errorhandler(500)
def internal_error(error):
return render_template('500.html'), 500
@@ -356,6 +366,7 @@ def _configure_logging(app: Flask) -> None:
    def _log_request_start() -> None:
        g.request_id = uuid.uuid4().hex
        g.request_started_at = time.perf_counter()
        g.request_bytes_in = request.content_length or 0
        app.logger.info(
            "Request started",
            extra={"path": request.path, "method": request.method, "remote_addr": request.remote_addr},
@@ -377,4 +388,21 @@ def _configure_logging(app: Flask) -> None:
            },
        )
        response.headers["X-Request-Duration-ms"] = f"{duration_ms:.2f}"
        operation_metrics = app.extensions.get("operation_metrics")
        if operation_metrics:
            bytes_in = getattr(g, "request_bytes_in", 0)
            bytes_out = response.content_length or 0
            error_code = getattr(g, "s3_error_code", None)
            endpoint_type = classify_endpoint(request.path)
            operation_metrics.record_request(
                method=request.method,
                endpoint_type=endpoint_type,
                status_code=response.status_code,
                latency_ms=duration_ms,
                bytes_in=bytes_in,
                bytes_out=bytes_out,
                error_code=error_code,
            )
        return response
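The hook above is the only integration point: timing starts in the before-request handler and the collector is fed in the after-request handler. A minimal sketch of the same pattern on a bare Flask app, assuming only the new module; the app and storage path here are illustrative, not the project's actual factory:

import time
from pathlib import Path

from flask import Flask, g, request

from app.operation_metrics import OperationMetricsCollector, classify_endpoint

app = Flask(__name__)
# Illustrative storage path; the real app passes its configured storage_root.
collector = OperationMetricsCollector(Path("./data"), interval_minutes=5, retention_hours=24)

@app.before_request
def _start_timer() -> None:
    g.request_started_at = time.perf_counter()
    g.request_bytes_in = request.content_length or 0

@app.after_request
def _record(response):
    duration_ms = (time.perf_counter() - g.request_started_at) * 1000
    collector.record_request(
        method=request.method,
        endpoint_type=classify_endpoint(request.path),
        status_code=response.status_code,
        latency_ms=duration_ms,
        bytes_in=getattr(g, "request_bytes_in", 0),
        bytes_out=response.content_length or 0,
        error_code=getattr(g, "s3_error_code", None),  # set by _error_response
    )
    return response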

View File

@@ -87,6 +87,9 @@ class AppConfig:
    metrics_history_enabled: bool
    metrics_history_retention_hours: int
    metrics_history_interval_minutes: int
    operation_metrics_enabled: bool
    operation_metrics_interval_minutes: int
    operation_metrics_retention_hours: int

    @classmethod
    def from_env(cls, overrides: Optional[Dict[str, Any]] = None) -> "AppConfig":
@@ -186,6 +189,9 @@ class AppConfig:
        metrics_history_enabled = str(_get("METRICS_HISTORY_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
        metrics_history_retention_hours = int(_get("METRICS_HISTORY_RETENTION_HOURS", 24))
        metrics_history_interval_minutes = int(_get("METRICS_HISTORY_INTERVAL_MINUTES", 5))
        operation_metrics_enabled = str(_get("OPERATION_METRICS_ENABLED", "0")).lower() in {"1", "true", "yes", "on"}
        operation_metrics_interval_minutes = int(_get("OPERATION_METRICS_INTERVAL_MINUTES", 5))
        operation_metrics_retention_hours = int(_get("OPERATION_METRICS_RETENTION_HOURS", 24))

        return cls(storage_root=storage_root,
                   max_upload_size=max_upload_size,
@@ -227,7 +233,10 @@ class AppConfig:
                   lifecycle_interval_seconds=lifecycle_interval_seconds,
                   metrics_history_enabled=metrics_history_enabled,
                   metrics_history_retention_hours=metrics_history_retention_hours,
                   metrics_history_interval_minutes=metrics_history_interval_minutes)
                   metrics_history_interval_minutes=metrics_history_interval_minutes,
                   operation_metrics_enabled=operation_metrics_enabled,
                   operation_metrics_interval_minutes=operation_metrics_interval_minutes,
                   operation_metrics_retention_hours=operation_metrics_retention_hours)

    def validate_and_report(self) -> list[str]:
        """Validate configuration and return a list of warnings/issues.
@@ -359,4 +368,7 @@ class AppConfig:
"METRICS_HISTORY_ENABLED": self.metrics_history_enabled,
"METRICS_HISTORY_RETENTION_HOURS": self.metrics_history_retention_hours,
"METRICS_HISTORY_INTERVAL_MINUTES": self.metrics_history_interval_minutes,
"OPERATION_METRICS_ENABLED": self.operation_metrics_enabled,
"OPERATION_METRICS_INTERVAL_MINUTES": self.operation_metrics_interval_minutes,
"OPERATION_METRICS_RETENTION_HOURS": self.operation_metrics_retention_hours,
}
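The new settings follow the same on/off parsing as the metrics-history ones. A quick way to exercise them is through from_env's overrides parameter, which the _get helper appears to consult before the environment; the app.config module path is assumed from context:

from app.config import AppConfig  # module path assumed; from_env is shown above

config = AppConfig.from_env(overrides={
    "OPERATION_METRICS_ENABLED": "true",   # any of 1/true/yes/on enables it
    "OPERATION_METRICS_INTERVAL_MINUTES": "1",
    "OPERATION_METRICS_RETENTION_HOURS": "48",
})
assert config.operation_metrics_enabled is True
assert config.operation_metrics_interval_minutes == 1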

app/operation_metrics.py (new file, 271 lines)
View File

@@ -0,0 +1,271 @@
from __future__ import annotations

import json
import logging
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


@dataclass
class OperationStats:
    count: int = 0
    success_count: int = 0
    error_count: int = 0
    latency_sum_ms: float = 0.0
    latency_min_ms: float = float("inf")
    latency_max_ms: float = 0.0
    bytes_in: int = 0
    bytes_out: int = 0

    def record(self, latency_ms: float, success: bool, bytes_in: int = 0, bytes_out: int = 0) -> None:
        self.count += 1
        if success:
            self.success_count += 1
        else:
            self.error_count += 1
        self.latency_sum_ms += latency_ms
        if latency_ms < self.latency_min_ms:
            self.latency_min_ms = latency_ms
        if latency_ms > self.latency_max_ms:
            self.latency_max_ms = latency_ms
        self.bytes_in += bytes_in
        self.bytes_out += bytes_out

    def to_dict(self) -> Dict[str, Any]:
        avg_latency = self.latency_sum_ms / self.count if self.count > 0 else 0.0
        min_latency = self.latency_min_ms if self.latency_min_ms != float("inf") else 0.0
        return {
            "count": self.count,
            "success_count": self.success_count,
            "error_count": self.error_count,
            "latency_avg_ms": round(avg_latency, 2),
            "latency_min_ms": round(min_latency, 2),
            "latency_max_ms": round(self.latency_max_ms, 2),
            "bytes_in": self.bytes_in,
            "bytes_out": self.bytes_out,
        }

    def merge(self, other: "OperationStats") -> None:
        self.count += other.count
        self.success_count += other.success_count
        self.error_count += other.error_count
        self.latency_sum_ms += other.latency_sum_ms
        if other.latency_min_ms < self.latency_min_ms:
            self.latency_min_ms = other.latency_min_ms
        if other.latency_max_ms > self.latency_max_ms:
            self.latency_max_ms = other.latency_max_ms
        self.bytes_in += other.bytes_in
        self.bytes_out += other.bytes_out


@dataclass
class MetricsSnapshot:
    timestamp: datetime
    window_seconds: int
    by_method: Dict[str, Dict[str, Any]]
    by_endpoint: Dict[str, Dict[str, Any]]
    by_status_class: Dict[str, int]
    error_codes: Dict[str, int]
    totals: Dict[str, Any]

    def to_dict(self) -> Dict[str, Any]:
        return {
            "timestamp": self.timestamp.isoformat(),
            "window_seconds": self.window_seconds,
            "by_method": self.by_method,
            "by_endpoint": self.by_endpoint,
            "by_status_class": self.by_status_class,
            "error_codes": self.error_codes,
            "totals": self.totals,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MetricsSnapshot":
        return cls(
            timestamp=datetime.fromisoformat(data["timestamp"]),
            window_seconds=data.get("window_seconds", 300),
            by_method=data.get("by_method", {}),
            by_endpoint=data.get("by_endpoint", {}),
            by_status_class=data.get("by_status_class", {}),
            error_codes=data.get("error_codes", {}),
            totals=data.get("totals", {}),
        )


class OperationMetricsCollector:
    def __init__(
        self,
        storage_root: Path,
        interval_minutes: int = 5,
        retention_hours: int = 24,
    ):
        self.storage_root = storage_root
        self.interval_seconds = interval_minutes * 60
        self.retention_hours = retention_hours
        self._lock = threading.Lock()
        self._by_method: Dict[str, OperationStats] = {}
        self._by_endpoint: Dict[str, OperationStats] = {}
        self._by_status_class: Dict[str, int] = {}
        self._error_codes: Dict[str, int] = {}
        self._totals = OperationStats()
        self._window_start = time.time()
        self._shutdown = threading.Event()
        self._snapshots: List[MetricsSnapshot] = []
        self._load_history()
        self._snapshot_thread = threading.Thread(
            target=self._snapshot_loop, name="operation-metrics-snapshot", daemon=True
        )
        self._snapshot_thread.start()

    def _config_path(self) -> Path:
        return self.storage_root / ".myfsio.sys" / "config" / "operation_metrics.json"

    def _load_history(self) -> None:
        config_path = self._config_path()
        if not config_path.exists():
            return
        try:
            data = json.loads(config_path.read_text(encoding="utf-8"))
            snapshots_data = data.get("snapshots", [])
            self._snapshots = [MetricsSnapshot.from_dict(s) for s in snapshots_data]
            self._prune_old_snapshots()
        except (json.JSONDecodeError, OSError, KeyError) as e:
            logger.warning(f"Failed to load operation metrics history: {e}")

    def _save_history(self) -> None:
        config_path = self._config_path()
        config_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            data = {"snapshots": [s.to_dict() for s in self._snapshots]}
            config_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        except OSError as e:
            logger.warning(f"Failed to save operation metrics history: {e}")

    def _prune_old_snapshots(self) -> None:
        if not self._snapshots:
            return
        cutoff = datetime.now(timezone.utc).timestamp() - (self.retention_hours * 3600)
        self._snapshots = [
            s for s in self._snapshots if s.timestamp.timestamp() > cutoff
        ]

    def _snapshot_loop(self) -> None:
        while not self._shutdown.is_set():
            self._shutdown.wait(timeout=self.interval_seconds)
            if not self._shutdown.is_set():
                self._take_snapshot()

    def _take_snapshot(self) -> None:
        with self._lock:
            now = datetime.now(timezone.utc)
            window_seconds = int(time.time() - self._window_start)
            snapshot = MetricsSnapshot(
                timestamp=now,
                window_seconds=window_seconds,
                by_method={k: v.to_dict() for k, v in self._by_method.items()},
                by_endpoint={k: v.to_dict() for k, v in self._by_endpoint.items()},
                by_status_class=dict(self._by_status_class),
                error_codes=dict(self._error_codes),
                totals=self._totals.to_dict(),
            )
            self._snapshots.append(snapshot)
            self._prune_old_snapshots()
            self._save_history()
            self._by_method.clear()
            self._by_endpoint.clear()
            self._by_status_class.clear()
            self._error_codes.clear()
            self._totals = OperationStats()
            self._window_start = time.time()

    def record_request(
        self,
        method: str,
        endpoint_type: str,
        status_code: int,
        latency_ms: float,
        bytes_in: int = 0,
        bytes_out: int = 0,
        error_code: Optional[str] = None,
    ) -> None:
        success = 200 <= status_code < 400
        status_class = f"{status_code // 100}xx"
        with self._lock:
            if method not in self._by_method:
                self._by_method[method] = OperationStats()
            self._by_method[method].record(latency_ms, success, bytes_in, bytes_out)
            if endpoint_type not in self._by_endpoint:
                self._by_endpoint[endpoint_type] = OperationStats()
            self._by_endpoint[endpoint_type].record(latency_ms, success, bytes_in, bytes_out)
            self._by_status_class[status_class] = self._by_status_class.get(status_class, 0) + 1
            if error_code:
                self._error_codes[error_code] = self._error_codes.get(error_code, 0) + 1
            self._totals.record(latency_ms, success, bytes_in, bytes_out)

    def get_current_stats(self) -> Dict[str, Any]:
        with self._lock:
            window_seconds = int(time.time() - self._window_start)
            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "window_seconds": window_seconds,
                "by_method": {k: v.to_dict() for k, v in self._by_method.items()},
                "by_endpoint": {k: v.to_dict() for k, v in self._by_endpoint.items()},
                "by_status_class": dict(self._by_status_class),
                "error_codes": dict(self._error_codes),
                "totals": self._totals.to_dict(),
            }

    def get_history(self, hours: Optional[int] = None) -> List[Dict[str, Any]]:
        with self._lock:
            snapshots = list(self._snapshots)
        if hours:
            cutoff = datetime.now(timezone.utc).timestamp() - (hours * 3600)
            snapshots = [s for s in snapshots if s.timestamp.timestamp() > cutoff]
        return [s.to_dict() for s in snapshots]

    def shutdown(self) -> None:
        self._shutdown.set()
        self._take_snapshot()
        self._snapshot_thread.join(timeout=5.0)
def classify_endpoint(path: str) -> str:
    if not path or path == "/":
        return "service"
    path = path.rstrip("/")
    if path.startswith("/ui"):
        return "ui"
    if path.startswith("/kms"):
        return "kms"
    if path.startswith("/myfsio"):
        return "service"
    # Filter empty segments so paths like "//bucket" classify correctly; a bare
    # split("/") always yields at least one element, which would have made the
    # len(parts) == 0 branch below unreachable.
    parts = [part for part in path.lstrip("/").split("/") if part]
    if len(parts) == 0:
        return "service"
    elif len(parts) == 1:
        return "bucket"
    else:
        return "object"

View File

@@ -88,6 +88,7 @@ def _xml_response(element: Element, status: int = 200) -> Response:
def _error_response(code: str, message: str, status: int) -> Response:
    g.s3_error_code = code
    error = Element("Error")
    SubElement(error, "Code").text = code
    SubElement(error, "Message").text = message
View File

@@ -141,6 +141,10 @@ def _acl() -> AclService:
return current_app.extensions["acl"]
def _operation_metrics():
return current_app.extensions.get("operation_metrics")
def _format_bytes(num: int) -> str:
step = 1024
units = ["B", "KB", "MB", "GB", "TB", "PB"]
@@ -2196,6 +2200,7 @@ def metrics_dashboard():
"uptime_days": uptime_days,
},
metrics_history_enabled=current_app.config.get("METRICS_HISTORY_ENABLED", False),
operation_metrics_enabled=current_app.config.get("OPERATION_METRICS_ENABLED", False),
)
@@ -2329,6 +2334,52 @@ def metrics_settings():
    })


@ui_bp.get("/metrics/operations")
def metrics_operations():
    principal = _current_principal()
    try:
        _iam().authorize(principal, None, "iam:list_users")
    except IamError:
        return jsonify({"error": "Access denied"}), 403
    collector = _operation_metrics()
    if not collector:
        return jsonify({
            "enabled": False,
            "stats": None,
        })
    return jsonify({
        "enabled": True,
        "stats": collector.get_current_stats(),
    })


@ui_bp.get("/metrics/operations/history")
def metrics_operations_history():
    principal = _current_principal()
    try:
        _iam().authorize(principal, None, "iam:list_users")
    except IamError:
        return jsonify({"error": "Access denied"}), 403
    collector = _operation_metrics()
    if not collector:
        return jsonify({
            "enabled": False,
            "history": [],
        })
    hours = request.args.get("hours", type=int)
    return jsonify({
        "enabled": True,
        "history": collector.get_history(hours),
        "interval_minutes": current_app.config.get("OPERATION_METRICS_INTERVAL_MINUTES", 5),
    })
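Both routes return plain JSON, so they can be smoke-tested with Flask's test client. A sketch only: the factory import and call signature are assumptions, and the routes require an IAM principal allowed iam:list_users, so authentication is left as a placeholder:

from app import create_app  # factory location and signature assumed

app = create_app()
with app.test_client() as client:
    # ...log in as an admin principal here; the mechanism is app-specific...
    current = client.get("/ui/metrics/operations").get_json()
    history = client.get("/ui/metrics/operations/history?hours=6").get_json()
    if current.get("enabled"):
        print(current["stats"]["totals"])
    print(len(history.get("history", [])))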
@ui_bp.route("/buckets/<bucket_name>/lifecycle", methods=["GET", "POST", "DELETE"])
def bucket_lifecycle(bucket_name: str):
    principal = _current_principal()

View File

@@ -268,6 +268,121 @@
</div>
</div>
{% if operation_metrics_enabled %}
<div class="row g-4 mt-2">
  <div class="col-12">
    <div class="card shadow-sm border-0">
      <div class="card-header bg-transparent border-0 pt-4 px-4 d-flex justify-content-between align-items-center">
        <h5 class="card-title mb-0 fw-semibold">API Operations</h5>
        <div class="d-flex align-items-center gap-3">
          <span class="small text-muted" id="opStatus">Loading...</span>
          <button class="btn btn-outline-secondary btn-sm" id="resetOpMetricsBtn" title="Reset current window">
            <svg xmlns="http://www.w3.org/2000/svg" width="14" height="14" fill="currentColor" class="bi bi-arrow-counterclockwise" viewBox="0 0 16 16">
              <path fill-rule="evenodd" d="M8 3a5 5 0 1 1-4.546 2.914.5.5 0 0 0-.908-.417A6 6 0 1 0 8 2v1z"/>
              <path d="M8 4.466V.534a.25.25 0 0 0-.41-.192L5.23 2.308a.25.25 0 0 0 0 .384l2.36 1.966A.25.25 0 0 0 8 4.466z"/>
            </svg>
          </button>
        </div>
      </div>
      <div class="card-body p-4">
        <div class="row g-3 mb-4">
          <div class="col-6 col-md-4 col-lg-2">
            <div class="text-center p-3 bg-light rounded h-100">
              <h4 class="fw-bold mb-1" id="opTotalRequests">0</h4>
              <small class="text-muted">Requests</small>
            </div>
          </div>
          <div class="col-6 col-md-4 col-lg-2">
            <div class="text-center p-3 bg-light rounded h-100">
              <h4 class="fw-bold mb-1 text-success" id="opSuccessRate">0%</h4>
              <small class="text-muted">Success</small>
            </div>
          </div>
          <div class="col-6 col-md-4 col-lg-2">
            <div class="text-center p-3 bg-light rounded h-100">
              <h4 class="fw-bold mb-1 text-danger" id="opErrorCount">0</h4>
              <small class="text-muted">Errors</small>
            </div>
          </div>
          <div class="col-6 col-md-4 col-lg-2">
            <div class="text-center p-3 bg-light rounded h-100">
              <h4 class="fw-bold mb-1 text-info" id="opAvgLatency">0ms</h4>
              <small class="text-muted">Latency</small>
            </div>
          </div>
          <div class="col-6 col-md-4 col-lg-2">
            <div class="text-center p-3 bg-light rounded h-100">
              <h4 class="fw-bold mb-1 text-primary" id="opBytesIn">0 B</h4>
              <small class="text-muted">Bytes In</small>
            </div>
          </div>
          <div class="col-6 col-md-4 col-lg-2">
            <div class="text-center p-3 bg-light rounded h-100">
              <h4 class="fw-bold mb-1 text-secondary" id="opBytesOut">0 B</h4>
              <small class="text-muted">Bytes Out</small>
            </div>
          </div>
        </div>
        <div class="row g-4">
          <div class="col-lg-6">
            <div class="bg-light rounded p-3">
              <h6 class="text-muted small fw-bold text-uppercase mb-3">Requests by Method</h6>
              <div style="height: 220px; display: flex; align-items: center; justify-content: center;">
                <canvas id="methodChart"></canvas>
              </div>
            </div>
          </div>
          <div class="col-lg-6">
            <div class="bg-light rounded p-3">
              <h6 class="text-muted small fw-bold text-uppercase mb-3">Requests by Status</h6>
              <div style="height: 220px;">
                <canvas id="statusChart"></canvas>
              </div>
            </div>
          </div>
        </div>
        <div class="row g-4 mt-1">
          <div class="col-lg-6">
            <div class="bg-light rounded p-3">
              <h6 class="text-muted small fw-bold text-uppercase mb-3">Requests by Endpoint</h6>
              <div style="height: 180px;">
                <canvas id="endpointChart"></canvas>
              </div>
            </div>
          </div>
          <div class="col-lg-6">
            <div class="bg-light rounded p-3 h-100 d-flex flex-column">
              <div class="d-flex justify-content-between align-items-start mb-3">
                <h6 class="text-muted small fw-bold text-uppercase mb-0">S3 Error Codes</h6>
                <span class="badge bg-secondary-subtle text-secondary" style="font-size: 0.65rem;" title="Tracks S3 API errors like NoSuchKey, AccessDenied, etc.">API Only</span>
              </div>
              <div class="flex-grow-1 d-flex flex-column" style="min-height: 150px;">
                <div class="d-flex border-bottom pb-2 mb-2" style="font-size: 0.75rem;">
                  <div class="text-muted fw-semibold" style="flex: 1;">Code</div>
                  <div class="text-muted fw-semibold text-end" style="width: 60px;">Count</div>
                  <div class="text-muted fw-semibold text-end" style="width: 100px;">Distribution</div>
                </div>
                <div id="errorCodesContainer" class="flex-grow-1" style="overflow-y: auto;">
                  <div id="errorCodesBody">
                    <div class="text-muted small text-center py-4">
                      <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="currentColor" class="bi bi-check-circle mb-2 text-success" viewBox="0 0 16 16">
                        <path d="M8 15A7 7 0 1 1 8 1a7 7 0 0 1 0 14zm0 1A8 8 0 1 0 8 0a8 8 0 0 0 0 16z"/>
                        <path d="M10.97 4.97a.235.235 0 0 0-.02.022L7.477 9.417 5.384 7.323a.75.75 0 0 0-1.06 1.06L6.97 11.03a.75.75 0 0 0 1.079-.02l3.992-4.99a.75.75 0 0 0-1.071-1.05z"/>
                      </svg>
                      <div>No S3 API errors</div>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
{% endif %}
{% if metrics_history_enabled %}
<div class="row g-4 mt-2">
<div class="col-12">
@@ -307,7 +422,7 @@
{% endblock %}
{% block extra_scripts %}
{% if metrics_history_enabled %}
{% if metrics_history_enabled or operation_metrics_enabled %}
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
{% endif %}
<script>
@@ -413,6 +528,237 @@
startPolling();
})();
{% if operation_metrics_enabled %}
(function() {
  var methodChart = null;
  var statusChart = null;
  var endpointChart = null;
  var opStatus = document.getElementById('opStatus');
  var opTimer = null;
  var methodColors = {
    'GET': '#0d6efd',
    'PUT': '#198754',
    'POST': '#ffc107',
    'DELETE': '#dc3545',
    'HEAD': '#6c757d',
    'OPTIONS': '#0dcaf0'
  };
  var statusColors = {
    '2xx': '#198754',
    '3xx': '#0dcaf0',
    '4xx': '#ffc107',
    '5xx': '#dc3545'
  };
  var endpointColors = {
    'object': '#0d6efd',
    'bucket': '#198754',
    'ui': '#6c757d',
    'service': '#0dcaf0',
    'kms': '#ffc107'
  };

  function formatBytes(bytes) {
    if (bytes === 0) return '0 B';
    var k = 1024;
    var sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
    var i = Math.floor(Math.log(bytes) / Math.log(k));
    return parseFloat((bytes / Math.pow(k, i)).toFixed(1)) + ' ' + sizes[i];
  }

  function initOpCharts() {
    var methodCtx = document.getElementById('methodChart');
    var statusCtx = document.getElementById('statusChart');
    var endpointCtx = document.getElementById('endpointChart');
    if (methodCtx) {
      methodChart = new Chart(methodCtx, {
        type: 'doughnut',
        data: {
          labels: [],
          datasets: [{
            data: [],
            backgroundColor: []
          }]
        },
        options: {
          responsive: true,
          maintainAspectRatio: false,
          animation: false,
          plugins: {
            legend: { position: 'right', labels: { boxWidth: 12, font: { size: 11 } } }
          }
        }
      });
    }
    if (statusCtx) {
      statusChart = new Chart(statusCtx, {
        type: 'bar',
        data: {
          labels: [],
          datasets: [{
            data: [],
            backgroundColor: []
          }]
        },
        options: {
          responsive: true,
          maintainAspectRatio: false,
          animation: false,
          plugins: { legend: { display: false } },
          scales: {
            y: { beginAtZero: true, ticks: { stepSize: 1 } }
          }
        }
      });
    }
    if (endpointCtx) {
      endpointChart = new Chart(endpointCtx, {
        type: 'bar',
        data: {
          labels: [],
          datasets: [{
            data: [],
            backgroundColor: []
          }]
        },
        options: {
          responsive: true,
          maintainAspectRatio: false,
          indexAxis: 'y',
          animation: false,
          plugins: { legend: { display: false } },
          scales: {
            x: { beginAtZero: true, ticks: { stepSize: 1 } }
          }
        }
      });
    }
  }

  function updateOpMetrics() {
    if (document.hidden) return;
    fetch('/ui/metrics/operations')
      .then(function(r) { return r.json(); })
      .then(function(data) {
        if (!data.enabled || !data.stats) {
          if (opStatus) opStatus.textContent = 'Operation metrics not available';
          return;
        }
        var stats = data.stats;
        var totals = stats.totals || {};
        var totalEl = document.getElementById('opTotalRequests');
        var successEl = document.getElementById('opSuccessRate');
        var errorEl = document.getElementById('opErrorCount');
        var latencyEl = document.getElementById('opAvgLatency');
        var bytesInEl = document.getElementById('opBytesIn');
        var bytesOutEl = document.getElementById('opBytesOut');
        if (totalEl) totalEl.textContent = totals.count || 0;
        if (successEl) {
          var rate = totals.count > 0 ? ((totals.success_count / totals.count) * 100).toFixed(1) : 0;
          successEl.textContent = rate + '%';
        }
        if (errorEl) errorEl.textContent = totals.error_count || 0;
        if (latencyEl) latencyEl.textContent = (totals.latency_avg_ms || 0).toFixed(1) + 'ms';
        if (bytesInEl) bytesInEl.textContent = formatBytes(totals.bytes_in || 0);
        if (bytesOutEl) bytesOutEl.textContent = formatBytes(totals.bytes_out || 0);
        if (methodChart && stats.by_method) {
          var methods = Object.keys(stats.by_method);
          var methodData = methods.map(function(m) { return stats.by_method[m].count; });
          var methodBg = methods.map(function(m) { return methodColors[m] || '#6c757d'; });
          methodChart.data.labels = methods;
          methodChart.data.datasets[0].data = methodData;
          methodChart.data.datasets[0].backgroundColor = methodBg;
          methodChart.update('none');
        }
        if (statusChart && stats.by_status_class) {
          var statuses = Object.keys(stats.by_status_class).sort();
          var statusData = statuses.map(function(s) { return stats.by_status_class[s]; });
          var statusBg = statuses.map(function(s) { return statusColors[s] || '#6c757d'; });
          statusChart.data.labels = statuses;
          statusChart.data.datasets[0].data = statusData;
          statusChart.data.datasets[0].backgroundColor = statusBg;
          statusChart.update('none');
        }
        if (endpointChart && stats.by_endpoint) {
          var endpoints = Object.keys(stats.by_endpoint);
          var endpointData = endpoints.map(function(e) { return stats.by_endpoint[e].count; });
          var endpointBg = endpoints.map(function(e) { return endpointColors[e] || '#6c757d'; });
          endpointChart.data.labels = endpoints;
          endpointChart.data.datasets[0].data = endpointData;
          endpointChart.data.datasets[0].backgroundColor = endpointBg;
          endpointChart.update('none');
        }
        var errorBody = document.getElementById('errorCodesBody');
        if (errorBody && stats.error_codes) {
          var errorCodes = Object.entries(stats.error_codes);
          errorCodes.sort(function(a, b) { return b[1] - a[1]; });
          var totalErrors = errorCodes.reduce(function(sum, e) { return sum + e[1]; }, 0);
          errorCodes = errorCodes.slice(0, 10);
          if (errorCodes.length === 0) {
            errorBody.innerHTML = '<div class="text-muted small text-center py-4">' +
              '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="currentColor" class="bi bi-check-circle mb-2 text-success" viewBox="0 0 16 16">' +
              '<path d="M8 15A7 7 0 1 1 8 1a7 7 0 0 1 0 14zm0 1A8 8 0 1 0 8 0a8 8 0 0 0 0 16z"/>' +
              '<path d="M10.97 4.97a.235.235 0 0 0-.02.022L7.477 9.417 5.384 7.323a.75.75 0 0 0-1.06 1.06L6.97 11.03a.75.75 0 0 0 1.079-.02l3.992-4.99a.75.75 0 0 0-1.071-1.05z"/>' +
              '</svg><div>No S3 API errors</div></div>';
          } else {
            errorBody.innerHTML = errorCodes.map(function(e) {
              var pct = totalErrors > 0 ? ((e[1] / totalErrors) * 100).toFixed(0) : 0;
              return '<div class="d-flex align-items-center py-1" style="font-size: 0.8rem;">' +
                '<div style="flex: 1;"><code class="text-danger">' + e[0] + '</code></div>' +
                '<div class="text-end fw-semibold" style="width: 60px;">' + e[1] + '</div>' +
                '<div style="width: 100px; padding-left: 10px;"><div class="progress" style="height: 6px;"><div class="progress-bar bg-danger" style="width: ' + pct + '%"></div></div></div>' +
                '</div>';
            }).join('');
          }
        }
        var windowMins = Math.floor(stats.window_seconds / 60);
        var windowSecs = stats.window_seconds % 60;
        var windowStr = windowMins > 0 ? windowMins + 'm ' + windowSecs + 's' : windowSecs + 's';
        if (opStatus) opStatus.textContent = 'Window: ' + windowStr + ' | ' + new Date().toLocaleTimeString();
      })
      .catch(function(err) {
        console.error('Operation metrics fetch error:', err);
        if (opStatus) opStatus.textContent = 'Failed to load';
      });
  }

  function startOpPolling() {
    if (opTimer) clearInterval(opTimer);
    opTimer = setInterval(updateOpMetrics, 5000);
  }

  var resetBtn = document.getElementById('resetOpMetricsBtn');
  if (resetBtn) {
    resetBtn.addEventListener('click', function() {
      updateOpMetrics();
    });
  }

  document.addEventListener('visibilitychange', function() {
    if (document.hidden) {
      if (opTimer) clearInterval(opTimer);
      opTimer = null;
    } else {
      updateOpMetrics();
      startOpPolling();
    }
  });

  initOpCharts();
  updateOpMetrics();
  startOpPolling();
})();
{% endif %}
{% if metrics_history_enabled %}
(function() {
var cpuChart = null;

View File

@@ -0,0 +1,297 @@
import threading
import time
from pathlib import Path

import pytest

from app.operation_metrics import (
    OperationMetricsCollector,
    OperationStats,
    classify_endpoint,
)


class TestOperationStats:
    def test_initial_state(self):
        stats = OperationStats()
        assert stats.count == 0
        assert stats.success_count == 0
        assert stats.error_count == 0
        assert stats.latency_sum_ms == 0.0
        assert stats.bytes_in == 0
        assert stats.bytes_out == 0

    def test_record_success(self):
        stats = OperationStats()
        stats.record(latency_ms=50.0, success=True, bytes_in=100, bytes_out=200)
        assert stats.count == 1
        assert stats.success_count == 1
        assert stats.error_count == 0
        assert stats.latency_sum_ms == 50.0
        assert stats.latency_min_ms == 50.0
        assert stats.latency_max_ms == 50.0
        assert stats.bytes_in == 100
        assert stats.bytes_out == 200

    def test_record_error(self):
        stats = OperationStats()
        stats.record(latency_ms=100.0, success=False, bytes_in=50, bytes_out=0)
        assert stats.count == 1
        assert stats.success_count == 0
        assert stats.error_count == 1

    def test_latency_min_max(self):
        stats = OperationStats()
        stats.record(latency_ms=50.0, success=True)
        stats.record(latency_ms=10.0, success=True)
        stats.record(latency_ms=100.0, success=True)
        assert stats.latency_min_ms == 10.0
        assert stats.latency_max_ms == 100.0
        assert stats.latency_sum_ms == 160.0

    def test_to_dict(self):
        stats = OperationStats()
        stats.record(latency_ms=50.0, success=True, bytes_in=100, bytes_out=200)
        stats.record(latency_ms=100.0, success=False, bytes_in=50, bytes_out=0)
        result = stats.to_dict()
        assert result["count"] == 2
        assert result["success_count"] == 1
        assert result["error_count"] == 1
        assert result["latency_avg_ms"] == 75.0
        assert result["latency_min_ms"] == 50.0
        assert result["latency_max_ms"] == 100.0
        assert result["bytes_in"] == 150
        assert result["bytes_out"] == 200

    def test_to_dict_empty(self):
        stats = OperationStats()
        result = stats.to_dict()
        assert result["count"] == 0
        assert result["latency_avg_ms"] == 0.0
        assert result["latency_min_ms"] == 0.0

    def test_merge(self):
        stats1 = OperationStats()
        stats1.record(latency_ms=50.0, success=True, bytes_in=100, bytes_out=200)
        stats2 = OperationStats()
        stats2.record(latency_ms=10.0, success=True, bytes_in=50, bytes_out=100)
        stats2.record(latency_ms=100.0, success=False, bytes_in=25, bytes_out=50)
        stats1.merge(stats2)
        assert stats1.count == 3
        assert stats1.success_count == 2
        assert stats1.error_count == 1
        assert stats1.latency_min_ms == 10.0
        assert stats1.latency_max_ms == 100.0
        assert stats1.bytes_in == 175
        assert stats1.bytes_out == 350


class TestClassifyEndpoint:
    def test_root_path(self):
        assert classify_endpoint("/") == "service"
        assert classify_endpoint("") == "service"

    def test_ui_paths(self):
        assert classify_endpoint("/ui") == "ui"
        assert classify_endpoint("/ui/buckets") == "ui"
        assert classify_endpoint("/ui/metrics") == "ui"

    def test_kms_paths(self):
        assert classify_endpoint("/kms") == "kms"
        assert classify_endpoint("/kms/keys") == "kms"

    def test_service_paths(self):
        assert classify_endpoint("/myfsio/health") == "service"

    def test_bucket_paths(self):
        assert classify_endpoint("/mybucket") == "bucket"
        assert classify_endpoint("/mybucket/") == "bucket"

    def test_object_paths(self):
        assert classify_endpoint("/mybucket/mykey") == "object"
        assert classify_endpoint("/mybucket/folder/nested/key.txt") == "object"


class TestOperationMetricsCollector:
    def test_record_and_get_stats(self, tmp_path: Path):
        collector = OperationMetricsCollector(
            storage_root=tmp_path,
            interval_minutes=60,
            retention_hours=24,
        )
        try:
            collector.record_request(
                method="GET",
                endpoint_type="bucket",
                status_code=200,
                latency_ms=50.0,
                bytes_in=0,
                bytes_out=1000,
            )
            collector.record_request(
                method="PUT",
                endpoint_type="object",
                status_code=201,
                latency_ms=100.0,
                bytes_in=500,
                bytes_out=0,
            )
            collector.record_request(
                method="GET",
                endpoint_type="object",
                status_code=404,
                latency_ms=25.0,
                bytes_in=0,
                bytes_out=0,
                error_code="NoSuchKey",
            )
            stats = collector.get_current_stats()
            assert stats["totals"]["count"] == 3
            assert stats["totals"]["success_count"] == 2
            assert stats["totals"]["error_count"] == 1
            assert "GET" in stats["by_method"]
            assert stats["by_method"]["GET"]["count"] == 2
            assert "PUT" in stats["by_method"]
            assert stats["by_method"]["PUT"]["count"] == 1
            assert "bucket" in stats["by_endpoint"]
            assert "object" in stats["by_endpoint"]
            assert stats["by_endpoint"]["object"]["count"] == 2
            assert stats["by_status_class"]["2xx"] == 2
            assert stats["by_status_class"]["4xx"] == 1
            assert stats["error_codes"]["NoSuchKey"] == 1
        finally:
            collector.shutdown()

    def test_thread_safety(self, tmp_path: Path):
        collector = OperationMetricsCollector(
            storage_root=tmp_path,
            interval_minutes=60,
            retention_hours=24,
        )
        try:
            num_threads = 5
            requests_per_thread = 100
            threads = []

            def record_requests():
                for _ in range(requests_per_thread):
                    collector.record_request(
                        method="GET",
                        endpoint_type="object",
                        status_code=200,
                        latency_ms=10.0,
                    )

            for _ in range(num_threads):
                t = threading.Thread(target=record_requests)
                threads.append(t)
                t.start()
            for t in threads:
                t.join()
            stats = collector.get_current_stats()
            assert stats["totals"]["count"] == num_threads * requests_per_thread
        finally:
            collector.shutdown()

    def test_status_class_categorization(self, tmp_path: Path):
        collector = OperationMetricsCollector(
            storage_root=tmp_path,
            interval_minutes=60,
            retention_hours=24,
        )
        try:
            collector.record_request("GET", "object", 200, 10.0)
            collector.record_request("GET", "object", 204, 10.0)
            collector.record_request("GET", "object", 301, 10.0)
            collector.record_request("GET", "object", 304, 10.0)
            collector.record_request("GET", "object", 400, 10.0)
            collector.record_request("GET", "object", 403, 10.0)
            collector.record_request("GET", "object", 404, 10.0)
            collector.record_request("GET", "object", 500, 10.0)
            collector.record_request("GET", "object", 503, 10.0)
            stats = collector.get_current_stats()
            assert stats["by_status_class"]["2xx"] == 2
            assert stats["by_status_class"]["3xx"] == 2
            assert stats["by_status_class"]["4xx"] == 3
            assert stats["by_status_class"]["5xx"] == 2
        finally:
            collector.shutdown()

    def test_error_code_tracking(self, tmp_path: Path):
        collector = OperationMetricsCollector(
            storage_root=tmp_path,
            interval_minutes=60,
            retention_hours=24,
        )
        try:
            collector.record_request("GET", "object", 404, 10.0, error_code="NoSuchKey")
            collector.record_request("GET", "object", 404, 10.0, error_code="NoSuchKey")
            collector.record_request("GET", "bucket", 403, 10.0, error_code="AccessDenied")
            collector.record_request("PUT", "object", 500, 10.0, error_code="InternalError")
            stats = collector.get_current_stats()
            assert stats["error_codes"]["NoSuchKey"] == 2
            assert stats["error_codes"]["AccessDenied"] == 1
            assert stats["error_codes"]["InternalError"] == 1
        finally:
            collector.shutdown()

    def test_history_persistence(self, tmp_path: Path):
        collector = OperationMetricsCollector(
            storage_root=tmp_path,
            interval_minutes=60,
            retention_hours=24,
        )
        try:
            collector.record_request("GET", "object", 200, 10.0)
            collector._take_snapshot()
            history = collector.get_history()
            assert len(history) == 1
            assert history[0]["totals"]["count"] == 1
            config_path = tmp_path / ".myfsio.sys" / "config" / "operation_metrics.json"
            assert config_path.exists()
        finally:
            collector.shutdown()

    def test_get_history_with_hours_filter(self, tmp_path: Path):
        collector = OperationMetricsCollector(
            storage_root=tmp_path,
            interval_minutes=60,
            retention_hours=24,
        )
        try:
            collector.record_request("GET", "object", 200, 10.0)
            collector._take_snapshot()
            history_all = collector.get_history()
            history_recent = collector.get_history(hours=1)
            assert len(history_all) >= len(history_recent)
        finally:
            collector.shutdown()
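One case the suite above does not cover is retention pruning. A possible companion test (not part of this commit), manufacturing a stale snapshot and checking that _prune_old_snapshots drops it:

from datetime import datetime, timedelta, timezone

from app.operation_metrics import MetricsSnapshot

def test_prune_old_snapshots(tmp_path: Path):
    collector = OperationMetricsCollector(
        storage_root=tmp_path,
        interval_minutes=60,
        retention_hours=1,
    )
    try:
        # Two hours old, against a one-hour retention window.
        stale = MetricsSnapshot(
            timestamp=datetime.now(timezone.utc) - timedelta(hours=2),
            window_seconds=300,
            by_method={},
            by_endpoint={},
            by_status_class={},
            error_codes={},
            totals={},
        )
        collector._snapshots.append(stale)
        collector._prune_old_snapshots()
        assert stale not in collector._snapshots
    finally:
        collector.shutdown()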