Improve text extraction in dashboard

2025-07-08 21:00:17 +08:00
parent a87484b0e7
commit 9644f62dc5
4 changed files with 1193 additions and 22 deletions
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -15,11 +15,14 @@
    "cors": "^2.8.5",
    "dotenv": "^16.4.5",
    "ejs": "^3.1.10",
+    "exceljs": "^4.4.0",
    "express": "^4.19.2",
    "express-session": "^1.18.0",
    "form-data": "^4.0.3",
    "fs-extra": "^11.2.0",
+    "mammoth": "^1.9.1",
    "multer": "^2.0.0",
+    "pdf-parse": "^1.1.1",
    "uuid": "^10.0.0"
  },
  "devDependencies": {
--- a/server.js
+++ b/server.js
@@ -13,6 +13,116 @@ require('dotenv').config();

 // Document processing utilities
 const crypto = require('crypto');
+const mammoth = require('mammoth'); // For .docx files
+const pdfParse = require('pdf-parse'); // For PDF files
+const ExcelJS = require('exceljs'); // For Excel files
+
+// Helper function to extract text from various document formats
+async function extractTextFromDocument(filePath, fileExtension) {
+  try {
+    const extension = fileExtension.toLowerCase();
+    
+    switch (extension) {
+      case '.docx':
+        // Extract text from Word documents
+        const docxBuffer = await fs.readFile(filePath);
+        const docxResult = await mammoth.extractRawText({ buffer: docxBuffer });
+        return {
+          success: true,
+          text: docxResult.value,
+          extractedLength: docxResult.value.length,
+          method: 'mammoth'
+        };
+        
+      case '.pdf':
+        // Extract text from PDF documents
+        const pdfBuffer = await fs.readFile(filePath);
+        const pdfResult = await pdfParse(pdfBuffer);
+        return {
+          success: true,
+          text: pdfResult.text,
+          extractedLength: pdfResult.text.length,
+          method: 'pdf-parse',
+          pages: pdfResult.numpages
+        };
+        
+      case '.xlsx':
+      case '.xls':
+        // Extract text from Excel files using ExcelJS
+        const workbook = new ExcelJS.Workbook();
+        await workbook.xlsx.readFile(filePath);
+        let excelText = '';
+        let sheetCount = 0;
+        
+        workbook.eachSheet((worksheet, sheetId) => {
+          sheetCount++;
+          excelText += `Sheet: ${worksheet.name}\n`;
+          
+          worksheet.eachRow((row, rowNumber) => {
+            const rowText = [];
+            row.eachCell((cell, colNumber) => {
+              // Extract cell value as text
+              let cellValue = cell.value;
+              if (cellValue !== null && cellValue !== undefined) {
+                // Handle different cell types
+                if (typeof cellValue === 'object' && cellValue.text) {
+                  // Rich text object
+                  cellValue = cellValue.text;
+                } else if (typeof cellValue === 'object' && cellValue.result) {
+                  // Formula result
+                  cellValue = cellValue.result;
+                }
+                rowText.push(String(cellValue));
+              }
+            });
+            if (rowText.length > 0) {
+              excelText += rowText.join('\t') + '\n';
+            }
+          });
+          excelText += '\n';
+        });
+        
+        return {
+          success: true,
+          text: excelText,
+          extractedLength: excelText.length,
+          method: 'exceljs',
+          sheets: sheetCount
+        };
+        
+      case '.txt':
+      case '.md':
+      case '.json':
+      case '.js':
+      case '.html':
+      case '.css':
+      case '.xml':
+      case '.csv':
+        // Read text files directly
+        const textContent = await fs.readFile(filePath, 'utf-8');
+        return {
+          success: true,
+          text: textContent,
+          extractedLength: textContent.length,
+          method: 'direct'
+        };
+        
+      default:
+        return {
+          success: false,
+          error: `Unsupported file type: ${extension}`,
+          text: null
+        };
+    }
+  } catch (error) {
+    console.error('Error extracting text from document:', error);
+    return {
+      success: false,
+      error: error.message,
+      text: null
+    };
+  }
+}

 // Document chunking configuration
 const CHUNK_SIZE = 20000; // Larger chunks for FormData uploads (20KB)
@@ -1017,9 +1127,20 @@ app.get('/api/files/:fileId/preview', requireAuth, async (req, res) => {
      return res.status(404).json({ success: false, error: 'File not found' });
    }

-    // Read file content
    const filePath = path.join(__dirname, file.path);
-    const fileContent = await fs.readFile(filePath, 'utf-8');
+    const fileExtension = path.extname(file.originalName).toLowerCase();
+    
+    // Try to extract text from the document
+    const extractionResult = await extractTextFromDocument(filePath, fileExtension);
+    
+    if (extractionResult.success) {
+      // Successfully extracted text
+      const extractedText = extractionResult.text;
+      
+      // Limit preview to first 5000 characters to avoid huge responses
+      const previewContent = extractedText.length > 5000 ? 
+        extractedText.substring(0, 5000) + '\n\n... (content truncated for preview)' : 
+        extractedText;
      
      res.json({
        success: true,
@@ -1028,9 +1149,72 @@ app.get('/api/files/:fileId/preview', requireAuth, async (req, res) => {
          originalName: file.originalName,
          size: file.size,
          uploadDate: file.uploadDate,
-        content: fileContent
+          content: previewContent,
+          previewType: 'extracted-text',
+          extractionInfo: {
+            method: extractionResult.method,
+            totalLength: extractionResult.extractedLength,
+            pages: extractionResult.pages || null,
+            sheets: extractionResult.sheets || null,
+            truncated: extractedText.length > 5000
+          },
+          message: `Text successfully extracted from ${fileExtension.toUpperCase()} file. ${extractedText.length > 5000 ? 'Preview truncated to first 5000 characters.' : ''}`
        }
      });
+    } else {
+      // Failed to extract text, fall back to file type detection
+      const textFormats = ['.txt', '.md', '.json', '.js', '.html', '.css', '.xml', '.csv'];
+      const binaryFormats = ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt'];
+      
+      if (textFormats.includes(fileExtension)) {
+        // Try reading as plain text (should have been handled by extraction, but fallback)
+        try {
+          const fileContent = await fs.readFile(filePath, 'utf-8');
+          const previewContent = fileContent.length > 5000 ? 
+            fileContent.substring(0, 5000) + '\n\n... (truncated)' : 
+            fileContent;
+          
+          res.json({
+            success: true,
+            file: {
+              id: file.id,
+              originalName: file.originalName,
+              size: file.size,
+              uploadDate: file.uploadDate,
+              content: previewContent,
+              previewType: 'text'
+            }
+          });
+        } catch (readError) {
+          res.json({
+            success: true,
+            file: {
+              id: file.id,
+              originalName: file.originalName,
+              size: file.size,
+              uploadDate: file.uploadDate,
+              content: 'File preview not available',
+              previewType: 'error',
+              message: `Error reading ${fileExtension.toUpperCase()} file: ${readError.message}`
+            }
+          });
+        }
+      } else {
+        // Binary format that couldn't be processed
+        res.json({
+          success: true,
+          file: {
+            id: file.id,
+            originalName: file.originalName,
+            size: file.size,
+            uploadDate: file.uploadDate,
+            content: 'Text extraction failed',
+            previewType: 'extraction-failed',
+            message: `Failed to extract text from ${fileExtension.toUpperCase()} file: ${extractionResult.error}. The file has been uploaded and may still be usable for AI processing.`
+          }
+        });
+      }
+    }
  } catch (error) {
    console.error('Error previewing file:', error);
    res.status(500).json({ 
--- a/views/dashboard.ejs
+++ b/views/dashboard.ejs
@@ -181,10 +181,8 @@ function previewFile(fileId) {
                const file = result.file;
                let content = '';
                
-                // Check file type and format content accordingly
-                const fileExtension = file.originalName.split('.').pop().toLowerCase();
-                
-                if (['txt', 'md', 'json', 'js', 'html', 'css', 'py', 'java', 'cpp', 'c'].includes(fileExtension)) {
+                // Handle different preview types
+                if (file.previewType === 'text') {
                    // Text-based files - show with syntax highlighting
                    content = `
                        <div class="mb-3">
@@ -198,8 +196,35 @@ function previewFile(fileId) {
                            <pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;"><code>${escapeHtml(file.content)}</code></pre>
                        </div>
                    `;
-                } else if (['pdf', 'doc', 'docx'].includes(fileExtension)) {
-                    // Document files - show basic info and content preview
+                } else if (file.previewType === 'extracted-text') {
+                    // Successfully extracted text from document
+                    const extractionInfo = file.extractionInfo || {};
+                    const infoText = [];
+                    
+                    if (extractionInfo.pages) infoText.push(`${extractionInfo.pages} pages`);
+                    if (extractionInfo.sheets) infoText.push(`${extractionInfo.sheets} sheets`);
+                    if (extractionInfo.totalLength) infoText.push(`${extractionInfo.totalLength} characters extracted`);
+                    
+                    content = `
+                        <div class="mb-3">
+                            <h6><i class="fas fa-file-word me-2"></i>${file.originalName}</h6>
+                            <small class="text-muted">
+                                Size: ${Math.round(file.size / 1024)} KB | 
+                                Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
+                                ${infoText.length > 0 ? ' | ' + infoText.join(', ') : ''}
+                            </small>
+                        </div>
+                        <div class="alert alert-success">
+                            <i class="fas fa-check-circle me-2"></i>
+                            ${file.message || 'Text successfully extracted from document'}
+                        </div>
+                        <div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
+                            <pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content)}</pre>
+                        </div>
+                        ${extractionInfo.truncated ? '<small class="text-muted mt-2 d-block"><i class="fas fa-info-circle me-1"></i>Full document content is available for AI processing</small>' : ''}
+                    `;
+                } else if (file.previewType === 'binary') {
+                    // Binary files - show info message
                    content = `
                        <div class="mb-3">
                            <h6><i class="fas fa-file-pdf me-2"></i>${file.originalName}</h6>
@@ -210,14 +235,47 @@ function previewFile(fileId) {
                        </div>
                        <div class="alert alert-info">
                            <i class="fas fa-info-circle me-2"></i>
-                            Document preview: First few lines of extracted text
+                            ${file.message || 'This is a binary file that has been processed for AI use.'}
                        </div>
+                        ${file.content && file.content !== 'File preview not available' ? `
                            <div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
-                            <pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content.substring(0, 1000))}${file.content.length > 1000 ? '...' : ''}</pre>
+                                <small class="text-muted">Extracted content preview:</small>
+                                <pre style="margin: 0; margin-top: 10px; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content)}</pre>
+                            </div>
+                        ` : ''}
+                    `;
+                } else if (file.previewType === 'extraction-failed') {
+                    // Failed to extract text
+                    content = `
+                        <div class="mb-3">
+                            <h6><i class="fas fa-file-exclamation me-2"></i>${file.originalName}</h6>
+                            <small class="text-muted">
+                                Size: ${Math.round(file.size / 1024)} KB | 
+                                Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
+                            </small>
+                        </div>
+                        <div class="alert alert-warning">
+                            <i class="fas fa-exclamation-triangle me-2"></i>
+                            ${file.message || 'Could not extract text from this file.'}
+                        </div>
+                    `;
+                } else if (file.previewType === 'error') {
+                    // Error reading file
+                    content = `
+                        <div class="mb-3">
+                            <h6><i class="fas fa-file-times me-2"></i>${file.originalName}</h6>
+                            <small class="text-muted">
+                                Size: ${Math.round(file.size / 1024)} KB | 
+                                Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
+                            </small>
+                        </div>
+                        <div class="alert alert-danger">
+                            <i class="fas fa-exclamation-circle me-2"></i>
+                            ${file.message || 'Error reading file.'}
                        </div>
                    `;
                } else {
-                    // Other files - show basic info
+                    // Fallback for unknown preview types
                    content = `
                        <div class="mb-3">
                            <h6><i class="fas fa-file me-2"></i>${file.originalName}</h6>