Improve text extraction in dashboard

2025-07-08 21:00:17 +08:00
parent a87484b0e7
commit 9644f62dc5
4 changed files with 1193 additions and 22 deletions
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -15,11 +15,14 @@
    "cors": "^2.8.5",
    "dotenv": "^16.4.5",
    "ejs": "^3.1.10",
    "exceljs": "^4.4.0",
    "express": "^4.19.2",
    "express-session": "^1.18.0",
    "form-data": "^4.0.3",
    "fs-extra": "^11.2.0",
    "mammoth": "^1.9.1",
    "multer": "^2.0.0",
    "pdf-parse": "^1.1.1",
    "uuid": "^10.0.0"
  },
  "devDependencies": {
--- a/server.js
+++ b/server.js
@@ -13,6 +13,116 @@ require('dotenv').config();
 // Document processing utilities
 const crypto = require('crypto');
 const mammoth = require('mammoth'); // For .docx files
 const pdfParse = require('pdf-parse'); // For PDF files
 const ExcelJS = require('exceljs'); // For Excel files
 /**
  * Convert an ExcelJS cell value into plain text.
  *
  * ExcelJS represents non-primitive cells as objects: rich text as
  * `{ richText: [{ text }, ...] }`, hyperlinks as `{ text, hyperlink }`,
  * formulas as `{ formula | sharedFormula, result }` and errors as
  * `{ error }`. Each of these is reduced to the text a user would see.
  *
  * @param {*} value - Raw `cell.value` as provided by ExcelJS.
  * @returns {string|null} Text for the cell, or null when there is nothing
  *   to show (empty cell, or a formula without a cached result).
  */
 function cellValueToText(value) {
   if (value === null || value === undefined) {
     return null;
   }
   // Primitives and dates stringify directly.
   if (typeof value !== 'object' || value instanceof Date) {
     return String(value);
   }
   // Rich text: concatenate the text of every run.
   if (Array.isArray(value.richText)) {
     return value.richText
       .map((run) => (run && run.text !== null && run.text !== undefined ? String(run.text) : ''))
       .join('');
   }
   // Hyperlink (or any object carrying display text); the text may itself be rich text.
   if (value.text !== null && value.text !== undefined) {
     return cellValueToText(value.text);
   }
   // Formula: use the cached result. Checking for presence instead of
   // truthiness keeps legitimate results such as 0, '' and false.
   if (value.result !== undefined) {
     return cellValueToText(value.result);
   }
   if ('formula' in value || 'sharedFormula' in value) {
     return null;
   }
   // Error value such as { error: '#N/A' }.
   if (typeof value.error === 'string') {
     return value.error;
   }
   return String(value);
 }

 /**
  * Extract plain text from a document on disk, choosing the extraction
  * strategy from the file extension.
  *
  * Supported: .docx (mammoth), .pdf (pdf-parse), .xlsx/.xls (ExcelJS) and a
  * set of plain-text formats that are read directly as UTF-8.
  *
  * @param {string} filePath - Path of the file to read.
  * @param {string} fileExtension - Extension including the leading dot
  *   (case-insensitive), e.g. ".pdf".
  * @returns {Promise<Object>} On success:
  *   `{ success: true, text, extractedLength, method }` plus `pages` for PDFs
  *   or `sheets` for Excel workbooks. On failure (unsupported type or any
  *   error while reading/parsing): `{ success: false, error, text: null }`.
  *   Errors are caught, logged, and reported through the result object.
  */
 async function extractTextFromDocument(filePath, fileExtension) {
   try {
     const extension = fileExtension.toLowerCase();
     switch (extension) {
       case '.docx': {
         // Extract text from Word documents
         const docxBuffer = await fs.readFile(filePath);
         const docxResult = await mammoth.extractRawText({ buffer: docxBuffer });
         return {
           success: true,
           text: docxResult.value,
           extractedLength: docxResult.value.length,
           method: 'mammoth'
         };
       }
       case '.pdf': {
         // Extract text from PDF documents
         const pdfBuffer = await fs.readFile(filePath);
         const pdfResult = await pdfParse(pdfBuffer);
         return {
           success: true,
           text: pdfResult.text,
           extractedLength: pdfResult.text.length,
           method: 'pdf-parse',
           pages: pdfResult.numpages
         };
       }
       case '.xlsx':
       case '.xls': {
         // Extract text from Excel files using ExcelJS.
         // NOTE(review): ExcelJS's xlsx reader targets the OOXML format; a
         // legacy binary .xls file is expected to fail here and be reported
         // through the catch block below - confirm whether that is intended.
         const workbook = new ExcelJS.Workbook();
         await workbook.xlsx.readFile(filePath);
         let excelText = '';
         let sheetCount = 0;
         workbook.eachSheet((worksheet) => {
           sheetCount++;
           excelText += `Sheet: ${worksheet.name}\n`;
           worksheet.eachRow((row) => {
             const rowText = [];
             row.eachCell((cell) => {
               const cellText = cellValueToText(cell.value);
               if (cellText !== null) {
                 rowText.push(cellText);
               }
             });
             // Rows are tab-separated; rows without any text are omitted.
             if (rowText.length > 0) {
               excelText += rowText.join('\t') + '\n';
             }
           });
           // Blank line between sheets.
           excelText += '\n';
         });
         return {
           success: true,
           text: excelText,
           extractedLength: excelText.length,
           method: 'exceljs',
           sheets: sheetCount
         };
       }
       case '.txt':
       case '.md':
       case '.json':
       case '.js':
       case '.html':
       case '.css':
       case '.xml':
       case '.csv': {
         // Read text files directly
         const textContent = await fs.readFile(filePath, 'utf-8');
         return {
           success: true,
           text: textContent,
           extractedLength: textContent.length,
           method: 'direct'
         };
       }
       default:
         return {
           success: false,
           error: `Unsupported file type: ${extension}`,
           text: null
         };
     }
   } catch (error) {
     console.error('Error extracting text from document:', error);
     return {
       success: false,
       error: error.message,
       text: null
     };
   }
 }
 // Document chunking configuration
 // Upper bound for a single document chunk.
 // NOTE(review): whether this is counted in characters or bytes depends on the
 // chunking code that consumes it, which is not visible here - confirm.
 const CHUNK_SIZE = 20000; // Larger chunks for FormData uploads (20KB)
@@ -1017,20 +1127,94 @@ app.get('/api/files/:fileId/preview', requireAuth, async (req, res) => {
      return res.status(404).json({ success: false, error: 'File not found' });
    }
    // Read file content
    const filePath = path.join(__dirname, file.path);
-    const fileContent = await fs.readFile(filePath, 'utf-8');
+    const fileExtension = path.extname(file.originalName).toLowerCase();
-    res.json({
+    // Try to extract text from the document
-      success: true,
+    const extractionResult = await extractTextFromDocument(filePath, fileExtension);
-      file: {
+    
-        id: file.id,
+    if (extractionResult.success) {
-        originalName: file.originalName,
+      // Successfully extracted text
-        size: file.size,
+      const extractedText = extractionResult.text;
-        uploadDate: file.uploadDate,
+      
-        content: fileContent
+      // Limit preview to first 5000 characters to avoid huge responses
      const previewContent = extractedText.length > 5000 ? 
        extractedText.substring(0, 5000) + '\n\n... (content truncated for preview)' : 
        extractedText;
      res.json({
        success: true,
        file: {
          id: file.id,
          originalName: file.originalName,
          size: file.size,
          uploadDate: file.uploadDate,
          content: previewContent,
          previewType: 'extracted-text',
          extractionInfo: {
            method: extractionResult.method,
            totalLength: extractionResult.extractedLength,
            pages: extractionResult.pages || null,
            sheets: extractionResult.sheets || null,
            truncated: extractedText.length > 5000
          },
          message: `Text successfully extracted from ${fileExtension.toUpperCase()} file. ${extractedText.length > 5000 ? 'Preview truncated to first 5000 characters.' : ''}`
        }
      });
    } else {
      // Failed to extract text, fall back to file type detection
      const textFormats = ['.txt', '.md', '.json', '.js', '.html', '.css', '.xml', '.csv'];
      const binaryFormats = ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt'];
      if (textFormats.includes(fileExtension)) {
        // Try reading as plain text (should have been handled by extraction, but fallback)
        try {
          const fileContent = await fs.readFile(filePath, 'utf-8');
          const previewContent = fileContent.length > 5000 ? 
            fileContent.substring(0, 5000) + '\n\n... (truncated)' : 
            fileContent;
          res.json({
            success: true,
            file: {
              id: file.id,
              originalName: file.originalName,
              size: file.size,
              uploadDate: file.uploadDate,
              content: previewContent,
              previewType: 'text'
            }
          });
        } catch (readError) {
          res.json({
            success: true,
            file: {
              id: file.id,
              originalName: file.originalName,
              size: file.size,
              uploadDate: file.uploadDate,
              content: 'File preview not available',
              previewType: 'error',
              message: `Error reading ${fileExtension.toUpperCase()} file: ${readError.message}`
            }
          });
        }
      } else {
        // Binary format that couldn't be processed
        res.json({
          success: true,
          file: {
            id: file.id,
            originalName: file.originalName,
            size: file.size,
            uploadDate: file.uploadDate,
            content: 'Text extraction failed',
            previewType: 'extraction-failed',
            message: `Failed to extract text from ${fileExtension.toUpperCase()} file: ${extractionResult.error}. The file has been uploaded and may still be usable for AI processing.`
          }
        });
      }
-    });
+    }
  } catch (error) {
    console.error('Error previewing file:', error);
    res.status(500).json({ 
--- a/views/dashboard.ejs
+++ b/views/dashboard.ejs
@@ -181,10 +181,8 @@ function previewFile(fileId) {
                const file = result.file;
                let content = '';
-                // Check file type and format content accordingly
+                // Handle different preview types
-                const fileExtension = file.originalName.split('.').pop().toLowerCase();
+                if (file.previewType === 'text') {
                if (['txt', 'md', 'json', 'js', 'html', 'css', 'py', 'java', 'cpp', 'c'].includes(fileExtension)) {
                    // Text-based files - show with syntax highlighting
                    content = `
                        <div class="mb-3">
@@ -198,8 +196,35 @@ function previewFile(fileId) {
                            <pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;"><code>${escapeHtml(file.content)}</code></pre>
                        </div>
                    `;
-                } else if (['pdf', 'doc', 'docx'].includes(fileExtension)) {
+                } else if (file.previewType === 'extracted-text') {
-                    // Document files - show basic info and content preview
+                    // Successfully extracted text from document
                    const extractionInfo = file.extractionInfo || {};
                    const infoText = [];
                    if (extractionInfo.pages) infoText.push(`${extractionInfo.pages} pages`);
                    if (extractionInfo.sheets) infoText.push(`${extractionInfo.sheets} sheets`);
                    if (extractionInfo.totalLength) infoText.push(`${extractionInfo.totalLength} characters extracted`);
                    content = `
                        <div class="mb-3">
                            <h6><i class="fas fa-file-word me-2"></i>${file.originalName}</h6>
                            <small class="text-muted">
                                Size: ${Math.round(file.size / 1024)} KB | 
                                Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
                                ${infoText.length > 0 ? ' | ' + infoText.join(', ') : ''}
                            </small>
                        </div>
                        <div class="alert alert-success">
                            <i class="fas fa-check-circle me-2"></i>
                            ${file.message || 'Text successfully extracted from document'}
                        </div>
                        <div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
                            <pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content)}</pre>
                        </div>
                        ${extractionInfo.truncated ? '<small class="text-muted mt-2 d-block"><i class="fas fa-info-circle me-1"></i>Full document content is available for AI processing</small>' : ''}
                    `;
                } else if (file.previewType === 'binary') {
                    // Binary files - show info message
                    content = `
                        <div class="mb-3">
                            <h6><i class="fas fa-file-pdf me-2"></i>${file.originalName}</h6>
@@ -210,14 +235,47 @@ function previewFile(fileId) {
                        </div>
                        <div class="alert alert-info">
                            <i class="fas fa-info-circle me-2"></i>
-                            Document preview: First few lines of extracted text
+                            ${file.message || 'This is a binary file that has been processed for AI use.'}
                        </div>
-                        <div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
+                        ${file.content && file.content !== 'File preview not available' ? `
-                            <pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content.substring(0, 1000))}${file.content.length > 1000 ? '...' : ''}</pre>
+                            <div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
                                <small class="text-muted">Extracted content preview:</small>
                                <pre style="margin: 0; margin-top: 10px; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content)}</pre>
                            </div>
                        ` : ''}
                    `;
                } else if (file.previewType === 'extraction-failed') {
                    // Failed to extract text
                    content = `
                        <div class="mb-3">
                            <h6><i class="fas fa-file-exclamation me-2"></i>${file.originalName}</h6>
                            <small class="text-muted">
                                Size: ${Math.round(file.size / 1024)} KB | 
                                Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
                            </small>
                        </div>
                        <div class="alert alert-warning">
                            <i class="fas fa-exclamation-triangle me-2"></i>
                            ${file.message || 'Could not extract text from this file.'}
                        </div>
                    `;
                } else if (file.previewType === 'error') {
                    // Error reading file
                    content = `
                        <div class="mb-3">
                            <h6><i class="fas fa-file-times me-2"></i>${file.originalName}</h6>
                            <small class="text-muted">
                                Size: ${Math.round(file.size / 1024)} KB | 
                                Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
                            </small>
                        </div>
                        <div class="alert alert-danger">
                            <i class="fas fa-exclamation-circle me-2"></i>
                            ${file.message || 'Error reading file.'}
                        </div>
                    `;
                } else {
-                    // Other files - show basic info
+                    // Fallback for unknown preview types
                    content = `
                        <div class="mb-3">
                            <h6><i class="fas fa-file me-2"></i>${file.originalName}</h6>