Improve text extraction in dashboard

2025-07-08 21:00:17 +08:00
parent a87484b0e7
commit 9644f62dc5
4 changed files with 1193 additions and 22 deletions
--- a/server.js
+++ b/server.js
@@ -13,6 +13,116 @@ require('dotenv').config();

 // Document processing utilities
 const crypto = require('crypto');
+const mammoth = require('mammoth'); // For .docx files
+const pdfParse = require('pdf-parse'); // For PDF files
+const ExcelJS = require('exceljs'); // For Excel files
+
+/**
+ * Extract plain text from a document on disk, choosing a parser by file extension.
+ * Never throws: any failure is logged and returned as { success: false, error, text: null }.
+ * @param {string} filePath - Path of the file to read.
+ * @param {string} fileExtension - Extension including the leading dot (e.g. '.pdf'); any case.
+ * @returns {Promise<object>} On success { success: true, text, extractedLength, method },
+ *   plus pages for PDFs or sheets for Excel workbooks.
+ */
+async function extractTextFromDocument(filePath, fileExtension) {
+  // Flatten an ExcelJS cell value to a string; null means "nothing to show".
+  const cellToText = (value) => {
+    if (value === null || value === undefined) return null;
+    if (typeof value !== 'object' || value instanceof Date) return String(value);
+    // Rich text is { richText: [{ text }, ...] }; join() renders a missing text as ''.
+    if (Array.isArray(value.richText)) return value.richText.map((run) => run.text).join('');
+    // Formula cells are { formula | sharedFormula, result }. Test for the keys rather than
+    // truthiness so that a result of 0, '' or false is kept.
+    if ('result' in value || 'formula' in value || 'sharedFormula' in value) {
+      return cellToText(value.result);
+    }
+    if ('text' in value) return cellToText(value.text); // hyperlink: { text, hyperlink }
+    if ('error' in value) return String(value.error); // error: { error: '#N/A' }
+    return String(value);
+  };
+
+  try {
+    const extension = fileExtension.toLowerCase();
+
+    switch (extension) {
+      case '.docx': {
+        // Word documents: mammoth returns the raw text in the result's value field.
+        const docxBuffer = await fs.readFile(filePath);
+        const { value: docxText } = await mammoth.extractRawText({ buffer: docxBuffer });
+        return { success: true, text: docxText, extractedLength: docxText.length, method: 'mammoth' };
+      }
+
+      case '.pdf': {
+        // PDF documents: pdf-parse exposes the text and the page count.
+        const pdfBuffer = await fs.readFile(filePath);
+        const pdfResult = await pdfParse(pdfBuffer);
+        return {
+          success: true,
+          text: pdfResult.text,
+          extractedLength: pdfResult.text.length,
+          method: 'pdf-parse',
+          pages: pdfResult.numpages
+        };
+      }
+
+      case '.xlsx':
+      case '.xls': {
+        // NOTE(review): the xlsx reader likely rejects legacy binary .xls; that surfaces via catch — confirm.
+        const workbook = new ExcelJS.Workbook();
+        await workbook.xlsx.readFile(filePath);
+        let excelText = '';
+        let sheetCount = 0;
+
+        workbook.eachSheet((worksheet) => {
+          sheetCount++;
+          excelText += `Sheet: ${worksheet.name}\n`;
+
+          worksheet.eachRow((row) => {
+            const rowText = [];
+            row.eachCell((cell) => {
+              const cellText = cellToText(cell.value);
+              if (cellText !== null) {
+                rowText.push(cellText);
+              }
+            });
+            if (rowText.length > 0) {
+              excelText += rowText.join('\t') + '\n';
+            }
+          });
+          excelText += '\n';
+        });
+
+        return {
+          success: true,
+          text: excelText,
+          extractedLength: excelText.length,
+          method: 'exceljs',
+          sheets: sheetCount
+        };
+      }
+
+      case '.txt':
+      case '.md':
+      case '.json':
+      case '.js':
+      case '.html':
+      case '.css':
+      case '.xml':
+      case '.csv': {
+        // Plain-text formats are read directly as UTF-8.
+        const textContent = await fs.readFile(filePath, 'utf-8');
+        return { success: true, text: textContent, extractedLength: textContent.length, method: 'direct' };
+      }
+
+      default:
+        return { success: false, error: `Unsupported file type: ${extension}`, text: null };
+    }
+  } catch (error) {
+    console.error('Error extracting text from document:', error);
+    return { success: false, error: error.message, text: null };
+  }
+}

 // Document chunking configuration
 const CHUNK_SIZE = 20000; // Larger chunks for FormData uploads (20KB)
@@ -1017,20 +1127,94 @@ app.get('/api/files/:fileId/preview', requireAuth, async (req, res) => {
      return res.status(404).json({ success: false, error: 'File not found' });
    }

-    // Read file content
    const filePath = path.join(__dirname, file.path);
-    const fileContent = await fs.readFile(filePath, 'utf-8');
+    const fileExtension = path.extname(file.originalName).toLowerCase();
    
-    res.json({
-      success: true,
-      file: {
-        id: file.id,
-        originalName: file.originalName,
-        size: file.size,
-        uploadDate: file.uploadDate,
-        content: fileContent
+    // Try to extract text from the document
+    const extractionResult = await extractTextFromDocument(filePath, fileExtension);
+    
+    if (extractionResult.success) {
+      // Successfully extracted text
+      const extractedText = extractionResult.text;
+      
+      // Limit preview to first 5000 characters to avoid huge responses
+      const previewContent = extractedText.length > 5000 ? 
+        extractedText.substring(0, 5000) + '\n\n... (content truncated for preview)' : 
+        extractedText;
+      
+      res.json({
+        success: true,
+        file: {
+          id: file.id,
+          originalName: file.originalName,
+          size: file.size,
+          uploadDate: file.uploadDate,
+          content: previewContent,
+          previewType: 'extracted-text',
+          extractionInfo: {
+            method: extractionResult.method,
+            totalLength: extractionResult.extractedLength,
+            pages: extractionResult.pages || null,
+            sheets: extractionResult.sheets || null,
+            truncated: extractedText.length > 5000
+          },
+          message: `Text successfully extracted from ${fileExtension.toUpperCase()} file. ${extractedText.length > 5000 ? 'Preview truncated to first 5000 characters.' : ''}`
+        }
+      });
+    } else {
+      // Failed to extract text, fall back to file type detection
+      const textFormats = ['.txt', '.md', '.json', '.js', '.html', '.css', '.xml', '.csv'];
+      const binaryFormats = ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt'];
+      
+      if (textFormats.includes(fileExtension)) {
+        // Try reading as plain text (should have been handled by extraction, but fallback)
+        try {
+          const fileContent = await fs.readFile(filePath, 'utf-8');
+          const previewContent = fileContent.length > 5000 ? 
+            fileContent.substring(0, 5000) + '\n\n... (truncated)' : 
+            fileContent;
+          
+          res.json({
+            success: true,
+            file: {
+              id: file.id,
+              originalName: file.originalName,
+              size: file.size,
+              uploadDate: file.uploadDate,
+              content: previewContent,
+              previewType: 'text'
+            }
+          });
+        } catch (readError) {
+          res.json({
+            success: true,
+            file: {
+              id: file.id,
+              originalName: file.originalName,
+              size: file.size,
+              uploadDate: file.uploadDate,
+              content: 'File preview not available',
+              previewType: 'error',
+              message: `Error reading ${fileExtension.toUpperCase()} file: ${readError.message}`
+            }
+          });
+        }
+      } else {
+        // Binary format that couldn't be processed
+        res.json({
+          success: true,
+          file: {
+            id: file.id,
+            originalName: file.originalName,
+            size: file.size,
+            uploadDate: file.uploadDate,
+            content: 'Text extraction failed',
+            previewType: 'extraction-failed',
+            message: `Failed to extract text from ${fileExtension.toUpperCase()} file: ${extractionResult.error}. The file has been uploaded and may still be usable for AI processing.`
+          }
+        });
      }
-    });
+    }
  } catch (error) {
    console.error('Error previewing file:', error);
    res.status(500).json({