Improve text extraction in dashboard

This commit is contained in:
inubimambo
2025-07-08 21:00:17 +08:00
parent a87484b0e7
commit 9644f62dc5
4 changed files with 1193 additions and 22 deletions

206
server.js
View File

@@ -13,6 +13,116 @@ require('dotenv').config();
// Document processing utilities
const crypto = require('crypto');
const mammoth = require('mammoth'); // For .docx files
const pdfParse = require('pdf-parse'); // For PDF files
const ExcelJS = require('exceljs'); // For Excel files
/**
 * Convert one ExcelJS cell value into plain text.
 *
 * Handles the object-shaped values ExcelJS can return in addition to
 * primitives: rich text (`{ richText: [{ text }] }`), hyperlinks
 * (`{ text, hyperlink }`), formulas (`{ formula | sharedFormula, result }`)
 * and errors (`{ error }`).
 *
 * @param {*} cellValue - Raw `cell.value` as provided by ExcelJS.
 * @returns {string|null} Text for the cell, or null when the cell carries no
 *   value (empty cell, or a formula without a cached result).
 */
function cellValueToText(cellValue) {
  if (cellValue === null || cellValue === undefined) {
    return null;
  }
  // Primitives and dates stringify the same way the original code did.
  if (typeof cellValue !== 'object' || cellValue instanceof Date) {
    return String(cellValue);
  }
  // Rich text has no top-level `text`; concatenate the individual runs.
  if (Array.isArray(cellValue.richText)) {
    return cellValue.richText
      .map((run) => (run && typeof run.text === 'string' ? run.text : ''))
      .join('');
  }
  // Hyperlink-style value; the label itself may be a nested value.
  if (cellValue.text !== undefined && cellValue.text !== null) {
    return cellValueToText(cellValue.text);
  }
  // Formula: check for key presence, not truthiness, so results such as
  // 0, false or '' are kept instead of degrading to "[object Object]".
  if ('result' in cellValue || 'formula' in cellValue || 'sharedFormula' in cellValue) {
    return cellValueToText(cellValue.result);
  }
  if (typeof cellValue.error === 'string') {
    return cellValue.error;
  }
  return String(cellValue);
}

/**
 * Extract plain text from a document on disk, choosing the extraction
 * strategy from the file extension.
 *
 * Never throws: any failure is logged and reported through the returned
 * object so callers can fall back gracefully.
 *
 * @param {string} filePath - Path of the file to read.
 * @param {string} fileExtension - Extension including the leading dot
 *   (e.g. ".pdf"); matched case-insensitively.
 * @returns {Promise<object>} On success:
 *   `{ success: true, text, extractedLength, method }` plus `pages` (PDF) or
 *   `sheets` (Excel). On failure: `{ success: false, error, text: null }`.
 */
async function extractTextFromDocument(filePath, fileExtension) {
  try {
    const extension = fileExtension.toLowerCase();

    switch (extension) {
      case '.docx': {
        // Extract text from Word documents
        const docxBuffer = await fs.readFile(filePath);
        const docxResult = await mammoth.extractRawText({ buffer: docxBuffer });
        return {
          success: true,
          text: docxResult.value,
          extractedLength: docxResult.value.length,
          method: 'mammoth'
        };
      }

      case '.pdf': {
        // Extract text from PDF documents
        const pdfBuffer = await fs.readFile(filePath);
        const pdfResult = await pdfParse(pdfBuffer);
        return {
          success: true,
          text: pdfResult.text,
          extractedLength: pdfResult.text.length,
          method: 'pdf-parse',
          pages: pdfResult.numpages
        };
      }

      case '.xlsx':
      case '.xls': {
        // Extract text from Excel files using ExcelJS.
        // NOTE(review): the xlsx reader presumably cannot parse legacy binary
        // .xls files; those would fail here and be reported by the catch
        // below — confirm against the ExcelJS documentation.
        const workbook = new ExcelJS.Workbook();
        await workbook.xlsx.readFile(filePath);

        let excelText = '';
        let sheetCount = 0;

        workbook.eachSheet((worksheet) => {
          sheetCount++;
          excelText += `Sheet: ${worksheet.name}\n`;

          worksheet.eachRow((row) => {
            const rowText = [];
            row.eachCell((cell) => {
              const cellText = cellValueToText(cell.value);
              if (cellText !== null) {
                rowText.push(cellText);
              }
            });
            if (rowText.length > 0) {
              excelText += rowText.join('\t') + '\n';
            }
          });

          // Blank line between sheets
          excelText += '\n';
        });

        return {
          success: true,
          text: excelText,
          extractedLength: excelText.length,
          method: 'exceljs',
          sheets: sheetCount
        };
      }

      case '.txt':
      case '.md':
      case '.json':
      case '.js':
      case '.html':
      case '.css':
      case '.xml':
      case '.csv': {
        // Read text files directly
        const textContent = await fs.readFile(filePath, 'utf-8');
        return {
          success: true,
          text: textContent,
          extractedLength: textContent.length,
          method: 'direct'
        };
      }

      default:
        return {
          success: false,
          error: `Unsupported file type: ${extension}`,
          text: null
        };
    }
  } catch (error) {
    console.error('Error extracting text from document:', error);
    return {
      success: false,
      error: error.message,
      text: null
    };
  }
}
// Document chunking configuration
// NOTE(review): the unit of CHUNK_SIZE (characters vs. bytes) is not visible
// in this excerpt — confirm against the code that consumes it.
const CHUNK_SIZE = 20000; // Larger chunks for FormData uploads (20KB)
@@ -1017,20 +1127,94 @@ app.get('/api/files/:fileId/preview', requireAuth, async (req, res) => {
return res.status(404).json({ success: false, error: 'File not found' });
}
// Read file content
const filePath = path.join(__dirname, file.path);
const fileContent = await fs.readFile(filePath, 'utf-8');
const fileExtension = path.extname(file.originalName).toLowerCase();
res.json({
success: true,
file: {
id: file.id,
originalName: file.originalName,
size: file.size,
uploadDate: file.uploadDate,
content: fileContent
// Try to extract text from the document
const extractionResult = await extractTextFromDocument(filePath, fileExtension);
if (extractionResult.success) {
// Successfully extracted text
const extractedText = extractionResult.text;
// Limit preview to first 5000 characters to avoid huge responses
const previewContent = extractedText.length > 5000 ?
extractedText.substring(0, 5000) + '\n\n... (content truncated for preview)' :
extractedText;
res.json({
success: true,
file: {
id: file.id,
originalName: file.originalName,
size: file.size,
uploadDate: file.uploadDate,
content: previewContent,
previewType: 'extracted-text',
extractionInfo: {
method: extractionResult.method,
totalLength: extractionResult.extractedLength,
pages: extractionResult.pages || null,
sheets: extractionResult.sheets || null,
truncated: extractedText.length > 5000
},
message: `Text successfully extracted from ${fileExtension.toUpperCase()} file. ${extractedText.length > 5000 ? 'Preview truncated to first 5000 characters.' : ''}`
}
});
} else {
// Failed to extract text, fall back to file type detection
const textFormats = ['.txt', '.md', '.json', '.js', '.html', '.css', '.xml', '.csv'];
const binaryFormats = ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt'];
if (textFormats.includes(fileExtension)) {
// Try reading as plain text (should have been handled by extraction, but fallback)
try {
const fileContent = await fs.readFile(filePath, 'utf-8');
const previewContent = fileContent.length > 5000 ?
fileContent.substring(0, 5000) + '\n\n... (truncated)' :
fileContent;
res.json({
success: true,
file: {
id: file.id,
originalName: file.originalName,
size: file.size,
uploadDate: file.uploadDate,
content: previewContent,
previewType: 'text'
}
});
} catch (readError) {
res.json({
success: true,
file: {
id: file.id,
originalName: file.originalName,
size: file.size,
uploadDate: file.uploadDate,
content: 'File preview not available',
previewType: 'error',
message: `Error reading ${fileExtension.toUpperCase()} file: ${readError.message}`
}
});
}
} else {
// Binary format that couldn't be processed
res.json({
success: true,
file: {
id: file.id,
originalName: file.originalName,
size: file.size,
uploadDate: file.uploadDate,
content: 'Text extraction failed',
previewType: 'extraction-failed',
message: `Failed to extract text from ${fileExtension.toUpperCase()} file: ${extractionResult.error}. The file has been uploaded and may still be usable for AI processing.`
}
});
}
});
}
} catch (error) {
console.error('Error previewing file:', error);
res.status(500).json({