Improve text extraction in dashboard
This commit is contained in:
206
server.js
206
server.js
@@ -13,6 +13,116 @@ require('dotenv').config();
|
||||
|
||||
// Document processing utilities
|
||||
const crypto = require('crypto');
|
||||
const mammoth = require('mammoth'); // For .docx files
|
||||
const pdfParse = require('pdf-parse'); // For PDF files
|
||||
const ExcelJS = require('exceljs'); // For Excel files
|
||||
|
||||
// Helper function to extract text from various document formats
|
||||
/**
 * Extract plain text from a document, choosing the reader from its extension.
 *
 * Supported extensions (matched case-insensitively):
 *   - `.docx`            -> mammoth raw-text extraction
 *   - `.pdf`             -> pdf-parse
 *   - `.xlsx`, `.xls`    -> ExcelJS workbook reader
 *   - `.txt`, `.md`, `.json`, `.js`, `.html`, `.css`, `.xml`, `.csv` -> read as UTF-8
 *
 * This function never rejects: any failure (unreadable file, parser error,
 * non-string `fileExtension`) is logged and reported in the returned object.
 *
 * @param {string} filePath - Path of the file to read.
 * @param {string} fileExtension - Extension including the leading dot, e.g. `.pdf`.
 * @returns {Promise<object>} On success:
 *   `{ success: true, text, extractedLength, method }` plus `pages` (PDF) or
 *   `sheets` (Excel). On failure: `{ success: false, error, text: null }`.
 */
async function extractTextFromDocument(filePath, fileExtension) {
    try {
        const extension = fileExtension.toLowerCase();

        // Each case is wrapped in braces so its `const`/`let` declarations are
        // scoped to that case instead of leaking across the whole switch.
        switch (extension) {
            case '.docx': {
                // Extract text from Word documents
                const docxBuffer = await fs.readFile(filePath);
                const docxResult = await mammoth.extractRawText({ buffer: docxBuffer });
                return {
                    success: true,
                    text: docxResult.value,
                    extractedLength: docxResult.value.length,
                    method: 'mammoth'
                };
            }

            case '.pdf': {
                // Extract text from PDF documents
                const pdfBuffer = await fs.readFile(filePath);
                const pdfResult = await pdfParse(pdfBuffer);
                return {
                    success: true,
                    text: pdfResult.text,
                    extractedLength: pdfResult.text.length,
                    method: 'pdf-parse',
                    pages: pdfResult.numpages
                };
            }

            case '.xlsx':
            case '.xls': {
                // Extract text from Excel files using ExcelJS.
                // NOTE(review): this goes through the xlsx reader for both
                // extensions; a genuine legacy binary .xls file will presumably
                // fail to parse and be reported via the catch block below.
                const workbook = new ExcelJS.Workbook();
                await workbook.xlsx.readFile(filePath);

                let excelText = '';
                let sheetCount = 0;

                workbook.eachSheet((worksheet) => {
                    sheetCount += 1;
                    excelText += `Sheet: ${worksheet.name}\n`;

                    worksheet.eachRow((row) => {
                        const rowText = [];
                        row.eachCell((cell) => {
                            const cellText = cellValueToText(cell.value);
                            if (cellText !== null) {
                                rowText.push(cellText);
                            }
                        });
                        // One tab-separated line per non-empty row
                        if (rowText.length > 0) {
                            excelText += `${rowText.join('\t')}\n`;
                        }
                    });

                    // Blank line between sheets
                    excelText += '\n';
                });

                return {
                    success: true,
                    text: excelText,
                    extractedLength: excelText.length,
                    method: 'exceljs',
                    sheets: sheetCount
                };
            }

            case '.txt':
            case '.md':
            case '.json':
            case '.js':
            case '.html':
            case '.css':
            case '.xml':
            case '.csv': {
                // Read text files directly
                const textContent = await fs.readFile(filePath, 'utf-8');
                return {
                    success: true,
                    text: textContent,
                    extractedLength: textContent.length,
                    method: 'direct'
                };
            }

            default:
                return {
                    success: false,
                    error: `Unsupported file type: ${extension}`,
                    text: null
                };
        }
    } catch (error) {
        console.error('Error extracting text from document:', error);
        return {
            success: false,
            error: error.message,
            text: null
        };
    }
}

/**
 * Convert one spreadsheet cell value into display text.
 *
 * Handles primitives, Dates, and the object shapes a cell value can take:
 * rich text (`{ richText: [{ text }, ...] }`), hyperlinks (`{ text, hyperlink }`),
 * formulas (`{ formula, result }`) and errors (`{ error }`).
 *
 * @param {*} value - The raw cell value.
 * @returns {string|null} The text for the cell, or `null` when the cell is
 *   empty (`null`/`undefined`) and should be skipped.
 */
function cellValueToText(value) {
    if (value === null || value === undefined) {
        return null;
    }
    if (typeof value !== 'object' || value instanceof Date) {
        return String(value);
    }
    if (Array.isArray(value.richText)) {
        // Rich text is stored as a list of formatted runs; join their text.
        return value.richText
            .map((run) => (run && run.text !== undefined && run.text !== null ? String(run.text) : ''))
            .join('');
    }
    if (value.text !== undefined && value.text !== null) {
        // Hyperlink-style value; its text may itself be a rich-text object.
        const linkText = cellValueToText(value.text);
        return linkText === null ? '' : linkText;
    }
    if (value.result !== undefined && value.result !== null) {
        // Formula with a cached result. Compared against null/undefined rather
        // than by truthiness so results such as 0, '' or false are kept.
        return cellValueToText(value.result);
    }
    if (typeof value.error === 'string') {
        // Error value such as '#N/A' or '#DIV/0!'
        return value.error;
    }
    if (typeof value.formula === 'string') {
        // Formula without a cached result: show the expression itself.
        return `=${value.formula}`;
    }
    return String(value);
}
|
||||
|
||||
// Document chunking configuration
|
||||
// Chunk size used by the document chunking configuration: 20000, described as
// 20KB; the larger size is intended for FormData uploads.
// NOTE(review): the chunking code is not visible here — confirm whether this
// value is counted in bytes or in characters.
const CHUNK_SIZE = 20000; // Larger chunks for FormData uploads (20KB)
|
||||
@@ -1017,20 +1127,94 @@ app.get('/api/files/:fileId/preview', requireAuth, async (req, res) => {
|
||||
return res.status(404).json({ success: false, error: 'File not found' });
|
||||
}
|
||||
|
||||
// Read file content
|
||||
const filePath = path.join(__dirname, file.path);
|
||||
const fileContent = await fs.readFile(filePath, 'utf-8');
|
||||
const fileExtension = path.extname(file.originalName).toLowerCase();
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
file: {
|
||||
id: file.id,
|
||||
originalName: file.originalName,
|
||||
size: file.size,
|
||||
uploadDate: file.uploadDate,
|
||||
content: fileContent
|
||||
// Try to extract text from the document
|
||||
const extractionResult = await extractTextFromDocument(filePath, fileExtension);
|
||||
|
||||
if (extractionResult.success) {
|
||||
// Successfully extracted text
|
||||
const extractedText = extractionResult.text;
|
||||
|
||||
// Limit preview to first 5000 characters to avoid huge responses
|
||||
const previewContent = extractedText.length > 5000 ?
|
||||
extractedText.substring(0, 5000) + '\n\n... (content truncated for preview)' :
|
||||
extractedText;
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
file: {
|
||||
id: file.id,
|
||||
originalName: file.originalName,
|
||||
size: file.size,
|
||||
uploadDate: file.uploadDate,
|
||||
content: previewContent,
|
||||
previewType: 'extracted-text',
|
||||
extractionInfo: {
|
||||
method: extractionResult.method,
|
||||
totalLength: extractionResult.extractedLength,
|
||||
pages: extractionResult.pages || null,
|
||||
sheets: extractionResult.sheets || null,
|
||||
truncated: extractedText.length > 5000
|
||||
},
|
||||
message: `Text successfully extracted from ${fileExtension.toUpperCase()} file. ${extractedText.length > 5000 ? 'Preview truncated to first 5000 characters.' : ''}`
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// Failed to extract text, fall back to file type detection
|
||||
const textFormats = ['.txt', '.md', '.json', '.js', '.html', '.css', '.xml', '.csv'];
|
||||
const binaryFormats = ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt'];
|
||||
|
||||
if (textFormats.includes(fileExtension)) {
|
||||
// Try reading as plain text (should have been handled by extraction, but fallback)
|
||||
try {
|
||||
const fileContent = await fs.readFile(filePath, 'utf-8');
|
||||
const previewContent = fileContent.length > 5000 ?
|
||||
fileContent.substring(0, 5000) + '\n\n... (truncated)' :
|
||||
fileContent;
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
file: {
|
||||
id: file.id,
|
||||
originalName: file.originalName,
|
||||
size: file.size,
|
||||
uploadDate: file.uploadDate,
|
||||
content: previewContent,
|
||||
previewType: 'text'
|
||||
}
|
||||
});
|
||||
} catch (readError) {
|
||||
res.json({
|
||||
success: true,
|
||||
file: {
|
||||
id: file.id,
|
||||
originalName: file.originalName,
|
||||
size: file.size,
|
||||
uploadDate: file.uploadDate,
|
||||
content: 'File preview not available',
|
||||
previewType: 'error',
|
||||
message: `Error reading ${fileExtension.toUpperCase()} file: ${readError.message}`
|
||||
}
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// Binary format that couldn't be processed
|
||||
res.json({
|
||||
success: true,
|
||||
file: {
|
||||
id: file.id,
|
||||
originalName: file.originalName,
|
||||
size: file.size,
|
||||
uploadDate: file.uploadDate,
|
||||
content: 'Text extraction failed',
|
||||
previewType: 'extraction-failed',
|
||||
message: `Failed to extract text from ${fileExtension.toUpperCase()} file: ${extractionResult.error}. The file has been uploaded and may still be usable for AI processing.`
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error previewing file:', error);
|
||||
res.status(500).json({
|
||||
|
||||
Reference in New Issue
Block a user