Improve text extraction in dashboard
This commit is contained in:
928
package-lock.json
generated
928
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -15,11 +15,14 @@
|
|||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dotenv": "^16.4.5",
|
"dotenv": "^16.4.5",
|
||||||
"ejs": "^3.1.10",
|
"ejs": "^3.1.10",
|
||||||
|
"exceljs": "^4.4.0",
|
||||||
"express": "^4.19.2",
|
"express": "^4.19.2",
|
||||||
"express-session": "^1.18.0",
|
"express-session": "^1.18.0",
|
||||||
"form-data": "^4.0.3",
|
"form-data": "^4.0.3",
|
||||||
"fs-extra": "^11.2.0",
|
"fs-extra": "^11.2.0",
|
||||||
|
"mammoth": "^1.9.1",
|
||||||
"multer": "^2.0.0",
|
"multer": "^2.0.0",
|
||||||
|
"pdf-parse": "^1.1.1",
|
||||||
"uuid": "^10.0.0"
|
"uuid": "^10.0.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
|||||||
206
server.js
206
server.js
@@ -13,6 +13,116 @@ require('dotenv').config();
|
|||||||
|
|
||||||
// Document processing utilities
|
// Document processing utilities
|
||||||
const crypto = require('crypto');
|
const crypto = require('crypto');
|
||||||
|
const mammoth = require('mammoth'); // For .docx files
|
||||||
|
const pdfParse = require('pdf-parse'); // For PDF files
|
||||||
|
const ExcelJS = require('exceljs'); // For Excel files
|
||||||
|
|
||||||
|
// Helper function to extract text from various document formats
|
||||||
|
/**
 * Convert one ExcelJS cell value into plain text.
 *
 * Per the ExcelJS documentation a cell value may be a primitive, a Date, or an
 * object such as `{ richText: [...] }`, `{ text, hyperlink }`,
 * `{ formula, result }` / `{ sharedFormula, result }` or `{ error }`.
 * Presence checks are used instead of truthiness so that legitimate falsy
 * payloads (a formula result of 0, false or '') are not rendered as
 * "[object Object]".
 *
 * @param {*} value - Raw `cell.value` as returned by ExcelJS.
 * @returns {string|null} Text for the cell, or null when the cell is empty.
 */
function excelCellValueToText(value) {
    if (value === null || value === undefined) {
        return null;
    }

    // Primitives and Dates keep the original String() rendering.
    if (typeof value !== 'object' || value instanceof Date) {
        return String(value);
    }

    // Rich text: concatenate the text of every run.
    if (Array.isArray(value.richText)) {
        return value.richText
            .map((run) => (run && run.text !== undefined && run.text !== null ? String(run.text) : ''))
            .join('');
    }

    // Hyperlink-style value; the text itself may again be a rich text object.
    if (value.text !== undefined && value.text !== null) {
        const linkText = excelCellValueToText(value.text);
        return linkText === null ? '' : linkText;
    }

    // Formula cells: use the cached result; an uncomputed formula yields ''.
    if ('result' in value || 'formula' in value || 'sharedFormula' in value) {
        const resultText = excelCellValueToText(value.result);
        return resultText === null ? '' : resultText;
    }

    // Error values such as '#N/A' or '#DIV/0!'.
    if (typeof value.error === 'string') {
        return value.error;
    }

    return String(value);
}

/**
 * Extract plain text from a document on disk, choosing the extraction
 * strategy from the file extension.
 *
 * Never throws: every failure is reported through the returned object.
 *
 * @param {string} filePath - Path of the file to read.
 * @param {string} fileExtension - Extension including the leading dot
 *   (for example '.pdf'); matched case-insensitively.
 * @returns {Promise<Object>} On success
 *   `{ success: true, text, extractedLength, method }` plus `pages` for PDFs
 *   and `sheets` for Excel workbooks; on failure
 *   `{ success: false, error, text: null }`.
 */
async function extractTextFromDocument(filePath, fileExtension) {
    // Reject a missing/non-string extension explicitly instead of relying on
    // a TypeError from toLowerCase() being caught and logged below.
    if (typeof fileExtension !== 'string') {
        return {
            success: false,
            error: `Unsupported file type: ${String(fileExtension)}`,
            text: null
        };
    }

    try {
        const extension = fileExtension.toLowerCase();

        switch (extension) {
            case '.docx': {
                // Extract text from Word documents
                const docxBuffer = await fs.readFile(filePath);
                const docxResult = await mammoth.extractRawText({ buffer: docxBuffer });
                return {
                    success: true,
                    text: docxResult.value,
                    extractedLength: docxResult.value.length,
                    method: 'mammoth'
                };
            }

            case '.pdf': {
                // Extract text from PDF documents
                const pdfBuffer = await fs.readFile(filePath);
                const pdfResult = await pdfParse(pdfBuffer);
                return {
                    success: true,
                    text: pdfResult.text,
                    extractedLength: pdfResult.text.length,
                    method: 'pdf-parse',
                    pages: pdfResult.numpages
                };
            }

            case '.xlsx':
            case '.xls': {
                // Extract text from Excel files using ExcelJS.
                // NOTE(review): ExcelJS documents support for the xlsx format;
                // a legacy binary .xls file is expected to fail in readFile and
                // be reported through the catch block below - confirm.
                const workbook = new ExcelJS.Workbook();
                await workbook.xlsx.readFile(filePath);

                let excelText = '';
                let sheetCount = 0;

                workbook.eachSheet((worksheet) => {
                    sheetCount++;
                    excelText += `Sheet: ${worksheet.name}\n`;

                    worksheet.eachRow((row) => {
                        const rowText = [];
                        row.eachCell((cell) => {
                            const cellText = excelCellValueToText(cell.value);
                            if (cellText !== null) {
                                rowText.push(cellText);
                            }
                        });
                        // Rows without any non-empty cell produce no output line.
                        if (rowText.length > 0) {
                            excelText += rowText.join('\t') + '\n';
                        }
                    });

                    // Blank line between sheets.
                    excelText += '\n';
                });

                return {
                    success: true,
                    text: excelText,
                    extractedLength: excelText.length,
                    method: 'exceljs',
                    sheets: sheetCount
                };
            }

            case '.txt':
            case '.md':
            case '.json':
            case '.js':
            case '.html':
            case '.css':
            case '.xml':
            case '.csv': {
                // Read text files directly
                const textContent = await fs.readFile(filePath, 'utf-8');
                return {
                    success: true,
                    text: textContent,
                    extractedLength: textContent.length,
                    method: 'direct'
                };
            }

            default:
                return {
                    success: false,
                    error: `Unsupported file type: ${extension}`,
                    text: null
                };
        }
    } catch (error) {
        // Any read/parse failure is converted into a failure result so that
        // callers can fall back without their own try/catch.
        console.error('Error extracting text from document:', error);
        return {
            success: false,
            error: error.message,
            text: null
        };
    }
}
|
||||||
|
|
||||||
// Document chunking configuration
|
// Document chunking configuration
|
||||||
const CHUNK_SIZE = 20000; // Larger chunks for FormData uploads (20KB)
|
const CHUNK_SIZE = 20000; // Larger chunks for FormData uploads (20KB)
|
||||||
@@ -1017,20 +1127,94 @@ app.get('/api/files/:fileId/preview', requireAuth, async (req, res) => {
|
|||||||
return res.status(404).json({ success: false, error: 'File not found' });
|
return res.status(404).json({ success: false, error: 'File not found' });
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read file content
|
|
||||||
const filePath = path.join(__dirname, file.path);
|
const filePath = path.join(__dirname, file.path);
|
||||||
const fileContent = await fs.readFile(filePath, 'utf-8');
|
const fileExtension = path.extname(file.originalName).toLowerCase();
|
||||||
|
|
||||||
res.json({
|
// Try to extract text from the document
|
||||||
success: true,
|
const extractionResult = await extractTextFromDocument(filePath, fileExtension);
|
||||||
file: {
|
|
||||||
id: file.id,
|
if (extractionResult.success) {
|
||||||
originalName: file.originalName,
|
// Successfully extracted text
|
||||||
size: file.size,
|
const extractedText = extractionResult.text;
|
||||||
uploadDate: file.uploadDate,
|
|
||||||
content: fileContent
|
// Limit preview to first 5000 characters to avoid huge responses
|
||||||
|
const previewContent = extractedText.length > 5000 ?
|
||||||
|
extractedText.substring(0, 5000) + '\n\n... (content truncated for preview)' :
|
||||||
|
extractedText;
|
||||||
|
|
||||||
|
res.json({
|
||||||
|
success: true,
|
||||||
|
file: {
|
||||||
|
id: file.id,
|
||||||
|
originalName: file.originalName,
|
||||||
|
size: file.size,
|
||||||
|
uploadDate: file.uploadDate,
|
||||||
|
content: previewContent,
|
||||||
|
previewType: 'extracted-text',
|
||||||
|
extractionInfo: {
|
||||||
|
method: extractionResult.method,
|
||||||
|
totalLength: extractionResult.extractedLength,
|
||||||
|
pages: extractionResult.pages || null,
|
||||||
|
sheets: extractionResult.sheets || null,
|
||||||
|
truncated: extractedText.length > 5000
|
||||||
|
},
|
||||||
|
message: `Text successfully extracted from ${fileExtension.toUpperCase()} file. ${extractedText.length > 5000 ? 'Preview truncated to first 5000 characters.' : ''}`
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
// Failed to extract text, fall back to file type detection
|
||||||
|
const textFormats = ['.txt', '.md', '.json', '.js', '.html', '.css', '.xml', '.csv'];
|
||||||
|
const binaryFormats = ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt'];
|
||||||
|
|
||||||
|
if (textFormats.includes(fileExtension)) {
|
||||||
|
// Try reading as plain text (should have been handled by extraction, but fallback)
|
||||||
|
try {
|
||||||
|
const fileContent = await fs.readFile(filePath, 'utf-8');
|
||||||
|
const previewContent = fileContent.length > 5000 ?
|
||||||
|
fileContent.substring(0, 5000) + '\n\n... (truncated)' :
|
||||||
|
fileContent;
|
||||||
|
|
||||||
|
res.json({
|
||||||
|
success: true,
|
||||||
|
file: {
|
||||||
|
id: file.id,
|
||||||
|
originalName: file.originalName,
|
||||||
|
size: file.size,
|
||||||
|
uploadDate: file.uploadDate,
|
||||||
|
content: previewContent,
|
||||||
|
previewType: 'text'
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (readError) {
|
||||||
|
res.json({
|
||||||
|
success: true,
|
||||||
|
file: {
|
||||||
|
id: file.id,
|
||||||
|
originalName: file.originalName,
|
||||||
|
size: file.size,
|
||||||
|
uploadDate: file.uploadDate,
|
||||||
|
content: 'File preview not available',
|
||||||
|
previewType: 'error',
|
||||||
|
message: `Error reading ${fileExtension.toUpperCase()} file: ${readError.message}`
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Binary format that couldn't be processed
|
||||||
|
res.json({
|
||||||
|
success: true,
|
||||||
|
file: {
|
||||||
|
id: file.id,
|
||||||
|
originalName: file.originalName,
|
||||||
|
size: file.size,
|
||||||
|
uploadDate: file.uploadDate,
|
||||||
|
content: 'Text extraction failed',
|
||||||
|
previewType: 'extraction-failed',
|
||||||
|
message: `Failed to extract text from ${fileExtension.toUpperCase()} file: ${extractionResult.error}. The file has been uploaded and may still be usable for AI processing.`
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error previewing file:', error);
|
console.error('Error previewing file:', error);
|
||||||
res.status(500).json({
|
res.status(500).json({
|
||||||
|
|||||||
@@ -181,10 +181,8 @@ function previewFile(fileId) {
|
|||||||
const file = result.file;
|
const file = result.file;
|
||||||
let content = '';
|
let content = '';
|
||||||
|
|
||||||
// Check file type and format content accordingly
|
// Handle different preview types
|
||||||
const fileExtension = file.originalName.split('.').pop().toLowerCase();
|
if (file.previewType === 'text') {
|
||||||
|
|
||||||
if (['txt', 'md', 'json', 'js', 'html', 'css', 'py', 'java', 'cpp', 'c'].includes(fileExtension)) {
|
|
||||||
// Text-based files - show with syntax highlighting
|
// Text-based files - show with syntax highlighting
|
||||||
content = `
|
content = `
|
||||||
<div class="mb-3">
|
<div class="mb-3">
|
||||||
@@ -198,8 +196,35 @@ function previewFile(fileId) {
|
|||||||
<pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;"><code>${escapeHtml(file.content)}</code></pre>
|
<pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;"><code>${escapeHtml(file.content)}</code></pre>
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
} else if (['pdf', 'doc', 'docx'].includes(fileExtension)) {
|
} else if (file.previewType === 'extracted-text') {
|
||||||
// Document files - show basic info and content preview
|
// Successfully extracted text from document
|
||||||
|
const extractionInfo = file.extractionInfo || {};
|
||||||
|
const infoText = [];
|
||||||
|
|
||||||
|
if (extractionInfo.pages) infoText.push(`${extractionInfo.pages} pages`);
|
||||||
|
if (extractionInfo.sheets) infoText.push(`${extractionInfo.sheets} sheets`);
|
||||||
|
if (extractionInfo.totalLength) infoText.push(`${extractionInfo.totalLength} characters extracted`);
|
||||||
|
|
||||||
|
content = `
|
||||||
|
<div class="mb-3">
|
||||||
|
<h6><i class="fas fa-file-word me-2"></i>${file.originalName}</h6>
|
||||||
|
<small class="text-muted">
|
||||||
|
Size: ${Math.round(file.size / 1024)} KB |
|
||||||
|
Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
|
||||||
|
${infoText.length > 0 ? ' | ' + infoText.join(', ') : ''}
|
||||||
|
</small>
|
||||||
|
</div>
|
||||||
|
<div class="alert alert-success">
|
||||||
|
<i class="fas fa-check-circle me-2"></i>
|
||||||
|
${file.message || 'Text successfully extracted from document'}
|
||||||
|
</div>
|
||||||
|
<div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
|
||||||
|
<pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content)}</pre>
|
||||||
|
</div>
|
||||||
|
${extractionInfo.truncated ? '<small class="text-muted mt-2 d-block"><i class="fas fa-info-circle me-1"></i>Full document content is available for AI processing</small>' : ''}
|
||||||
|
`;
|
||||||
|
} else if (file.previewType === 'binary') {
|
||||||
|
// Binary files - show info message
|
||||||
content = `
|
content = `
|
||||||
<div class="mb-3">
|
<div class="mb-3">
|
||||||
<h6><i class="fas fa-file-pdf me-2"></i>${file.originalName}</h6>
|
<h6><i class="fas fa-file-pdf me-2"></i>${file.originalName}</h6>
|
||||||
@@ -210,14 +235,47 @@ function previewFile(fileId) {
|
|||||||
</div>
|
</div>
|
||||||
<div class="alert alert-info">
|
<div class="alert alert-info">
|
||||||
<i class="fas fa-info-circle me-2"></i>
|
<i class="fas fa-info-circle me-2"></i>
|
||||||
Document preview: First few lines of extracted text
|
${file.message || 'This is a binary file that has been processed for AI use.'}
|
||||||
</div>
|
</div>
|
||||||
<div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
|
${file.content && file.content !== 'File preview not available' ? `
|
||||||
<pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content.substring(0, 1000))}${file.content.length > 1000 ? '...' : ''}</pre>
|
<div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
|
||||||
|
<small class="text-muted">Extracted content preview:</small>
|
||||||
|
<pre style="margin: 0; margin-top: 10px; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content)}</pre>
|
||||||
|
</div>
|
||||||
|
` : ''}
|
||||||
|
`;
|
||||||
|
} else if (file.previewType === 'extraction-failed') {
|
||||||
|
// Failed to extract text
|
||||||
|
content = `
|
||||||
|
<div class="mb-3">
|
||||||
|
<h6><i class="fas fa-file-exclamation me-2"></i>${file.originalName}</h6>
|
||||||
|
<small class="text-muted">
|
||||||
|
Size: ${Math.round(file.size / 1024)} KB |
|
||||||
|
Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
|
||||||
|
</small>
|
||||||
|
</div>
|
||||||
|
<div class="alert alert-warning">
|
||||||
|
<i class="fas fa-exclamation-triangle me-2"></i>
|
||||||
|
${file.message || 'Could not extract text from this file.'}
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
} else if (file.previewType === 'error') {
|
||||||
|
// Error reading file
|
||||||
|
content = `
|
||||||
|
<div class="mb-3">
|
||||||
|
<h6><i class="fas fa-file-times me-2"></i>${file.originalName}</h6>
|
||||||
|
<small class="text-muted">
|
||||||
|
Size: ${Math.round(file.size / 1024)} KB |
|
||||||
|
Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
|
||||||
|
</small>
|
||||||
|
</div>
|
||||||
|
<div class="alert alert-danger">
|
||||||
|
<i class="fas fa-exclamation-circle me-2"></i>
|
||||||
|
${file.message || 'Error reading file.'}
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
} else {
|
} else {
|
||||||
// Other files - show basic info
|
// Fallback for unknown preview types
|
||||||
content = `
|
content = `
|
||||||
<div class="mb-3">
|
<div class="mb-3">
|
||||||
<h6><i class="fas fa-file me-2"></i>${file.originalName}</h6>
|
<h6><i class="fas fa-file me-2"></i>${file.originalName}</h6>
|
||||||
|
|||||||
Reference in New Issue
Block a user