Improve text extraction in dashboard
This commit is contained in:
928
package-lock.json
generated
928
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -15,11 +15,14 @@
|
||||
"cors": "^2.8.5",
|
||||
"dotenv": "^16.4.5",
|
||||
"ejs": "^3.1.10",
|
||||
"exceljs": "^4.4.0",
|
||||
"express": "^4.19.2",
|
||||
"express-session": "^1.18.0",
|
||||
"form-data": "^4.0.3",
|
||||
"fs-extra": "^11.2.0",
|
||||
"mammoth": "^1.9.1",
|
||||
"multer": "^2.0.0",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"uuid": "^10.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
||||
190
server.js
190
server.js
@@ -13,6 +13,116 @@ require('dotenv').config();
|
||||
|
||||
// Document processing utilities
|
||||
const crypto = require('crypto');
|
||||
const mammoth = require('mammoth'); // For .docx files
|
||||
const pdfParse = require('pdf-parse'); // For PDF files
|
||||
const ExcelJS = require('exceljs'); // For Excel files
|
||||
|
||||
// Helper function to extract text from various document formats
|
||||
/**
 * Convert a single ExcelJS cell value into plain text.
 *
 * Primitive values and Date objects are stringified with String(). Object
 * values are unwrapped by shape: rich text (`{ richText: [{ text }] }`),
 * hyperlinks (`{ text, hyperlink }`), formulas (`{ formula, result }`) and
 * errors (`{ error }`).
 *
 * @param {*} cellValue - Raw `cell.value` as provided by ExcelJS.
 * @returns {string|null} Text for the cell, or null when the cell carries no
 *   displayable value (null/undefined, or a formula without a cached result).
 */
function excelCellValueToText(cellValue) {
  if (cellValue === null || cellValue === undefined) {
    return null;
  }
  if (typeof cellValue !== 'object' || cellValue instanceof Date) {
    return String(cellValue);
  }
  if (Array.isArray(cellValue.richText)) {
    // Rich text is stored as a list of formatted runs; concatenate their text.
    return cellValue.richText
      .map((run) => (run && run.text !== null && run.text !== undefined ? String(run.text) : ''))
      .join('');
  }
  if ('text' in cellValue) {
    // Hyperlink cell: the visible label lives in `text` (it may itself be rich text).
    return excelCellValueToText(cellValue.text);
  }
  if ('result' in cellValue) {
    // Formula cell: use the cached result. Key presence is checked instead of
    // truthiness so that results such as 0, false or '' are not lost.
    return excelCellValueToText(cellValue.result);
  }
  if ('formula' in cellValue || 'sharedFormula' in cellValue) {
    // Formula that has no cached result: nothing meaningful to show.
    return null;
  }
  if ('error' in cellValue) {
    return String(cellValue.error);
  }
  return String(cellValue);
}

/**
 * Extract plain text from a document on disk, choosing the extraction method
 * from the file extension.
 *
 * Supported extensions: `.docx` (mammoth), `.pdf` (pdf-parse), `.xlsx`/`.xls`
 * (ExcelJS) and the plain-text types `.txt`, `.md`, `.json`, `.js`, `.html`,
 * `.css`, `.xml`, `.csv` (read directly as UTF-8).
 *
 * This function never throws: any failure (including a non-string
 * `fileExtension`) is logged and reported through the returned object.
 *
 * @param {string} filePath - Path of the file to read.
 * @param {string} fileExtension - Extension including the leading dot; matched
 *   case-insensitively.
 * @returns {Promise<Object>} On success:
 *   `{ success: true, text, extractedLength, method }` plus `pages` for PDFs
 *   and `sheets` for workbooks. On failure:
 *   `{ success: false, error, text: null }`.
 */
async function extractTextFromDocument(filePath, fileExtension) {
  try {
    const extension = fileExtension.toLowerCase();

    // Each case body is wrapped in braces so its `const`/`let` declarations
    // are scoped to that case instead of the whole switch.
    switch (extension) {
      case '.docx': {
        // Extract text from Word documents
        const docxBuffer = await fs.readFile(filePath);
        const docxResult = await mammoth.extractRawText({ buffer: docxBuffer });
        return {
          success: true,
          text: docxResult.value,
          extractedLength: docxResult.value.length,
          method: 'mammoth'
        };
      }

      case '.pdf': {
        // Extract text from PDF documents
        const pdfBuffer = await fs.readFile(filePath);
        const pdfResult = await pdfParse(pdfBuffer);
        return {
          success: true,
          text: pdfResult.text,
          extractedLength: pdfResult.text.length,
          method: 'pdf-parse',
          pages: pdfResult.numpages
        };
      }

      case '.xlsx':
      case '.xls': {
        // Extract text from Excel files using ExcelJS.
        // NOTE(review): the xlsx reader is used for both extensions; a legacy
        // binary .xls file is expected to fail here and be reported through
        // the catch block below — confirm whether .xls should be rejected
        // up front with a clearer message.
        const workbook = new ExcelJS.Workbook();
        await workbook.xlsx.readFile(filePath);
        let excelText = '';
        let sheetCount = 0;

        workbook.eachSheet((worksheet) => {
          sheetCount++;
          excelText += `Sheet: ${worksheet.name}\n`;

          worksheet.eachRow((row) => {
            const rowText = [];
            row.eachCell((cell) => {
              const cellText = excelCellValueToText(cell.value);
              if (cellText !== null) {
                rowText.push(cellText);
              }
            });
            if (rowText.length > 0) {
              // One line per row, cells separated by tabs
              excelText += rowText.join('\t') + '\n';
            }
          });
          // Blank line between sheets
          excelText += '\n';
        });

        return {
          success: true,
          text: excelText,
          extractedLength: excelText.length,
          method: 'exceljs',
          sheets: sheetCount
        };
      }

      case '.txt':
      case '.md':
      case '.json':
      case '.js':
      case '.html':
      case '.css':
      case '.xml':
      case '.csv': {
        // Read text files directly
        const textContent = await fs.readFile(filePath, 'utf-8');
        return {
          success: true,
          text: textContent,
          extractedLength: textContent.length,
          method: 'direct'
        };
      }

      default:
        return {
          success: false,
          error: `Unsupported file type: ${extension}`,
          text: null
        };
    }
  } catch (error) {
    console.error('Error extracting text from document:', error);
    return {
      success: false,
      error: error.message,
      text: null
    };
  }
}
|
||||
|
||||
// Document chunking configuration
|
||||
const CHUNK_SIZE = 20000; // Chunk size for FormData uploads (~20 KB). NOTE(review): unit (characters vs. bytes) is not visible in this excerpt — confirm at the use site.
|
||||
@@ -1017,9 +1127,20 @@ app.get('/api/files/:fileId/preview', requireAuth, async (req, res) => {
|
||||
return res.status(404).json({ success: false, error: 'File not found' });
|
||||
}
|
||||
|
||||
// Read file content
|
||||
const filePath = path.join(__dirname, file.path);
|
||||
const fileContent = await fs.readFile(filePath, 'utf-8');
|
||||
const fileExtension = path.extname(file.originalName).toLowerCase();
|
||||
|
||||
// Try to extract text from the document
|
||||
const extractionResult = await extractTextFromDocument(filePath, fileExtension);
|
||||
|
||||
if (extractionResult.success) {
|
||||
// Successfully extracted text
|
||||
const extractedText = extractionResult.text;
|
||||
|
||||
// Limit preview to first 5000 characters to avoid huge responses
|
||||
const previewContent = extractedText.length > 5000 ?
|
||||
extractedText.substring(0, 5000) + '\n\n... (content truncated for preview)' :
|
||||
extractedText;
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
@@ -1028,9 +1149,72 @@ app.get('/api/files/:fileId/preview', requireAuth, async (req, res) => {
|
||||
originalName: file.originalName,
|
||||
size: file.size,
|
||||
uploadDate: file.uploadDate,
|
||||
content: fileContent
|
||||
content: previewContent,
|
||||
previewType: 'extracted-text',
|
||||
extractionInfo: {
|
||||
method: extractionResult.method,
|
||||
totalLength: extractionResult.extractedLength,
|
||||
pages: extractionResult.pages || null,
|
||||
sheets: extractionResult.sheets || null,
|
||||
truncated: extractedText.length > 5000
|
||||
},
|
||||
message: `Text successfully extracted from ${fileExtension.toUpperCase()} file. ${extractedText.length > 5000 ? 'Preview truncated to first 5000 characters.' : ''}`
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// Failed to extract text, fall back to file type detection
|
||||
const textFormats = ['.txt', '.md', '.json', '.js', '.html', '.css', '.xml', '.csv'];
|
||||
const binaryFormats = ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt'];
|
||||
|
||||
if (textFormats.includes(fileExtension)) {
|
||||
// Try reading as plain text (should have been handled by extraction, but fallback)
|
||||
try {
|
||||
const fileContent = await fs.readFile(filePath, 'utf-8');
|
||||
const previewContent = fileContent.length > 5000 ?
|
||||
fileContent.substring(0, 5000) + '\n\n... (truncated)' :
|
||||
fileContent;
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
file: {
|
||||
id: file.id,
|
||||
originalName: file.originalName,
|
||||
size: file.size,
|
||||
uploadDate: file.uploadDate,
|
||||
content: previewContent,
|
||||
previewType: 'text'
|
||||
}
|
||||
});
|
||||
} catch (readError) {
|
||||
res.json({
|
||||
success: true,
|
||||
file: {
|
||||
id: file.id,
|
||||
originalName: file.originalName,
|
||||
size: file.size,
|
||||
uploadDate: file.uploadDate,
|
||||
content: 'File preview not available',
|
||||
previewType: 'error',
|
||||
message: `Error reading ${fileExtension.toUpperCase()} file: ${readError.message}`
|
||||
}
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// Binary format that couldn't be processed
|
||||
res.json({
|
||||
success: true,
|
||||
file: {
|
||||
id: file.id,
|
||||
originalName: file.originalName,
|
||||
size: file.size,
|
||||
uploadDate: file.uploadDate,
|
||||
content: 'Text extraction failed',
|
||||
previewType: 'extraction-failed',
|
||||
message: `Failed to extract text from ${fileExtension.toUpperCase()} file: ${extractionResult.error}. The file has been uploaded and may still be usable for AI processing.`
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error previewing file:', error);
|
||||
res.status(500).json({
|
||||
|
||||
@@ -181,10 +181,8 @@ function previewFile(fileId) {
|
||||
const file = result.file;
|
||||
let content = '';
|
||||
|
||||
// Check file type and format content accordingly
|
||||
const fileExtension = file.originalName.split('.').pop().toLowerCase();
|
||||
|
||||
if (['txt', 'md', 'json', 'js', 'html', 'css', 'py', 'java', 'cpp', 'c'].includes(fileExtension)) {
|
||||
// Handle different preview types
|
||||
if (file.previewType === 'text') {
|
||||
// Text-based files - show with syntax highlighting
|
||||
content = `
|
||||
<div class="mb-3">
|
||||
@@ -198,8 +196,35 @@ function previewFile(fileId) {
|
||||
<pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;"><code>${escapeHtml(file.content)}</code></pre>
|
||||
</div>
|
||||
`;
|
||||
} else if (['pdf', 'doc', 'docx'].includes(fileExtension)) {
|
||||
// Document files - show basic info and content preview
|
||||
} else if (file.previewType === 'extracted-text') {
|
||||
// Successfully extracted text from document
|
||||
const extractionInfo = file.extractionInfo || {};
|
||||
const infoText = [];
|
||||
|
||||
if (extractionInfo.pages) infoText.push(`${extractionInfo.pages} pages`);
|
||||
if (extractionInfo.sheets) infoText.push(`${extractionInfo.sheets} sheets`);
|
||||
if (extractionInfo.totalLength) infoText.push(`${extractionInfo.totalLength} characters extracted`);
|
||||
|
||||
content = `
|
||||
<div class="mb-3">
|
||||
<h6><i class="fas fa-file-word me-2"></i>${file.originalName}</h6>
|
||||
<small class="text-muted">
|
||||
Size: ${Math.round(file.size / 1024)} KB |
|
||||
Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
|
||||
${infoText.length > 0 ? ' | ' + infoText.join(', ') : ''}
|
||||
</small>
|
||||
</div>
|
||||
<div class="alert alert-success">
|
||||
<i class="fas fa-check-circle me-2"></i>
|
||||
${file.message || 'Text successfully extracted from document'}
|
||||
</div>
|
||||
<div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
|
||||
<pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content)}</pre>
|
||||
</div>
|
||||
${extractionInfo.truncated ? '<small class="text-muted mt-2 d-block"><i class="fas fa-info-circle me-1"></i>Full document content is available for AI processing</small>' : ''}
|
||||
`;
|
||||
} else if (file.previewType === 'binary') {
|
||||
// Binary files - show info message
|
||||
content = `
|
||||
<div class="mb-3">
|
||||
<h6><i class="fas fa-file-pdf me-2"></i>${file.originalName}</h6>
|
||||
@@ -210,14 +235,47 @@ function previewFile(fileId) {
|
||||
</div>
|
||||
<div class="alert alert-info">
|
||||
<i class="fas fa-info-circle me-2"></i>
|
||||
Document preview: First few lines of extracted text
|
||||
${file.message || 'This is a binary file that has been processed for AI use.'}
|
||||
</div>
|
||||
${file.content && file.content !== 'File preview not available' ? `
|
||||
<div class="border rounded p-3" style="background-color: #f8f9fa; max-height: 400px; overflow-y: auto;">
|
||||
<pre style="margin: 0; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content.substring(0, 1000))}${file.content.length > 1000 ? '...' : ''}</pre>
|
||||
<small class="text-muted">Extracted content preview:</small>
|
||||
<pre style="margin: 0; margin-top: 10px; white-space: pre-wrap; word-wrap: break-word;">${escapeHtml(file.content)}</pre>
|
||||
</div>
|
||||
` : ''}
|
||||
`;
|
||||
} else if (file.previewType === 'extraction-failed') {
|
||||
// Failed to extract text
|
||||
content = `
|
||||
<div class="mb-3">
|
||||
<h6><i class="fas fa-file-exclamation me-2"></i>${file.originalName}</h6>
|
||||
<small class="text-muted">
|
||||
Size: ${Math.round(file.size / 1024)} KB |
|
||||
Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
|
||||
</small>
|
||||
</div>
|
||||
<div class="alert alert-warning">
|
||||
<i class="fas fa-exclamation-triangle me-2"></i>
|
||||
${file.message || 'Could not extract text from this file.'}
|
||||
</div>
|
||||
`;
|
||||
} else if (file.previewType === 'error') {
|
||||
// Error reading file
|
||||
content = `
|
||||
<div class="mb-3">
|
||||
<h6><i class="fas fa-file-times me-2"></i>${file.originalName}</h6>
|
||||
<small class="text-muted">
|
||||
Size: ${Math.round(file.size / 1024)} KB |
|
||||
Uploaded: ${new Date(file.uploadDate).toLocaleDateString()}
|
||||
</small>
|
||||
</div>
|
||||
<div class="alert alert-danger">
|
||||
<i class="fas fa-exclamation-circle me-2"></i>
|
||||
${file.message || 'Error reading file.'}
|
||||
</div>
|
||||
`;
|
||||
} else {
|
||||
// Other files - show basic info
|
||||
// Fallback for unknown preview types
|
||||
content = `
|
||||
<div class="mb-3">
|
||||
<h6><i class="fas fa-file me-2"></i>${file.originalName}</h6>
|
||||
|
||||
Reference in New Issue
Block a user