Initial commit with Email Import Wizard and Task Processor updates

This commit is contained in:
2026-05-04 08:02:11 +02:00
commit effdc5d59f
170 changed files with 67739 additions and 0 deletions
@@ -0,0 +1,118 @@
import { Injectable, Logger } from '@nestjs/common';
import { InjectRepository } from '@nestjs/typeorm';
import { Repository } from 'typeorm';
import { ConfigService } from '@nestjs/config';
import * as fs from 'fs/promises';
import * as path from 'path';
import { v4 as uuidv4 } from 'uuid';
import { Task } from '../database/entities/task.entity';
import { PdfService } from './pdf.service';
import { QrCodeService } from './qr-code.service';
import { OcrService } from './ocr.service';
/**
 * Orchestrates the intake of a scanned PDF: renders it to images, looks for a
 * QR code and runs OCR on the first page, creates the inbox `Task` row and
 * finally moves the original file into the archive directory.
 */
@Injectable()
export class DocumentPipelineService {
  private readonly logger = new Logger(DocumentPipelineService.name);
  /** Target directory for processed originals (config key `SCANNER_ARCHIVE_DIR`). */
  private readonly archiveDir: string;

  constructor(
    @InjectRepository(Task) private readonly taskRepo: Repository<Task>,
    private readonly pdfService: PdfService,
    private readonly qrCodeService: QrCodeService,
    private readonly ocrService: OcrService,
    private readonly configService: ConfigService,
  ) {
    this.archiveDir = this.configService.get<string>(
      'SCANNER_ARCHIVE_DIR',
      '/data/scanner/_processed_archive',
    );
  }

  /**
   * Processes a newly arrived document:
   * 1. PDF → images
   * 2. QR code detection on page 1
   * 3. OCR via Ollama Vision on page 1
   * 4. Create the task in the DB (inbox entry)
   * 5. Move the original into the archive (GoBD)
   *
   * Temporary page images are always removed, whether the pipeline succeeds or not.
   *
   * NOTE(review): the task row is saved before the file is archived, so a failing
   * archive step leaves a task whose original is still in the inbox — confirm how
   * the caller handles a retry.
   *
   * @param filePath Path of the PDF to process.
   * @returns The persisted task entity.
   * @throws If rendering, QR detection, OCR, the DB write or archiving fails, or
   *   if the PDF yields no page image.
   */
  async processDocument(filePath: string): Promise<Task> {
    const taskId = uuidv4();
    const fileName = path.basename(filePath);
    this.logger.log(`Pipeline startet: ${fileName} (${taskId})`);
    let images: string[] = [];
    try {
      // 1. PDF → image(s)
      images = await this.pdfService.pdfToImages(filePath, 200);
      this.logger.log(`${images.length} Seite(n) konvertiert`);

      // Fail with a clear message instead of passing `undefined` to readFile.
      const [firstPagePath] = images;
      if (firstPagePath === undefined) {
        throw new Error(`Keine Seiten aus PDF konvertiert: ${fileName}`);
      }

      // 2. Scan the first page for QR codes
      const firstPageBuffer = await fs.readFile(firstPagePath);
      const barcodeData = await this.readBarcode(firstPageBuffer);

      // 3. OCR on the first page. A failure here still aborts the pipeline.
      // NOTE(review): the returned markdown is not persisted anywhere in this
      // method — confirm whether a Task column should receive it.
      await this.ocrService.extractTextAsMarkdown(firstPageBuffer);

      // 4. Create the task in the DB
      const belegnummer = await this.nextBelegnummer();
      const task = this.taskRepo.create({
        TaskId: taskId,
        InterneBelegnummer: belegnummer,
        Eingangsdatum: new Date(),
        Fertig: 0,
        BarcodeJson: barcodeData ? JSON.stringify(barcodeData) : null,
        DocumentType: barcodeData?.DocumentType ?? null,
        BetriebID: barcodeData?.BetriebID ?? null,
        Lieferant: barcodeData?.Lieferant ?? null,
        externeBelegnummer: barcodeData?.Nummer ?? null,
      });
      await this.taskRepo.save(task);
      this.logger.log(`Task erstellt: ${belegnummer}`);

      // 5. Archive the original
      await this.archiveFile(filePath);
      return task;
    } finally {
      await this.pdfService.cleanup(images);
    }
  }

  /**
   * Returns the first QR payload on the page that passes `parseBarcode`, or
   * `null` when none does. All detected codes are tried, so an unrelated QR
   * code that happens to be detected first does not hide a valid one.
   */
  private async readBarcode(
    pageImage: Buffer,
  ): Promise<ReturnType<QrCodeService['parseBarcode']>> {
    const qrResults = await this.qrCodeService.extractFromImage(pageImage);
    for (const qrResult of qrResults) {
      const barcodeData = this.qrCodeService.parseBarcode(qrResult.data);
      if (barcodeData) {
        this.logger.log(`QR-Code erkannt und validiert: ${JSON.stringify(barcodeData)}`);
        return barcodeData;
      }
    }
    return null;
  }

  /**
   * Computes the next internal document number in the form `YYYY-NNNNNN` by
   * incrementing the highest number stored for the current year.
   *
   * NOTE(review): read-then-insert is not atomic; two concurrent pipelines can
   * compute the same number unless the column has a unique constraint — confirm.
   *
   * @throws If the highest stored number cannot be parsed, rather than
   *   persisting a value such as `YYYY-000NaN`.
   */
  private async nextBelegnummer(): Promise<string> {
    const year = new Date().getFullYear();
    const lastTask = await this.taskRepo
      .createQueryBuilder('t')
      .where('t.InterneBelegnummer LIKE :prefix', { prefix: `${year}-%` })
      .orderBy('t.InterneBelegnummer', 'DESC')
      .getOne();

    let nextNum = 1;
    if (lastTask) {
      const lastNum = parseInt(lastTask.InterneBelegnummer.split('-')[1] ?? '', 10);
      if (Number.isNaN(lastNum)) {
        throw new Error(
          `Ungültige InterneBelegnummer in der Datenbank: ${lastTask.InterneBelegnummer}`,
        );
      }
      nextNum = lastNum + 1;
    }
    return `${year}-${String(nextNum).padStart(6, '0')}`;
  }

  /**
   * Moves the original file into the archive directory.
   *
   * Falls back to copy + delete when the archive is on a different filesystem
   * (`rename` fails with EXDEV there), and never reuses an existing archive
   * name, so an earlier archived document is not overwritten.
   */
  private async archiveFile(filePath: string): Promise<void> {
    await fs.mkdir(this.archiveDir, { recursive: true });
    const archivePath = await this.resolveArchivePath(path.basename(filePath));
    try {
      await fs.rename(filePath, archivePath);
    } catch (error: unknown) {
      const code = error instanceof Error ? (error as NodeJS.ErrnoException).code : undefined;
      if (code !== 'EXDEV') {
        throw error;
      }
      await fs.copyFile(filePath, archivePath);
      await fs.unlink(filePath);
    }
    this.logger.log(`Archiviert: ${archivePath}`);
  }

  /**
   * Builds `<archiveDir>/<YYYY-MM-DD>_<fileName>`; if that name is already
   * taken, a UUID is inserted before the extension to keep both files.
   */
  private async resolveArchivePath(fileName: string): Promise<string> {
    const datePrefix = new Date().toISOString().slice(0, 10);
    const preferred = path.join(this.archiveDir, `${datePrefix}_${fileName}`);
    if (!(await this.pathExists(preferred))) {
      return preferred;
    }
    const { name, ext } = path.parse(fileName);
    return path.join(this.archiveDir, `${datePrefix}_${name}_${uuidv4()}${ext}`);
  }

  /** Reports whether `target` is accessible on disk. */
  private async pathExists(target: string): Promise<boolean> {
    try {
      await fs.access(target);
      return true;
    } catch {
      return false;
    }
  }
}
@@ -0,0 +1,49 @@
import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import axios from 'axios';
/**
 * Thin client for an Ollama vision model that transcribes a page image to Markdown.
 */
@Injectable()
export class OcrService {
  private readonly logger = new Logger(OcrService.name);
  /** Base URL of the Ollama server (config key `OLLAMA_URL`), without trailing slash. */
  private readonly ollamaUrl: string;
  /** Model name sent with every request (config key `OLLAMA_MODEL`). */
  private readonly ollamaModel: string;

  constructor(private readonly configService: ConfigService) {
    // Strip trailing slashes so the endpoint below never contains "//api/generate".
    this.ollamaUrl = this.configService
      .get<string>('OLLAMA_URL', 'http://localhost:11434')
      .replace(/\/+$/, '');
    this.ollamaModel = this.configService.get<string>('OLLAMA_MODEL', 'llava');
  }

  /**
   * Sends an image to Ollama Vision and returns its content as Markdown.
   *
   * @param imageBuffer Encoded image bytes; they are sent base64-encoded.
   * @returns The trimmed `response` text, or an empty string when the reply
   *   carries no textual `response` field.
   * @throws Rethrows the request error after logging it (HTTP failure, timeout
   *   after 120 s, connection problems).
   */
  async extractTextAsMarkdown(imageBuffer: Buffer): Promise<string> {
    const base64Image = imageBuffer.toString('base64');
    const prompt = `Analysiere dieses Dokument und extrahiere den gesamten Text.
Gib den Text als sauberes Markdown zurück, behalte die Struktur bei (Überschriften, Tabellen, Listen).
Antworte nur mit dem extrahierten Markdown-Text, keine Erklärungen.`;
    try {
      const response = await axios.post<{ response?: unknown }>(
        `${this.ollamaUrl}/api/generate`,
        {
          model: this.ollamaModel,
          prompt,
          images: [base64Image],
          stream: false,
          options: {
            temperature: 0.1,
          },
        },
        { timeout: 120000 },
      );
      // Only call trim() on an actual string; anything else counts as "no text".
      const raw = response.data?.response;
      const markdown = typeof raw === 'string' ? raw.trim() : '';
      this.logger.log(`OCR abgeschlossen: ${markdown.length} Zeichen extrahiert`);
      return markdown;
    } catch (error: unknown) {
      // A thrown value is not guaranteed to be an Error instance.
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.error(`Ollama OCR fehlgeschlagen: ${reason}`);
      throw error;
    }
  }
}
@@ -0,0 +1,98 @@
import { Injectable, Logger } from '@nestjs/common';
import { execFile } from 'child_process';
import { promisify } from 'util';
import * as path from 'path';
import * as fs from 'fs/promises';
import * as os from 'os';
// Promise-returning variant of child_process.execFile, so external commands can be awaited.
const execFileAsync = promisify(execFile);
/**
 * Ghostscript (`gs`) based PDF helpers: page rendering, page counting and
 * re-writing a PDF. Every method creates temporary files that the caller is
 * expected to remove, e.g. via `cleanup`.
 */
@Injectable()
export class PdfService {
  private readonly logger = new Logger(PdfService.name);

  /**
   * Converts one PDF page into a PNG image via Ghostscript.
   *
   * @param pdfPath Path of the source PDF.
   * @param page 1-based page number to render.
   * @param dpi Render resolution passed to Ghostscript's `-r` option.
   * @returns Path of the temporary image, inside its own fresh temp directory.
   * @throws If Ghostscript fails; the temp directory is removed first.
   */
  async pdfPageToImage(pdfPath: string, page = 1, dpi = 300): Promise<string> {
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'pdf-'));
    const outputPath = path.join(tmpDir, `page-${page}.png`);
    try {
      await execFileAsync('gs', [
        '-dNOPAUSE',
        '-dBATCH',
        '-dSAFER',
        '-sDEVICE=png16m',
        `-dFirstPage=${page}`,
        `-dLastPage=${page}`,
        `-r${dpi}`,
        `-sOutputFile=${outputPath}`,
        pdfPath,
      ]);
    } catch (error: unknown) {
      // Do not leave the freshly created temp directory behind on failure.
      await this.removeTempDir(tmpDir);
      throw error;
    }
    this.logger.debug(`PDF Seite ${page} konvertiert: ${outputPath}`);
    return outputPath;
  }

  /**
   * Converts all pages of a PDF into images, one Ghostscript run per page.
   *
   * @returns Image paths in page order.
   * @throws If any page fails to render; pages rendered so far are removed,
   *   because the caller never receives their paths.
   */
  async pdfToImages(pdfPath: string, dpi = 200): Promise<string[]> {
    const pageCount = await this.getPageCount(pdfPath);
    const images: string[] = [];
    try {
      for (let page = 1; page <= pageCount; page++) {
        images.push(await this.pdfPageToImage(pdfPath, page, dpi));
      }
    } catch (error: unknown) {
      await this.cleanup(images);
      throw error;
    }
    return images;
  }

  /**
   * Determines the page count of a PDF via Ghostscript.
   *
   * @returns The reported page count, or 1 when the output is not a positive integer.
   */
  async getPageCount(pdfPath: string): Promise<number> {
    // The path is embedded in a PostScript string literal and the interpreter
    // runs with -dNOSAFER, so it must be escaped: an unbalanced ")" in a file
    // name would otherwise end the literal and run the rest as PostScript.
    const psPath = PdfService.toPostScriptString(pdfPath);
    const { stdout } = await execFileAsync('gs', [
      '-q',
      '-dNODISPLAY',
      '-dNOSAFER',
      `-c`,
      `(${psPath}) (r) file runpdfbegin pdfpagecount = quit`,
    ]);
    const pageCount = parseInt(stdout.trim(), 10);
    if (Number.isNaN(pageCount) || pageCount < 1) {
      this.logger.warn(`Seitenanzahl nicht ermittelbar, verwende 1: ${stdout.trim()}`);
      return 1;
    }
    return pageCount;
  }

  /**
   * Re-writes a PDF through Ghostscript's `pdfwrite` device (this decrypts it
   * if only an owner password is set).
   *
   * @returns Path of the rewritten PDF, inside its own fresh temp directory so
   *   concurrent calls cannot collide on the file name.
   * @throws If Ghostscript fails; the temp directory is removed first.
   */
  async sanitizePdf(inputPath: string): Promise<string> {
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'sanitized-'));
    const outputPath = path.join(tmpDir, `sanitized-${Date.now()}.pdf`);
    try {
      await execFileAsync('gs', [
        '-dNOPAUSE',
        '-dBATCH',
        '-dSAFER',
        '-sDEVICE=pdfwrite',
        `-sOutputFile=${outputPath}`,
        inputPath,
      ]);
    } catch (error: unknown) {
      await this.removeTempDir(tmpDir);
      throw error;
    }
    return outputPath;
  }

  /**
   * Removes temporary files and, when it is empty afterwards, the directory
   * that contained each of them. Best effort: failures are ignored.
   */
  async cleanup(imagePaths: readonly string[]): Promise<void> {
    for (const imgPath of imagePaths) {
      try {
        await fs.unlink(imgPath);
      } catch {
        // Ignored: the file may already be gone.
      }
      try {
        // rmdir only succeeds on an empty directory, so other files are safe.
        await fs.rmdir(path.dirname(imgPath));
      } catch {
        // Ignored: directory not empty or already removed.
      }
    }
  }

  /** Escapes a file path for use inside a PostScript `( … )` string literal. */
  private static toPostScriptString(filePath: string): string {
    // On Windows, forward slashes avoid backslash escapes altogether.
    const normalized = path.sep === '\\' ? filePath.replace(/\\/g, '/') : filePath;
    return normalized.replace(/[\\()]/g, '\\$&');
  }

  /** Best-effort recursive removal of a temp directory created by this service. */
  private async removeTempDir(dir: string): Promise<void> {
    try {
      await fs.rm(dir, { recursive: true, force: true });
    } catch {
      // Ignored: the original error is more useful to the caller.
    }
  }
}
@@ -0,0 +1,14 @@
import { Module } from '@nestjs/common';
import { TypeOrmModule } from '@nestjs/typeorm';
import { Task } from '../database/entities/task.entity';
import { QrCodeService } from './qr-code.service';
import { OcrService } from './ocr.service';
import { PdfService } from './pdf.service';
import { DocumentPipelineService } from './document-pipeline.service';
/** Services that this module both registers and makes available to importing modules. */
const PREPROCESSING_SERVICES = [QrCodeService, OcrService, PdfService, DocumentPipelineService];

/**
 * Bundles the document preprocessing services and registers the `Task`
 * repository they depend on.
 */
@Module({
  imports: [TypeOrmModule.forFeature([Task])],
  providers: PREPROCESSING_SERVICES,
  exports: PREPROCESSING_SERVICES,
})
export class PreprocessingModule {}
@@ -0,0 +1,102 @@
import { Injectable, Logger } from '@nestjs/common';
import sharp = require('sharp');
import jsQR from 'jsqr';
/** Axis-aligned bounding box of a detected code, in image coordinates. */
interface QrCodeBoundingBox {
  x: number;
  y: number;
  width: number;
  height: number;
}

/** One decoded QR code: its raw text payload and where it was found. */
export interface QrCodeResult {
  data: string;
  location: QrCodeBoundingBox;
}
/**
 * Detects QR codes in raster images and validates their payload.
 */
@Injectable()
export class QrCodeService {
  private readonly logger = new Logger(QrCodeService.name);

  /**
   * Extracts ALL QR codes from an image buffer (PNG/JPEG).
   * jsQR reports only one code per call, so this works iteratively:
   * find a code → paint its area white → scan again, until nothing is found
   * or the pass limit is reached.
   *
   * @param imageBuffer Encoded image bytes.
   * @returns One entry per distinct payload, in detection order; codes with an
   *   already seen payload are masked but not reported again.
   */
  async extractFromImage(imageBuffer: Buffer): Promise<QrCodeResult[]> {
    const results: QrCodeResult[] = [];
    const seen = new Set<string>();
    let currentBuffer = imageBuffer;
    const MAX_PASSES = 10;
    for (let pass = 0; pass < MAX_PASSES; pass++) {
      const { data, info } = await sharp(currentBuffer)
        .ensureAlpha()
        .raw()
        .toBuffer({ resolveWithObject: true });
      // A Buffer may be a window into a larger ArrayBuffer, so the view has to
      // honour its offset and length instead of spanning the whole backing store.
      const imageData = new Uint8ClampedArray(data.buffer, data.byteOffset, data.byteLength);
      const code = jsQR(imageData, info.width, info.height, {
        inversionAttempts: 'attemptBoth',
      });
      if (!code) break;

      // Bounding box around the four reported corners.
      const corners = [
        code.location.topLeftCorner,
        code.location.topRightCorner,
        code.location.bottomLeftCorner,
        code.location.bottomRightCorner,
      ];
      const xs = corners.map((c) => c.x);
      const ys = corners.map((c) => c.y);
      const minX = Math.floor(Math.min(...xs));
      const minY = Math.floor(Math.min(...ys));
      const maxX = Math.ceil(Math.max(...xs));
      const maxY = Math.ceil(Math.max(...ys));
      const width = Math.max(1, maxX - minX);
      const height = Math.max(1, maxY - minY);

      if (!seen.has(code.data)) {
        seen.add(code.data);
        results.push({
          data: code.data,
          location: { x: minX, y: minY, width, height },
        });
        this.logger.debug(`QR-Code erkannt (Pass ${pass + 1}): ${code.data}`);
      }

      // Cover the detected area with a white rectangle (including padding),
      // clamped to the image, so jsQR finds the next code in the following pass.
      const pad = 12;
      const maskX = Math.max(0, minX - pad);
      const maskY = Math.max(0, minY - pad);
      const maskW = Math.min(info.width - maskX, width + 2 * pad);
      const maskH = Math.min(info.height - maskY, height + 2 * pad);
      const svg = `<svg width="${info.width}" height="${info.height}" xmlns="http://www.w3.org/2000/svg"><rect x="${maskX}" y="${maskY}" width="${maskW}" height="${maskH}" fill="white"/></svg>`;
      currentBuffer = await sharp(currentBuffer)
        .composite([{ input: Buffer.from(svg), top: 0, left: 0 }])
        .png()
        .toBuffer();
    }
    return results;
  }

  /**
   * Checks whether a QR payload is a JSON object carrying at least the `Jahr`
   * and `Nummer` fields. No other fields are checked.
   *
   * @param qrData Raw text decoded from the QR code.
   * @returns The parsed object, or `null` when the payload is not JSON or does
   *   not match the expected shape.
   */
  parseBarcode(qrData: string): Record<string, any> | null {
    let parsed: unknown;
    try {
      parsed = JSON.parse(qrData);
    } catch {
      this.logger.debug(`QR-Code ist kein JSON: ${qrData}`);
      return null;
    }
    // Valid JSON can also be null, a number, a string or an array; only a plain
    // object can carry the expected fields.
    if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
      const candidate = parsed as Record<string, any>;
      if (candidate.Jahr !== undefined && candidate.Nummer !== undefined) {
        return candidate;
      }
    }
    this.logger.warn(`QR-Code-Daten passen nicht zum Schema: ${qrData}`);
    return null;
  }
}