// paperlessmanager/paperless-backend/src/preprocessing/document-pipeline.service.ts

import { Injectable, Logger } from '@nestjs/common';
import { InjectRepository } from '@nestjs/typeorm';
import { Repository } from 'typeorm';
import { ConfigService } from '@nestjs/config';
import * as fs from 'fs/promises';
import * as path from 'path';
import { v4 as uuidv4 } from 'uuid';
import { Task } from '../database/entities/task.entity';
import { PdfService } from './pdf.service';
import { QrCodeService } from './qr-code.service';
import { OcrService } from './ocr.service';
/**
 * Orchestrates the intake of a newly scanned document: converts it to page
 * images, reads a QR code and runs OCR on the first page, creates the
 * corresponding `Task` row, and finally moves the original file into the
 * archive directory.
 */
@Injectable()
export class DocumentPipelineService {
  private readonly logger = new Logger(DocumentPipelineService.name);

  /** Directory that receives the original files after successful processing. */
  private readonly archiveDir: string;

  constructor(
    @InjectRepository(Task) private readonly taskRepo: Repository<Task>,
    private readonly pdfService: PdfService,
    private readonly qrCodeService: QrCodeService,
    private readonly ocrService: OcrService,
    private readonly configService: ConfigService,
  ) {
    // Second argument is the default used when SCANNER_ARCHIVE_DIR is not configured.
    this.archiveDir = this.configService.get<string>(
      'SCANNER_ARCHIVE_DIR',
      '/data/scanner/_processed_archive',
    );
  }

  /**
   * Processes a new document:
   * 1. PDF → images
   * 2. QR code detection on page 1
   * 3. OCR on page 1 (per the original note: via Ollama Vision)
   * 4. Create the task in the DB (inbox entry)
   * 5. Move the original into the archive (GoBD)
   *
   * NOTE(review): the task is saved before the file is archived. If archiving
   * fails, the task row already exists while the file is still at `filePath`;
   * confirm that callers do not re-process such files.
   *
   * @param filePath Path of the document to process.
   * @returns The persisted task entity.
   * @throws Error when the PDF conversion yields no pages, when the latest
   *   stored receipt number cannot be parsed, or when any pipeline step fails.
   */
  async processDocument(filePath: string): Promise<Task> {
    const taskId = uuidv4();
    const fileName = path.basename(filePath);
    this.logger.log(`Pipeline startet: ${fileName} (${taskId})`);

    let images: string[] = [];
    try {
      // 1. PDF → image(s). The meaning of 200 is not visible here — presumably DPI; TODO confirm.
      images = await this.pdfService.pdfToImages(filePath, 200);
      this.logger.log(`${images.length} Seite(n) konvertiert`);

      // Fail with a clear message instead of passing `undefined` to fs.readFile.
      const [firstPagePath] = images;
      if (!firstPagePath) {
        throw new Error(`Keine Seiten aus PDF erzeugt: ${fileName}`);
      }

      // 2. Scan the first page for a QR code.
      const firstPageBuffer = await fs.readFile(firstPagePath);
      const qrResults = await this.qrCodeService.extractFromImage(firstPageBuffer);
      let barcodeData: Record<string, any> | null = null;
      const [firstQr] = qrResults;
      if (firstQr) {
        barcodeData = this.qrCodeService.parseBarcode(firstQr.data);
        if (barcodeData) {
          this.logger.log(`QR-Code erkannt und validiert: ${JSON.stringify(barcodeData)}`);
        }
      }

      // 3. OCR on the first page.
      // NOTE(review): the OCR result is currently not stored anywhere. The call
      // is kept so that an OCR failure still aborts the pipeline as before.
      await this.ocrService.extractTextAsMarkdown(firstPageBuffer);

      // 4. Create the task in the DB.
      const belegnummer = await this.nextBelegnummer(new Date().getFullYear());
      const task = this.taskRepo.create({
        TaskId: taskId,
        InterneBelegnummer: belegnummer,
        Eingangsdatum: new Date(),
        Fertig: 0,
        BarcodeJson: barcodeData ? JSON.stringify(barcodeData) : null,
        DocumentType: barcodeData?.DocumentType ?? null,
        BetriebID: barcodeData?.BetriebID ?? null,
        Lieferant: barcodeData?.Lieferant ?? null,
        externeBelegnummer: barcodeData?.Nummer ?? null,
      });
      await this.taskRepo.save(task);
      this.logger.log(`Task erstellt: ${belegnummer}`);

      // 5. GoBD archiving.
      await this.archiveFile(filePath);
      return task;
    } finally {
      // A failing cleanup must not replace the pipeline's own result or error.
      try {
        await this.pdfService.cleanup(images);
      } catch (cleanupError: unknown) {
        const reason =
          cleanupError instanceof Error ? cleanupError.message : String(cleanupError);
        this.logger.warn(`Cleanup fehlgeschlagen: ${reason}`);
      }
    }
  }

  /**
   * Computes the next internal receipt number (`<year>-<6-digit sequence>`)
   * from the highest number already stored for the given year.
   *
   * NOTE(review): read-then-increment is not atomic; two concurrent pipeline
   * runs could obtain the same number unless the DB enforces uniqueness —
   * confirm against the entity definition.
   *
   * @throws Error when the latest stored number does not match `<digits>-<digits>`,
   *   so that a malformed value is never continued as `...-000NaN`.
   */
  private async nextBelegnummer(year: number): Promise<string> {
    const lastTask = await this.taskRepo
      .createQueryBuilder('t')
      .where('t.InterneBelegnummer LIKE :prefix', { prefix: `${year}-%` })
      .orderBy('t.InterneBelegnummer', 'DESC')
      .getOne();

    let nextNum = 1;
    if (lastTask) {
      const match = /^\d+-(\d+)$/.exec(lastTask.InterneBelegnummer);
      if (!match) {
        throw new Error(
          `Ungültige InterneBelegnummer in der Datenbank: ${lastTask.InterneBelegnummer}`,
        );
      }
      nextNum = parseInt(match[1] as string, 10) + 1;
    }
    return `${year}-${String(nextNum).padStart(6, '0')}`;
  }

  /**
   * Moves the original file into the archive (GoBD-compliant).
   *
   * The target name is `<YYYY-MM-DD>_<original name>`. An already archived file
   * of the same name is never overwritten; a numeric suffix is appended instead.
   * If the archive is on a different filesystem (`EXDEV`), the file is copied
   * and the source removed afterwards.
   */
  private async archiveFile(filePath: string): Promise<void> {
    await fs.mkdir(this.archiveDir, { recursive: true });
    const datePrefix = new Date().toISOString().slice(0, 10);
    const fileName = path.basename(filePath);
    const archivePath = await this.resolveFreeArchivePath(`${datePrefix}_${fileName}`);

    try {
      await fs.rename(filePath, archivePath);
    } catch (err: unknown) {
      const code =
        typeof err === 'object' && err !== null ? (err as { code?: unknown }).code : undefined;
      if (code !== 'EXDEV') {
        throw err;
      }
      // rename() cannot cross filesystem boundaries; copy first, then delete the source.
      await fs.copyFile(filePath, archivePath);
      await fs.unlink(filePath);
    }
    this.logger.log(`Archiviert: ${archivePath}`);
  }

  /**
   * Returns a path inside the archive directory that does not exist yet,
   * starting with `baseName` and then trying `<name>_1<ext>`, `<name>_2<ext>`, …
   */
  private async resolveFreeArchivePath(baseName: string): Promise<string> {
    const { name, ext } = path.parse(baseName);
    let candidate = path.join(this.archiveDir, baseName);
    for (let suffix = 1; await this.pathExists(candidate); suffix++) {
      candidate = path.join(this.archiveDir, `${name}_${suffix}${ext}`);
    }
    return candidate;
  }

  /** Reports whether `target` is accessible; any access error counts as "does not exist". */
  private async pathExists(target: string): Promise<boolean> {
    try {
      await fs.access(target);
      return true;
    } catch {
      return false;
    }
  }
}