From fcca31bc4f6c4b1393c72ec2e17b1bea2adc2a8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20P=C3=B6ttker?=
Date: Mon, 4 May 2026 15:45:19 +0200
Subject: [PATCH] perf: optimize PDF to image conversion by using a single Ghostscript execution and improve cleanup logic

---
 .../src/preprocessing/pdf.service.ts          | 42 ++++++++++++++-----
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/paperless-backend/src/preprocessing/pdf.service.ts b/paperless-backend/src/preprocessing/pdf.service.ts
index a95b2de..3da643d 100644
--- a/paperless-backend/src/preprocessing/pdf.service.ts
+++ b/paperless-backend/src/preprocessing/pdf.service.ts
@@ -37,16 +37,37 @@ export class PdfService {
 
   /**
    * Konvertiert alle Seiten einer PDF in Bilder.
+   * Verwendet einen einzigen Ghostscript-Aufruf mit %d-Platzhalter für alle Seiten.
    */
   async pdfToImages(pdfPath: string, dpi = 200): Promise<string[]> {
     const pageCount = await this.getPageCount(pdfPath);
-    const images: string[] = [];
+    if (pageCount === 0) return [];
 
+    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'pdf-'));
+    const outputPattern = path.join(tmpDir, 'page-%d.png');
+
+    await execFileAsync('gs', [
+      '-dNOPAUSE',
+      '-dBATCH',
+      '-dSAFER',
+      '-sDEVICE=png16m',
+      `-r${dpi}`,
+      `-sOutputFile=${outputPattern}`,
+      pdfPath,
+    ]);
+
+    const images: string[] = [];
     for (let i = 1; i <= pageCount; i++) {
-      const imgPath = await this.pdfPageToImage(pdfPath, i, dpi);
-      images.push(imgPath);
+      const imgPath = path.join(tmpDir, `page-${i}.png`);
+      try {
+        await fs.access(imgPath);
+        images.push(imgPath);
+      } catch {
+        this.logger.warn(`Ghostscript hat Seite ${i} nicht erstellt: ${imgPath}`);
+      }
     }
 
+    this.logger.debug(`PDF konvertiert: ${images.length}/${pageCount} Seite(n) in ${tmpDir}`);
     return images;
   }
 
@@ -82,17 +103,16 @@ export class PdfService {
   }
 
   /**
-   * Räumt temporäre Bilder auf.
+   * Räumt temporäre Bilder und ihre Verzeichnisse auf.
    */
   async cleanup(imagePaths: string[]): Promise<void> {
+    const dirs = new Set<string>();
     for (const imgPath of imagePaths) {
-      try {
-        await fs.unlink(imgPath);
-        const dir = path.dirname(imgPath);
-        await fs.rmdir(dir).catch(() => {});
-      } catch {
-        // Ignorieren
-      }
+      try { await fs.unlink(imgPath); } catch {}
+      dirs.add(path.dirname(imgPath));
+    }
+    for (const dir of dirs) {
+      await fs.rmdir(dir).catch(() => {});
     }
   }
 }