<?php

/**
 * Extracts text from the leading pages of a PDF by rasterising them with the
 * external `pdftoppm` tool and running the external `tesseract` OCR tool on
 * each resulting PNG.
 *
 * Both tools are invoked through the shell via exec(), so they must be
 * available on the PATH of the PHP process.
 */
class PdfOcrHelper
{
    /** PNG files smaller than this many bytes are treated as blank or corrupt and skipped. */
    private const MIN_PNG_BYTES = 2000;

    /** How many random directory names to try before giving up on creating a work directory. */
    private const TMP_DIR_ATTEMPTS = 5;

    /**
     * Runs OCR over the first pages of a PDF and returns the recognised text.
     *
     * Failures of the external tools are written to the error log and result
     * in null (or in the affected page being skipped); they are never raised
     * to the caller.
     *
     * @param string $absFile  Path of the PDF to process.
     * @param int    $maxPages Number of leading pages to rasterise and OCR (safety cap, default 3).
     * @param string $lang     Language code passed to tesseract's -l option (default German).
     *
     * @return string|null Trimmed text of all successfully recognised pages joined by
     *                     newlines, or null if the input is not a regular file, a tool
     *                     failed, or no text was recognised.
     */
    public static function extractOcr(string $absFile, int $maxPages = 3, string $lang = 'deu'): ?string
    {
        // is_file() rather than file_exists(): a directory must not be handed to pdftoppm.
        if ($maxPages < 1 || !is_file($absFile)) {
            return null;
        }

        $tmpDir = self::createTempDir();
        if ($tmpDir === null) {
            error_log("OCR ERROR: could not create temp dir for ".$absFile);
            return null;
        }

        try {
            return self::ocrPages($absFile, $tmpDir, $maxPages, $lang);
        } finally {
            // Runs on every exit path, so failed conversions no longer leak the directory.
            self::removeDir($tmpDir);
        }
    }

    /**
     * Creates a fresh, private work directory below the system temp directory.
     *
     * A random name per call keeps concurrent runs on the same PDF apart and
     * guarantees that no files from an earlier run are picked up.
     *
     * @return string|null Path of the new directory, or null if it could not be created.
     */
    private static function createTempDir(): ?string
    {
        for ($attempt = 0; $attempt < self::TMP_DIR_ATTEMPTS; $attempt++) {
            $dir = sys_get_temp_dir() . '/ocr_' . bin2hex(random_bytes(8));
            // mkdir() fails when the path already exists, so success means the directory is new.
            if (@mkdir($dir, 0700)) {
                return $dir;
            }
        }

        return null;
    }

    /**
     * Converts the PDF pages to PNG inside $tmpDir and OCRs each page image.
     *
     * @return string|null Combined page text, or null on tool failure / no text.
     */
    private static function ocrPages(string $absFile, string $tmpDir, int $maxPages, string $lang): ?string
    {
        // A path beginning with "-" would be parsed as a command-line option; anchor it as a path.
        $pdfArg = $absFile[0] === '-' ? './' . $absFile : $absFile;

        // ----- PDF -> PNG (only the first $maxPages pages, as a safety cap) -----
        $cmd = "pdftoppm -png -f 1 -l " . $maxPages . " "
            . escapeshellarg($pdfArg) . " " . escapeshellarg($tmpDir . '/page');
        $output = [];
        exec($cmd, $output, $code);

        if ($code !== 0) {
            error_log("OCR ERROR: pdftoppm failed ($code) on ".$absFile);
            return null;
        }

        $pngFiles = glob($tmpDir . '/*.png');
        if (!$pngFiles) {
            error_log("OCR WARN: no PNG pages generated for ".$absFile);
            return null;
        }

        $pages = [];

        foreach ($pngFiles as $png) {
            // Skip page images that are unreadable, empty or too small to hold real content.
            $size = @filesize($png);
            if ($size === false || $size < self::MIN_PNG_BYTES) {
                continue;
            }

            // tesseract appends ".txt" to this base name for its output file.
            $outputBase = $png . "_ocr";

            // ----- Run OCR on this page -----
            $cmd = "tesseract " . escapeshellarg($png) . " " . escapeshellarg($outputBase)
                . " -l " . escapeshellarg($lang) . " 2>/dev/null";
            // exec() appends to an existing array, so reset it for every call.
            $output = [];
            exec($cmd, $output, $code);

            // A tesseract failure only costs this page; the remaining pages are still processed.
            if ($code !== 0) {
                error_log("OCR ERROR: tesseract failed ($code) on ".$png);
                continue;
            }

            $pageText = @file_get_contents($outputBase . ".txt");
            if ($pageText !== false && trim($pageText) !== '') {
                $pages[] = $pageText;
            }
        }

        $fullText = trim(implode("\n", $pages));

        return $fullText !== '' ? $fullText : null;
    }

    /**
     * Best-effort removal of the work directory and every file directly inside it.
     */
    private static function removeDir(string $dir): void
    {
        $entries = @scandir($dir);
        if ($entries !== false) {
            foreach ($entries as $entry) {
                if ($entry !== '.' && $entry !== '..') {
                    @unlink($dir . '/' . $entry);
                }
            }
        }

        @rmdir($dir);
    }
}
