import { Page } from 'tesseract.js';
import { OCRResult, OCRResultLine, OCRResultWord } from '../models/OCR';

/**
 * Builds an `OCRResult` from a recognized page, dropping low-confidence words.
 *
 * A word is kept when its confidence is at least `minConfidence`, or when its
 * confidence is exactly 0. Lines left with no words after filtering are
 * omitted from the result entirely.
 *
 * @param page - Recognized page whose `lines` (and their `words`) are read.
 * @param minConfidence - Minimum word confidence required to keep a word.
 * @returns The kept lines, their text joined with newlines, and the mean of
 *   the per-line confidences. When no line survives filtering the overall
 *   confidence is 0 (never `NaN`), the text is empty and `lines` is `[]`.
 */
export const extractLines = (page: Page, minConfidence = 20): OCRResult => {
  const lines: OCRResultLine[] = [];

  for (const line of page.lines) {
    const words: OCRResultWord[] = line.words
      // NOTE(review): words with confidence exactly 0 bypass the threshold —
      // presumably 0 means "not scored" rather than "certainly wrong"; confirm.
      .filter(word => word.confidence >= minConfidence || word.confidence === 0)
      .map(word => ({
        text: word.text,
        confidence: word.confidence,
        baseline: word.baseline,
        bbox: word.bbox,
      }));

    // Skip lines where every word was filtered out; this also guarantees a
    // non-zero divisor for the per-line average below.
    if (words.length === 0) {
      continue;
    }

    // `|| 0` makes a missing or NaN word confidence count as 0 in the sum.
    const wordConfidenceSum = words.reduce((sum, word) => sum + (word.confidence || 0), 0);

    lines.push({
      baseline: line.baseline,
      bbox: line.bbox,
      // Line text is rebuilt from the kept words only.
      text: words.map(word => word.text).join(' '),
      confidence: wordConfidenceSum / words.length,
      words,
    });
  }

  const lineConfidenceSum = lines.reduce((sum, line) => sum + (line.confidence || 0), 0);

  return {
    text: lines.map(line => line.text).join('\n'),
    // Guard against 0 / 0 when the page has no lines or all were filtered out.
    confidence: lines.length > 0 ? lineConfidenceSum / lines.length : 0,
    lines,
  };
};
