Source code for pororo.tasks.optical_character_recognition

"""OCR related modeling class"""

from typing import Optional

from pororo.tasks import download_or_load
from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase


class PororoOcrFactory(PororoFactoryBase):
    """
    Recognize optical characters in image file
    Currently supports English and Korean

    English + Korean (`brainocr`)

        - dataset: Internal data + AI hub Font Image dataset
        - metric: TBU
        - ref: https://www.aihub.or.kr/aidata/133

    Examples:
        >>> ocr = Pororo(task="ocr", lang="ko")
        >>> ocr(IMAGE_PATH)
        ["사이렌'(' 신마'", "내가 말했잖아 속지열라고 이 손을 잡는 너는 위협해질 거라고"]

        >>> ocr = Pororo(task="ocr", lang="ko")
        >>> ocr(IMAGE_PATH, detail=True)
        {
            'description': ["사이렌'(' 신마'", "내가 말했잖아 속지열라고 이 손을 잡는 너는 위협해질 거라고"],
            'bounding_poly': [
                {
                    'description': "사이렌'(' 신마'",
                    'vertices': [
                        {'x': 93, 'y': 7},
                        {'x': 164, 'y': 7},
                        {'x': 164, 'y': 21},
                        {'x': 93, 'y': 21}
                    ]
                },
                {
                    'description': "내가 말했잖아 속지열라고 이 손을 잡는 너는 위협해질 거라고",
                    'vertices': [
                        {'x': 0, 'y': 30},
                        {'x': 259, 'y': 30},
                        {'x': 259, 'y': 194},
                        {'x': 0, 'y': 194}
                    ]
                }
            ]
        }
    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        """
        Set up the factory and the names of the auxiliary OCR resources

        Args:
            task (str): task name, forwarded to the base factory
            lang (str): language code, forwarded to the base factory
            model (Optional[str]): model name, forwarded to the base factory

        """
        super().__init__(task, lang, model)
        # Base names of the extra files fetched in `load`:
        # `misc/craft.pt` and `misc/ocr-opt.txt`
        self.detect_model = "craft"
        self.ocr_opt = "ocr-opt"

    @staticmethod
    def get_available_langs():
        """Return the language codes this task accepts"""
        return ["en", "ko"]

    @staticmethod
    def get_available_models():
        """Return the model names available for each language code"""
        return {
            "en": ["brainocr"],
            "ko": ["brainocr"],
        }

    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model, or None when the
                configured model name is not `brainocr`

        Raises:
            ValueError: if the configured language is not one of
                `get_available_langs()`

        """
        # Only `brainocr` is handled here; any other model name yields None
        if self.config.n_model != "brainocr":
            return None

        # Imported lazily so the OCR package is only loaded when requested
        from pororo.models.brainOCR import brainocr

        available_langs = self.get_available_langs()
        if self.config.lang not in available_langs:
            # Single message string so that str(exc) is readable instead of
            # rendering as a tuple of arguments
            raise ValueError(
                f"Unsupported Language : {self.config.lang}. "
                f"Support Languages : {available_langs}")

        # Fetch the three resources the reader is built from: detector
        # checkpoint, recognizer checkpoint and the option file
        det_model_path = download_or_load(
            f"misc/{self.detect_model}.pt",
            self.config.lang,
        )
        rec_model_path = download_or_load(
            f"misc/{self.config.n_model}.pt",
            self.config.lang,
        )
        opt_fp = download_or_load(
            f"misc/{self.ocr_opt}.txt",
            self.config.lang,
        )

        model = brainocr.Reader(
            self.config.lang,
            det_model_ckpt_fp=det_model_path,
            rec_model_ckpt_fp=rec_model_path,
            opt_fp=opt_fp,
            device=device,
        )
        # Move both sub-models to the requested device explicitly
        model.detector.to(device)
        model.recognizer.to(device)
        return PororoOCR(model, self.config)


class PororoOCR(PororoSimpleBase):
    """
    Run the wrapped OCR model on an image and format its output

    Args:
        model: callable OCR model, invoked with the image path in `predict`
        config: task configuration, forwarded to the base class

    """

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(self, ocr_results, detail: bool = False):
        """
        Post-process for OCR result

        Args:
            ocr_results (list): list contains result of OCR; each item holds
                the bounding-box vertices at index 0 and the text at index 1
            detail (bool): if True, returned to include details. (bounding poly, vertices, etc)

        Returns:
            list: recognized texts in reading order, when `detail` is False
            dict: `description` (the same list of texts) and `bounding_poly`
                (one entry per text with its `vertices`), when `detail` is True

        """
        # Order by the first vertex of each box: y first, then x
        ordered = sorted(
            ocr_results,
            key=lambda item: (item[0][0][1], item[0][0][0]),
        )

        # The text is read from index 1 on both paths so they always agree
        texts = [item[1] for item in ordered]
        if not detail:
            return texts

        bounding_poly = [{
            "description": item[1],
            "vertices": [{
                "x": vertex[0],
                "y": vertex[1],
            } for vertex in item[0]],
        } for item in ordered]

        return {
            "description": texts,
            "bounding_poly": bounding_poly,
        }

    def predict(self, image_path: str, **kwargs):
        """
        Conduct Optical Character Recognition (OCR)

        Args:
            image_path (str): the image file path
            detail (bool): if True, returned to include details. (bounding poly, vertices, etc)

        Returns:
            list or dict: the post-processed result, see `_postprocess`

        """
        # `detail` is the only keyword option that is read; it defaults to False
        detail = kwargs.get("detail", False)

        # The model is always called with these fixed options; `_postprocess`
        # needs the bounding boxes, so details are never skipped
        ocr_results = self._model(
            image_path,
            skip_details=False,
            batch_size=1,
            paragraph=True,
        )
        return self._postprocess(ocr_results, detail)