Coverage for apps/inners/use_cases/utilities/annotater.py: 27%
44 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-22 19:03 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-22 19:03 +0000
1import base64
2import hashlib
3import os
4import re
5from pathlib import Path
6from typing import List
8from txtmarker.factory import Factory
10from apps.outers.settings.temp_datastore_setting import TempDatastoreSetting
class Annotater:
    """Highlight labelled passages in a PDF using txtmarker.

    The PDF bytes are staged in the temporary datastore directory, annotated
    on disk, read back, and the staged files are removed afterwards.
    """

    # Extension shared by the staged input and output files.
    _FILE_EXTENSION = ".pdf"

    # Fragments removed from text before it is handed back to the highlighter.
    _NOISE_PATTERNS = (
        # Emails
        r"\w+@\w+(\.[a-z]{2,})+",
        # URLs
        r"http(s)?\:\/\/\S+",
        # Single characters repeated at least 3 times (ex. j o u r n a l)
        r"(^|\s)(\w\s+){3,}",
        # Citation references (ex. [3] [4] [5])
        r"(\[\d+\]\,?\s?){3,}(\.|\,)?",
        # Citation references (ex. [3, 4, 5])
        r"\[[\d\,\s]+\]",
        # Citation references (ex. (1) (2) (3)), each followed by whitespace
        r"(\(\d+\)\s){3,}",
    )

    # Compiled once at class creation instead of on every formatter() call.
    _NOISE_REGEX = re.compile("|".join([f"({p})" for p in _NOISE_PATTERNS]))

    def __init__(
            self,
            temp_datastore_setting: TempDatastoreSetting
    ):
        self.temp_datastore_setting: TempDatastoreSetting = temp_datastore_setting

    def _temp_pdf_path(self, prefix: str, digest: str) -> Path:
        """Return the path of a staged PDF inside the temporary datastore.

        The file name must stay relative: joining an absolute right-hand
        operand (one starting with "/") makes pathlib discard the datastore
        directory entirely.
        """
        file_name = f"{prefix}_{digest}{self._FILE_EXTENSION}"
        return self.temp_datastore_setting.TEMP_DATASTORE_PATH / Path(file_name)

    def annotate(
            self,
            labels: List[str],
            documents: List[str],
            input_file_data: bytes,
    ) -> bytes:
        """Annotate a PDF and return the result as base64-encoded bytes.

        Args:
            labels: highlight labels, paired positionally with ``documents``.
            documents: text passages to highlight; extra items in the longer
                of the two lists are ignored (``zip`` semantics).
            input_file_data: raw bytes of the PDF to annotate.

        Returns:
            The annotated PDF, base64-encoded.

        Raises:
            OSError: if a staged file cannot be written or read.
            Exception: whatever the txtmarker highlighter raises is propagated.
        """
        # Both staged files are keyed by the same content digest.
        digest = hashlib.md5(input_file_data).hexdigest()
        input_file_path = self._temp_pdf_path("annotater_input", digest)
        output_file_path = self._temp_pdf_path("annotater_output", digest)

        highlights = list(zip(labels, documents))

        try:
            with open(input_file_path, "wb") as file:
                file.write(input_file_data)

            highlighter = Factory.create(
                extension="pdf",
                formatter=self.formatter,
                chunk=4
            )
            highlighter.highlight(
                infile=str(input_file_path),
                outfile=str(output_file_path),
                highlights=highlights
            )

            with open(output_file_path, "rb") as file:
                return base64.b64encode(file.read())
        finally:
            # Always clean up, including when highlighting fails part-way and
            # the output file was never produced.
            for staged_path in (input_file_path, output_file_path):
                staged_path.unlink(missing_ok=True)

    def formatter(self, text):
        """
        Custom formatter that is passed to the PDF annotation method. This logic mirrors the data
        cleansing logic in paperetl.

        Reference: https://github.com/neuml/paperetl/blob/master/src/python/paperetl/text.py

        Args:
            text: input text

        Returns:
            clean text containing only the characters A-Z, a-z and 0-9
        """
        # Replace every noise fragment with a single space
        text = self._NOISE_REGEX.sub(" ", text)

        # Collapse runs of spaces or dots, whether caused by replacements or already in text
        text = re.sub(r" {2,}|\.{2,}", " ", text)

        # Limit to alphanumeric characters. This also drops the remaining spaces;
        # presumably the highlighter matches on whitespace-free text (confirm against txtmarker).
        text = re.sub(r"[^A-Za-z0-9]", "", text)

        return text