Coverage for apps/inners/use_cases/utilities/annotater.py: 27%

1import base64

2import hashlib

3import os

4import re

5from pathlib import Path

6from typing import List

8from txtmarker.factory import Factory

10from apps.outers.settings.temp_datastore_setting import TempDatastoreSetting

class Annotater:
    """Highlight labelled passages inside a PDF document.

    The highlighter is driven by file paths, so the PDF bytes are round-tripped
    through files placed in ``temp_datastore_setting.TEMP_DATASTORE_PATH``.
    """

    # Noise stripped from text before it is compared; see ``formatter``.
    _NOISE_PATTERNS = (
        # Emails
        r"\w+@\w+(\.[a-z]{2,})+",
        # URLs
        r"http(s)?\:\/\/\S+",
        # Single characters repeated at least 3 times (ex. j o u r n a l)
        r"(^|\s)(\w\s+){3,}",
        # Citation references (ex. [3] [4] [5])
        r"(\[\d+\]\,?\s?){3,}(\.|\,)?",
        # Citation references (ex. [3, 4, 5])
        r"\[[\d\,\s]+\]",
        # Citation references (ex. (NUM1) repeated at least 3 times with whitespace)
        r"(\(\d+\)\s){3,}",
    )

    # Compiled once at class creation instead of on every formatter call.
    _NOISE_REGEX = re.compile("|".join(f"({p})" for p in _NOISE_PATTERNS))

    def __init__(
        self,
        temp_datastore_setting: "TempDatastoreSetting"
    ):
        self.temp_datastore_setting: TempDatastoreSetting = temp_datastore_setting

    def annotate(
        self,
        labels: List[str],
        documents: List[str],
        input_file_data: bytes,
    ) -> bytes:
        """
        Highlight the given documents in a PDF and return the annotated PDF.

        Args:
            labels: label attached to each highlight
            documents: text to highlight; paired positionally with ``labels``
                (pairs are built with ``zip``, so surplus items in the longer
                list are ignored)
            input_file_data: raw bytes of the source PDF

        Returns:
            base64-encoded bytes of the annotated PDF
        """

        input_file_path, output_file_path = self._temp_file_paths(input_file_data)
        highlights = list(zip(labels, documents))

        try:
            with open(input_file_path, "wb") as file:
                file.write(input_file_data)

            highlighter = Factory.create(
                extension="pdf",
                formatter=self.formatter,
                chunk=4
            )
            highlighter.highlight(
                infile=str(input_file_path),
                outfile=str(output_file_path),
                highlights=highlights
            )

            with open(output_file_path, "rb") as file:
                output_file_data = base64.b64encode(file.read())
        finally:
            # Always clean up, including when highlighting fails part-way and
            # one of the two files was never created.
            for temp_file_path in (input_file_path, output_file_path):
                try:
                    os.remove(temp_file_path)
                except FileNotFoundError:
                    pass

        return output_file_data

    def _temp_file_paths(self, input_file_data: bytes):
        """Return the (input, output) temp file paths derived from the data's MD5 digest."""

        digest = hashlib.md5(input_file_data).hexdigest()
        temp_directory = self.temp_datastore_setting.TEMP_DATASTORE_PATH

        # Both names are relative: joining an absolute path (leading "/") would
        # discard the temp directory and target the filesystem root instead.
        input_file_path: Path = temp_directory / Path(f"annotater_input_{digest}.pdf")
        output_file_path: Path = temp_directory / Path(f"annotater_output_{digest}.pdf")

        return input_file_path, output_file_path

    def formatter(self, text):
        """
        Custom formatter that is passed to PDF Annotation method. This logic maps data cleansing logic in paperetl.

        Reference: https://github.com/neuml/paperetl/blob/master/src/python/paperetl/text.py

        Args:
            text: input text

        Returns:
            clean text
        """

        # Replace emails, urls, spaced-out single characters and citation markers
        text = self._NOISE_REGEX.sub(" ", text)

        # Remove extra spacing either caused by replacements or already in text
        text = re.sub(r" {2,}|\.{2,}", " ", text)

        # Limit to alphanumeric characters
        return re.sub(r"[^A-Za-z0-9]", "", text)