Coverage for apps/inners/use_cases/utilities/annotater.py: 27%

44 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-09-22 19:03 +0000

1import base64 

2import hashlib 

3import os 

4import re 

5from pathlib import Path 

6from typing import List 

7 

8from txtmarker.factory import Factory 

9 

10from apps.outers.settings.temp_datastore_setting import TempDatastoreSetting 

11 

12 

class Annotater:
    """Highlight labelled passages in a PDF using txtmarker.

    The PDF bytes are staged in the configured temporary datastore directory,
    annotated on disk, read back and returned base64-encoded.
    """

    # Patterns replaced by a single space in ``formatter``. They follow the
    # data cleansing logic in paperetl.
    # Reference: https://github.com/neuml/paperetl/blob/master/src/python/paperetl/text.py
    _CLEAN_PATTERNS = (
        # Emails
        r"\w+@\w+(\.[a-z]{2,})+",
        # URLs
        r"http(s)?\:\/\/\S+",
        # Single characters repeated at least 3 times (ex. j o u r n a l)
        r"(^|\s)(\w\s+){3,}",
        # Citation references (ex. [3] [4] [5])
        r"(\[\d+\]\,?\s?){3,}(\.|\,)?",
        # Citation references (ex. [3, 4, 5])
        r"\[[\d\,\s]+\]",
        # Citation references (ex. (NUM1)) repeated at least 3 times with whitespace
        r"(\(\d+\)\s){3,}",
    )

    # Compiled once at class creation instead of on every ``formatter`` call.
    _CLEAN_REGEX = re.compile("|".join(f"({p})" for p in _CLEAN_PATTERNS))
    _SPACING_REGEX = re.compile(r" {2,}|\.{2,}")
    _NON_ALPHANUMERIC_REGEX = re.compile(r"[^A-Za-z0-9]")

    def __init__(
            self,
            temp_datastore_setting: "TempDatastoreSetting"
    ):
        # Only ``TEMP_DATASTORE_PATH`` is read from the setting object.
        self.temp_datastore_setting: "TempDatastoreSetting" = temp_datastore_setting

    def _temp_file_paths(self, input_file_data: bytes):
        """Return the ``(input_path, output_path)`` pair of temporary PDF paths.

        Both file names embed the MD5 hex digest of ``input_file_data`` and both
        paths live directly inside ``TEMP_DATASTORE_PATH``.
        """
        digest = hashlib.md5(input_file_data).hexdigest()
        base_path = self.temp_datastore_setting.TEMP_DATASTORE_PATH

        # The file names must stay relative: joining an absolute path (one with
        # a leading "/") makes pathlib discard ``base_path`` entirely.
        input_file_path: Path = base_path / Path(f"annotater_input_{digest}.pdf")
        output_file_path: Path = base_path / Path(f"annotater_output_{digest}.pdf")

        return input_file_path, output_file_path

    def annotate(
            self,
            labels: List[str],
            documents: List[str],
            input_file_data: bytes,
    ) -> bytes:
        """Annotate a PDF and return the result base64-encoded.

        Args:
            labels: labels for the highlights, paired positionally with ``documents``
            documents: texts to highlight, paired positionally with ``labels``
            input_file_data: raw bytes of the PDF to annotate

        Returns:
            base64-encoded bytes of the annotated PDF
        """
        input_file_path, output_file_path = self._temp_file_paths(input_file_data)

        # One (label, document) tuple per highlight; zip stops at the shorter list.
        highlights = list(zip(labels, documents))

        try:
            input_file_path.write_bytes(input_file_data)

            highlighter = Factory.create(
                extension="pdf",
                formatter=self.formatter,
                chunk=4
            )

            highlighter.highlight(
                infile=str(input_file_path),
                outfile=str(output_file_path),
                highlights=highlights
            )

            return base64.b64encode(output_file_path.read_bytes())
        finally:
            # Always remove the temporary files, even when highlighting fails.
            # The output file may not exist yet in that case.
            input_file_path.unlink(missing_ok=True)
            output_file_path.unlink(missing_ok=True)

    def formatter(self, text):
        """
        Custom formatter that is passed to PDF Annotation method. This logic maps data cleansing logic in paperetl.

        Reference: https://github.com/neuml/paperetl/blob/master/src/python/paperetl/text.py

        Args:
            text: input text

        Returns:
            clean text
        """

        # Replace emails, urls, spaced-out characters and citation references
        text = Annotater._CLEAN_REGEX.sub(" ", text)

        # Remove extra spacing either caused by replacements or already in text
        text = Annotater._SPACING_REGEX.sub(" ", text)

        # Limit to alphanumeric characters
        return Annotater._NON_ALPHANUMERIC_REGEX.sub("", text)

109 

110 return text