Coverage for apps/inners/use_cases/document_processor/summary_document_processor.py: 37%

27 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-09-22 19:03 +0000

1from typing import List 

2 

3from langchain_community.chat_models import ChatLiteLLM 

4from langchain_core.messages import BaseMessage, HumanMessage 

5from langchain_core.output_parsers import StrOutputParser 

6from langchain_core.prompts import PromptTemplate 

7from langchain_core.runnables import RunnableSerializable 

8from unstructured.documents.elements import Table, Image 

9 

10 

class SummaryDocumentProcessor:
    """Generates retrieval-optimized text summaries for extracted document
    elements (tables and images) via a chat LLM.

    Each element becomes one message batch; all batches are sent through the
    model concurrently with ``abatch`` and parsed to plain strings, so the
    returned summaries line up one-to-one with the input elements.
    """

    async def summarize_tables(self, tables: List[Table], llm_model: ChatLiteLLM) -> List[str]:
        """Summarize each table's text for embedding-based retrieval.

        Args:
            tables: Parsed table elements; only each element's ``.text`` is used.
            llm_model: Chat model that generates the summaries.

        Returns:
            One summary string per input table, in input order.
        """
        # NOTE: closing tags fixed from the malformed self-closing form
        # (`<instruction/>`, `<table/>`) to proper closing tags so the
        # prompt markup is well-formed.
        prompt: PromptTemplate = PromptTemplate(
            template="""
            <instruction>
            Give a concise passage summary of the table that is well optimized for retrieval. These summary will be embedded and used to retrieve the table. Ensure the output does not re-explain the instruction.
            </instruction>
            <table>
            {table}
            </table>
            """,
            input_variables=["table"]
        )

        # One single-message batch per table, carrying the filled-in prompt.
        batch_messages: List[List[BaseMessage]] = [
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": prompt.format(table=table.text)
                        }
                    ]
                )
            ]
            for table in tables
        ]

        return await self._generate_summaries(batch_messages, llm_model)

    async def summarize_images(self, images: List[Image], llm_model: ChatLiteLLM) -> List[str]:
        """Summarize each image for embedding-based retrieval.

        Each image is sent as a multimodal message: the fixed instruction text
        plus the image encoded as a base64 data URL taken from the element's
        metadata (``image_mime_type`` / ``image_base64`` — assumed populated by
        the upstream extractor; TODO confirm against caller).

        Args:
            images: Parsed image elements with base64 payloads in metadata.
            llm_model: Chat model that generates the summaries (must accept
                image_url content parts).

        Returns:
            One summary string per input image, in input order.
        """
        prompt = """Instruction: Give a concise passage summary of the image that is well optimized for retrieval. These summary will be embedded and used to retrieve the image. Ensure the output does not re-explain the instruction.
        Image:"""
        batch_messages: List[List[BaseMessage]] = []
        for image in images:
            messages: List[BaseMessage] = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{image.metadata.image_mime_type};base64,{image.metadata.image_base64}",
                            }
                        }
                    ]
                )
            ]
            batch_messages.append(messages)

        return await self._generate_summaries(batch_messages, llm_model)

    async def _generate_summaries(
        self,
        batch_messages: List[List[BaseMessage]],
        llm_model: ChatLiteLLM
    ) -> List[str]:
        """Run all message batches through the LLM concurrently and parse each
        reply to a plain string (shared tail of both public summarizers)."""
        chain: RunnableSerializable = llm_model | StrOutputParser()
        generated_summaries: List[str] = await chain.abatch(
            inputs=batch_messages
        )
        return generated_summaries