Coverage for apps/inners/use_cases/document_processor/summary_document_processor.py: 37%

27 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-09-22 19:03 +0000

1from typing import List 

2 

3from langchain_community.chat_models import ChatLiteLLM 

4from langchain_core.messages import BaseMessage, HumanMessage 

5from langchain_core.output_parsers import StrOutputParser 

6from langchain_core.prompts import PromptTemplate 

7from langchain_core.runnables import RunnableSerializable 

8from unstructured.documents.elements import Table, Image 

9 

10 

class SummaryDocumentProcessor:
    """Generates retrieval-optimized text summaries for extracted document
    elements (tables and images) via a chat LLM.

    Each element becomes one message batch; all batches are sent through the
    model concurrently with ``abatch`` and parsed to plain strings, so the
    returned summaries line up one-to-one with the input elements.
    """

    async def summarize_tables(self, tables: List[Table], llm_model: ChatLiteLLM) -> List[str]:
        """Summarize each table's text for embedding-based retrieval.

        Args:
            tables: Parsed table elements; only each element's ``.text`` is used.
            llm_model: Chat model that generates the summaries.

        Returns:
            One summary string per input table, in input order.
        """
        # NOTE: closing tags fixed from the malformed self-closing form
        # (`<instruction/>`, `<table/>`) to proper closing tags so the
        # prompt markup is well-formed.
        prompt: PromptTemplate = PromptTemplate(
            template="""
            <instruction>
            Give a concise passage summary of the table that is well optimized for retrieval. These summary will be embedded and used to retrieve the table. Ensure the output does not re-explain the instruction.
            </instruction>
            <table>
            {table}
            </table>
            """,
            input_variables=["table"]
        )

        # One single-message batch per table, carrying the filled-in prompt.
        batch_messages: List[List[BaseMessage]] = [
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": prompt.format(table=table.text)
                        }
                    ]
                )
            ]
            for table in tables
        ]

        return await self._generate_summaries(batch_messages, llm_model)

    async def summarize_images(self, images: List[Image], llm_model: ChatLiteLLM) -> List[str]:
        """Summarize each image for embedding-based retrieval.

        Each image is sent as a multimodal message: the fixed instruction text
        plus the image encoded as a base64 data URL taken from the element's
        metadata (``image_mime_type`` / ``image_base64`` — assumed populated by
        the upstream extractor; TODO confirm against caller).

        Args:
            images: Parsed image elements with base64 payloads in metadata.
            llm_model: Chat model that generates the summaries (must accept
                image_url content parts).

        Returns:
            One summary string per input image, in input order.
        """
        prompt = """Instruction: Give a concise passage summary of the image that is well optimized for retrieval. These summary will be embedded and used to retrieve the image. Ensure the output does not re-explain the instruction.
        Image:"""
        batch_messages: List[List[BaseMessage]] = []
        for image in images:
            messages: List[BaseMessage] = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{image.metadata.image_mime_type};base64,{image.metadata.image_base64}",
                            }
                        }
                    ]
                )
            ]
            batch_messages.append(messages)

        return await self._generate_summaries(batch_messages, llm_model)

    async def _generate_summaries(
        self,
        batch_messages: List[List[BaseMessage]],
        llm_model: ChatLiteLLM
    ) -> List[str]:
        """Run all message batches through the LLM concurrently and parse each
        reply to a plain string (shared tail of both public summarizers)."""
        chain: RunnableSerializable = llm_model | StrOutputParser()
        generated_summaries: List[str] = await chain.abatch(
            inputs=batch_messages
        )
        return generated_summaries