Coverage for apps/inners/use_cases/document_processor/summary_document_processor.py: 37%
27 statements
from typing import List

from langchain_community.chat_models import ChatLiteLLM
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSerializable
from unstructured.documents.elements import Table, Image

class SummaryDocumentProcessor:
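    """Builds retrieval-optimized text summaries for tables and images extracted by unstructured."""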
    async def summarize_tables(self, tables: List[Table], llm_model: ChatLiteLLM) -> List[str]:
        prompt: PromptTemplate = PromptTemplate(
            template="""
            <instruction>
            Give a concise passage summary of the table that is well optimized for retrieval. This summary will be embedded and used to retrieve the table. Ensure the output does not re-explain the instruction.
            </instruction>
            <table>
            {table}
            </table>
            """,
            input_variables=["table"]
        )
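
        # Build one single-message conversation per table so all summaries
        # come from a single batched call to the model.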
        batch_messages: List[List[BaseMessage]] = []
        for table in tables:
            text: str = prompt.format(
                table=table.text
            )
            messages: List[BaseMessage] = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": text
                        }
                    ]
                )
            ]
            batch_messages.append(messages)
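
        # Piping the chat model into StrOutputParser yields plain strings
        # rather than message objects for each batch item.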
        chain: RunnableSerializable = llm_model | StrOutputParser()
        generated_summaries: List[str] = await chain.abatch(
            inputs=batch_messages
        )

        return generated_summaries

    async def summarize_images(self, images: List[Image], llm_model: ChatLiteLLM) -> List[str]:
        prompt = """Instruction: Give a concise passage summary of the image that is well optimized for retrieval. This summary will be embedded and used to retrieve the image. Ensure the output does not re-explain the instruction.
        Image:"""
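        # Unlike tables, each image is attached as a base64 data URL next to
        # the text instruction, using the multimodal message content format.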
        batch_messages: List[List[BaseMessage]] = []
        for image in images:
            messages: List[BaseMessage] = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{image.metadata.image_mime_type};base64,{image.metadata.image_base64}",
                            }
                        }
                    ]
                )
            ]
            batch_messages.append(messages)

        chain: RunnableSerializable = llm_model | StrOutputParser()
        generated_summaries: List[str] = await chain.abatch(
            inputs=batch_messages
        )

        return generated_summaries
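

# Minimal usage sketch, not part of the covered module: it assumes a PDF
# partitioned with unstructured's partition_pdf and a litellm-compatible
# model name; "report.pdf" and "gpt-4o-mini" are hypothetical placeholders.
if __name__ == "__main__":
    import asyncio

    from unstructured.partition.pdf import partition_pdf

    async def main() -> None:
        # extract_image_block_to_payload=True keeps each image's base64
        # payload on element.metadata, which summarize_images reads.
        elements = partition_pdf(
            filename="report.pdf",
            infer_table_structure=True,
            extract_image_block_types=["Image"],
            extract_image_block_to_payload=True,
        )
        tables: List[Table] = [e for e in elements if isinstance(e, Table)]
        images: List[Image] = [e for e in elements if isinstance(e, Image)]

        processor = SummaryDocumentProcessor()
        llm_model = ChatLiteLLM(model="gpt-4o-mini")
        table_summaries = await processor.summarize_tables(tables=tables, llm_model=llm_model)
        image_summaries = await processor.summarize_images(images=images, llm_model=llm_model)
        print(table_summaries, image_summaries)

    asyncio.run(main())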