[Note] Option 2 is suitable when multimodal LLMs cannot be used for answer synthesis (e.g., due to cost or other limitations).
Data Loading
Splitting PDF Text and Images
Using partition_pdf provided by Unstructured, you can extract text and images.
To extract images, set:
extract_images_in_pdf=True
To process text only, set:
extract_images_in_pdf=False
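For example, a text-only call could look like this (a minimal sketch; the arguments mirror the extraction settings used below and the file path is illustrative):
from unstructured.partition.pdf import partition_pdf
# Text-only parsing: skip image extraction entirely (sketch)
elements = partition_pdf(
    filename="data/sample.pdf",
    extract_images_in_pdf=False,
    infer_table_structure=True,
    chunking_strategy="by_title",
)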
# file path
fpath = "data/"
fname = "sample.pdf"
import os
from langchain_text_splitters import CharacterTextSplitter
from unstructured.partition.pdf import partition_pdf
# Extracting Elements from a PDF
def extract_pdf_elements(path, fname):
    """
    Extract images, tables, and chunked text from a PDF file.
    path: Directory containing the PDF; extracted images (.jpg) are saved here as well
    fname: File name of the PDF
    """
    return partition_pdf(
        filename=os.path.join(path, fname),
        extract_images_in_pdf=True,  # Enable image extraction from the PDF
        infer_table_structure=True,  # Enable table structure inference
        chunking_strategy="by_title",  # Chunk the text by title
        max_characters=4000,  # Maximum characters per chunk
        new_after_n_chars=3800,  # Start a new chunk after this many characters
        combine_text_under_n_chars=2000,  # Combine chunks smaller than this many characters
        image_output_dir_path=path,  # Output directory for extracted images
    )
# Categorize elements by type
def categorize_elements(raw_pdf_elements):
    """
    Categorize elements extracted from a PDF into tables and text.
    raw_pdf_elements: list of unstructured.documents.elements
    """
    tables = []  # List for table elements
    texts = []  # List for text elements
    for element in raw_pdf_elements:
        if "unstructured.documents.elements.Table" in str(type(element)):
            tables.append(str(element))  # Add table elements
        elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
            texts.append(str(element))  # Add text elements
    return texts, tables
# Extract elements
raw_pdf_elements = extract_pdf_elements(fpath, fname)
# Extract text, tables
texts, tables = categorize_elements(raw_pdf_elements)
# Optional: Enforce a specific token size for text
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000, chunk_overlap=0  # Split text into 4,000-token chunks with no overlap
)
joined_texts = " ".join(texts)  # Combine the text chunks
texts_4k_token = text_splitter.split_text(joined_texts)  # Split the combined text
len(texts_4k_token)
1
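To confirm that each chunk stays within the 4,000-token budget, you can count tokens directly with tiktoken. A minimal sanity-check sketch; cl100k_base (the encoding used by the GPT-4 family) is an assumption and may differ from the splitter's own encoding:
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")  # assumed encoding for this check
print([len(enc.encode(chunk)) for chunk in texts_4k_token])  # token count per chunk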
Multi-Vector Search Engine
Using the multi-vector retriever, you index summaries of images (and, optionally, text and tables) while returning the original images, text, or tables at retrieval time.
Text and Table Summarization
To generate summaries for tables (and, optionally, text), we will use GPT-4.
If you are working with large chunk sizes (e.g., 4k token chunks as set above), text summarization is recommended.
The summaries are used for retrieving the original tables and/or original text chunks.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
# Create summaries of text and table elements
def generate_text_summaries(texts, tables, summarize_texts=False):
    """
    Summarize text and table elements.
    texts: List of strings
    tables: List of strings
    summarize_texts: Whether to also summarize the texts. True/False
    """
    # Set the prompt
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
These summaries will be embedded and used to retrieve the raw text or table elements. \
Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)
    # Text summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4")
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
    # Initialize empty lists for the summaries
    text_summaries = []
    table_summaries = []
    # Summarize the provided texts when requested
    if texts and summarize_texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
    elif texts:
        text_summaries = texts
    # Summarize the provided tables
    if tables:
        table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
    return text_summaries, table_summaries
# Get text, table summaries
text_summaries, table_summaries = generate_text_summaries(
    texts_4k_token, tables, summarize_texts=True
)
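A quick way to sanity-check the output is to print one of the generated summaries before indexing them (a small sketch; the guards just avoid indexing into empty lists):
# Inspect a sample summary (sketch)
if text_summaries:
    print(text_summaries[0][:300])
if table_summaries:
    print(table_summaries[0][:300])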
Image Summarization
We will use GPT-4o to generate summaries for images.
The images are passed as base64-encoded data.
import base64
import os
from langchain_core.messages import HumanMessage
def encode_image(image_path):
    # Encode the image file as a base64 string.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
def image_summarize(img_base64, prompt):
    # Generate an image summary.
    chat = ChatOpenAI(model="gpt-4o", max_tokens=2048)
    msg = chat.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ]
            )
        ]
    )
    return msg.content
def generate_img_summaries(path):
    """
    Generate summaries and base64-encoded strings for images.
    path: Directory containing the image files to summarize.
    """
    # List to store base64-encoded images
    img_base64_list = []
    # List to store image summaries
    image_summaries = []
    # Summarization prompt
    prompt = """You are an assistant tasked with summarizing images for retrieval. \
These summaries will be embedded and used to retrieve the raw image. \
Give a concise summary of the image that is well optimized for retrieval."""
    # Apply to the images
    for img_file in sorted(os.listdir(path)):
        # Process only the PNG files used in this example (file names starting with "10-")
        if img_file.startswith("10-") and img_file.endswith(".png"):
            img_path = os.path.join(path, img_file)
            base64_image = encode_image(img_path)
            img_base64_list.append(base64_image)
            image_summaries.append(image_summarize(base64_image, prompt))
    return img_base64_list, image_summaries
# Generate the image summaries
img_base64_list, image_summaries = generate_img_summaries('assets/')
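Before indexing, it is worth checking that every encoded image received a summary (a small sanity-check sketch):
# Each base64-encoded image should have a matching summary (sketch)
assert len(img_base64_list) == len(image_summaries)
print(f"{len(image_summaries)} images summarized")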
Store the original text, tables, and images in the docstore.
Store the text, table, and image summaries in the vectorstore for efficient semantic search.
Creating a Multi-Vector Search Engine That Indexes and Retrieves Text, Tables, and Images
Initialize the storage layer using InMemoryStore.
Create a MultiVectorRetriever that indexes the summaries but is configured to return the original text, tables, or images.
For each data type (text, tables, images), add the summaries and the original data to the vectorstore and docstore:
Generate a unique doc_id for each document.
Add the summaries to the vectorstore and store the original data, keyed by doc_id, in the docstore.
Add each data type only when its summaries are non-empty.
Use the Chroma vector store to index the summaries, generating embeddings with OpenAIEmbeddings.
The resulting multi-vector search engine indexes summaries for each data type while returning the original data at search time.
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
def create_multi_vector_retriever(
    vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images
):
    """
    Create a retriever that indexes the summaries but returns the original images or text.
    """
    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"
    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )
    # Helper function for adding documents to the vector store and the document store
    def add_documents(retriever, doc_summaries, doc_contents):
        # Create a unique ID for each piece of document content
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        # Add the summary documents to the vector store
        retriever.vectorstore.add_documents(summary_docs)
        # Add the original document contents to the document store, keyed by doc_id
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))
    # Add text, tables, and images
    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    if image_summaries:
        add_documents(retriever, image_summaries, images)
    return retriever
# Vector store to use for indexing summaries
vectorstore = Chroma(
    persist_directory="sample-rag-multi-modal", embedding_function=OpenAIEmbeddings()
)
# Create a retriever
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore,
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_summaries,
    img_base64_list,
)
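A quick check that both stores were populated can catch silent failures before building the chain (a sketch; the query string is only an example, and yield_keys is LangChain's InMemoryStore API):
# Number of original documents (texts + tables + images) held in the docstore (sketch)
print(len(list(retriever_multi_vector_img.docstore.yield_keys())))
# Peek at one indexed summary to confirm embeddings were created
print(vectorstore.similarity_search("revenue growth", k=1)[0].page_content[:200])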
RAG
Building the Retriever
The retrieved documents must be assigned to the correct sections of the GPT-4o prompt template.
The following describes how to process Base64-encoded images and text and use them to construct a multimodal question-answering (QA) chain:
Verify if a Base64-encoded string is an image. Supported image formats include JPG, PNG, GIF, and WEBP.
Resize the Base64-encoded image to the given dimensions.
Separate Base64-encoded images and text from a document set.
Use the separated images and text to construct messages that will serve as inputs to the multimodal QA chain. This process involves creating messages that include image URLs and text information.
Construct the multimodal QA chain. This chain generates responses to questions based on the provided image and text information. The model used is ChatOpenAI, specifically the gpt-4o model.
This process outlines the implementation of a multimodal QA system that leverages both image and text data to generate responses to questions. It includes Base64 encoding and decoding for image data, image resizing, and the integration of image and text information to produce responses.
import io
import re
from IPython.display import HTML, display
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from PIL import Image
def plt_img_base64(img_base64):
    """Display a base64-encoded string as an image"""
    # Create an HTML img tag that uses the base64 string as its source
    image_html = f'<img src="data:image/jpeg;base64,{img_base64}" />'
    # Render the HTML to display the image
    display(HTML(image_html))
def looks_like_base64(sb):
    """Check if the string appears to be base64"""
    return re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb) is not None
def is_image_data(b64data):
    """
    Check if the base64 data is an image by inspecting its first bytes
    """
    image_signatures = {
        b"\xff\xd8\xff": "jpg",
        b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a": "png",
        b"\x47\x49\x46\x38": "gif",
        b"\x52\x49\x46\x46": "webp",
    }
    try:
        header = base64.b64decode(b64data)[:8]  # Decode and inspect the first 8 bytes
        for sig, fmt in image_signatures.items():
            if header.startswith(sig):
                return True
        return False
    except Exception:
        return False
def resize_base64_image(base64_string, size=(128, 128)):
    """
    Resize an image encoded as a base64 string
    """
    # Decode the base64 string
    img_data = base64.b64decode(base64_string)
    img = Image.open(io.BytesIO(img_data))
    # Resize the image
    resized_img = img.resize(size, Image.LANCZOS)
    # Save the resized image to a byte buffer
    buffered = io.BytesIO()
    resized_img.save(buffered, format=img.format)
    # Re-encode the resized image to base64
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
def split_image_text_types(docs):
    """
    Separate base64-encoded images from text
    """
    b64_images = []
    texts = []
    for doc in docs:
        # Extract page_content if the document is a Document object
        if isinstance(doc, Document):
            doc = doc.page_content
        if looks_like_base64(doc) and is_image_data(doc):
            doc = resize_base64_image(doc, size=(1300, 600))
            b64_images.append(doc)
        else:
            texts.append(doc)
    return {"images": b64_images, "texts": texts}
def img_prompt_func(data_dict):
    """
    Build the multimodal prompt from the retrieved context
    """
    formatted_texts = "\n".join(data_dict["context"]["texts"])
    messages = []
    # If there are images, add them to the message
    if data_dict["context"]["images"]:
        for image in data_dict["context"]["images"]:
            image_message = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"},
            }
            messages.append(image_message)
    # Add the text for analysis
    text_message = {
        "type": "text",
        "text": (
            "You are a financial analyst tasked with providing investment advice.\n"
            "You will be given a mix of text, tables, and image(s), usually of charts or graphs.\n"
            "Use this information to provide investment advice related to the user question. Answer in English. Do NOT translate company names.\n"
            f"User-provided question: {data_dict['question']}\n\n"
            "Text and / or tables:\n"
            f"{formatted_texts}"
        ),
    }
    messages.append(text_message)
    return [HumanMessage(content=messages)]
def multi_modal_rag_chain(retriever):
    """
    Multimodal RAG chain
    """
    # Multimodal LLM
    model = ChatOpenAI(temperature=0, model="gpt-4o", max_tokens=2048)
    # RAG pipeline
    chain = (
        {
            "context": retriever | RunnableLambda(split_image_text_types),
            "question": RunnablePassthrough(),
        }
        | RunnableLambda(img_prompt_func)
        | model
        | StrOutputParser()
    )
    return chain
# Create a RAG chain
chain_multimodal_rag = multi_modal_rag_chain(retriever_multi_vector_img)
Verification
When we search for images related to a question, we receive relevant images in return.
# Execute the search query.
query = "Please provide the names of companies that are interesting investment opportunities based on EV/NTM and NTM revenue growth rates. Do you consider the EV/NTM multiple and historical data?"
# Search for 6 documents related to the query.
docs = retriever_multi_vector_img.invoke(query, limit=6)
# Check the number of documents.
len(docs) # Return the number of retrieved documents.
4
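You can also check what kinds of documents came back by reusing the helper defined above (a small sketch):
# Classify the retrieved documents into images and text/table chunks (sketch)
split = split_image_text_types(docs)
print(len(split["images"]), "images,", len(split["texts"]), "text/table chunks")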
# Verify the search results
query = "What are the EV/NTM and NTM revenue growth rates for MongoDB, Cloudflare, and Datadog?"
docs = retriever_multi_vector_img.invoke(query, limit=6)
# Check the number of documents
len(docs)
4
# Display the retrieved image.
plt_img_base64(docs[2])
Let’s revisit the images we stored to understand why this works.
# Display the image at index 2 of `img_base64_list`.
plt_img_base64(img_base64_list[2])
Here is the corresponding summary, which we embedded for similarity search.
It is quite reasonable that this image was retrieved: its summary is semantically similar to our query.
image_summaries[2]  # Access the element at index 2 of the `image_summaries` list.
'Table comparing key financial metrics of ten companies: EV/NTM Rev, EV/2024 Rev, EV/NTM FCF, NTM Rev Growth, Gross Margin, Operating Margin, FCF Margin, and % in Top 10 Multiple LTM. Companies include Snowflake, MongoDB, Palantir, and others. Average and median values are highlighted. Published by Altimeter.'
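To make this concrete, you can compute the cosine similarity between the query and this summary yourself (a sketch; numpy is an extra dependency used only for this check, and the embedding model matches the one used for indexing):
import numpy as np
# Embed the query and the retrieved summary with OpenAIEmbeddings (sketch)
emb = OpenAIEmbeddings()
q_vec = emb.embed_query(query)
s_vec = emb.embed_query(image_summaries[2])
# Cosine similarity: higher values mean a closer semantic match
print(np.dot(q_vec, s_vec) / (np.linalg.norm(q_vec) * np.linalg.norm(s_vec)))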
RAG
Now, let's run RAG and test its ability to synthesize answers to our questions.
# Execute the RAG chain.
print(chain_multimodal_rag.invoke(query))
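If you also want to see which context the chain used for its answer, you can return the raw context alongside the generated text. This is a sketch rather than part of the original chain; the name chain_with_sources is illustrative:
# Variant chain that returns the answer together with the context it was given (sketch)
chain_with_sources = {
    "context": retriever_multi_vector_img | RunnableLambda(split_image_text_types),
    "question": RunnablePassthrough(),
} | RunnablePassthrough.assign(
    answer=RunnableLambda(img_prompt_func)
    | ChatOpenAI(temperature=0, model="gpt-4o", max_tokens=2048)
    | StrOutputParser()
)
result = chain_with_sources.invoke(query)
print(result["answer"])
print(len(result["context"]["images"]), "images were passed to the model")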