In [1]:
# Step 1: Install necessary packages
!pip install langchain langchain-community pypdf sentence-transformers faiss-cpu transformers

import os
import logging
import textwrap
from google.colab import files
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Step 2: Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Step 3: Define the RAG system class
class LocalRAGSystem:
    def __init__(self):
        self.documents = []
        self.vector_store = None
        self.embeddings = None
        self.llm = None
        self.qa_chain = None

    # Step 4: Upload PDFs to Colab
    def upload_pdfs(self):
        try:
            uploaded = files.upload()
            pdf_paths = list(uploaded.keys())
            logger.info(f"Uploaded {len(pdf_paths)} PDFs: {pdf_paths}")
            return pdf_paths
        except Exception as e:
            logger.error(f"Error uploading PDFs: {e}")
            raise

    # Step 5: Load and parse PDF documents
    def load_documents(self, pdf_paths):
        for pdf_path in pdf_paths:
            try:
                loader = PyPDFLoader(pdf_path)
                documents = loader.load()
                self.documents.extend(documents)
                logger.info(f"Loaded {len(documents)} pages from {pdf_path}")
            except Exception as e:
                logger.error(f"Error loading {pdf_path}: {e}")
        logger.info(f"Loaded {len(self.documents)} document pages in total.")

    # Step 6: Split documents into chunks for embeddings
    def split_documents(self, chunk_size=1000, chunk_overlap=200):
        try:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap
            )
            self.document_chunks = text_splitter.split_documents(self.documents)
            logger.info(f"Split into {len(self.document_chunks)} chunks.")
        except Exception as e:
            logger.error(f"Error splitting documents: {e}")
            raise

    # Step 7: Setup embedding model for vector store
    def setup_embeddings(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        try:
            self.embeddings = HuggingFaceEmbeddings(model_name=model_name)
            logger.info(f"Set up embeddings using {model_name}")
        except Exception as e:
            logger.error(f"Error setting up embeddings: {e}")
            raise

    # Step 8: Create a vector store using FAISS
    def create_vector_store(self):
        try:
            self.vector_store = FAISS.from_documents(self.document_chunks, self.embeddings)
            logger.info("Created FAISS vector store.")
        except Exception as e:
            logger.error(f"Error creating vector store: {e}")
            raise

    # Step 9: Setup a local language model
    def setup_local_llm(self, model_id="google/flan-t5-base", device="auto"):
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=device)
            pipe = pipeline(
                "text2text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=512,
                temperature=0.7
            )
            self.llm = HuggingFacePipeline(pipeline=pipe)
            logger.info(f"Set up local LLM using {model_id}")
        except Exception as e:
            logger.error(f"Error setting up local LLM: {e}")
            raise

    # Step 10: Setup the RetrievalQA chain
    def setup_qa_chain(self, k=3):
        try:
            self.qa_chain = RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.vector_store.as_retriever(search_kwargs={"k": k})
            )
            logger.info(f"Set up QA chain with k={k}.")
        except Exception as e:
            logger.error(f"Error setting up QA chain: {e}")
            raise

    # Step 11: Answer questions using the RAG system
    def answer_question(self, question):
        try:
            answer = self.qa_chain.run(question)
            logger.info(f"Q: {question}\nA: {answer}")
            return answer
        except Exception as e:
            logger.error(f"Error answering question: {e}")
            raise

    # Step 12: Run the setup process end-to-end
    def run_setup(self, chunk_size=1000, chunk_overlap=200, model_id="google/flan-t5-base", k=3):
        try:
            pdf_paths = self.upload_pdfs()
            self.load_documents(pdf_paths)
            self.split_documents(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
            self.setup_embeddings()
            self.create_vector_store()
            self.setup_local_llm(model_id=model_id)
            self.setup_qa_chain(k=k)
            logger.info("RAG system is ready to use!")
        except Exception as e:
            logger.error(f"Error during setup: {e}")
            raise

# Step 13: Example usage with wrapped output
if __name__ == "__main__":
    import textwrap

    rag = LocalRAGSystem()
    rag.run_setup(chunk_size=1000, chunk_overlap=200, model_id="google/flan-t5-base", k=3)

    wrapper = textwrap.TextWrapper(width=80)

    question1 = "What is the main topic of these documents?"
    answer1 = rag.answer_question(question1)
    print(f"Q: {question1}\nA: {wrapper.fill(answer1)}\n")

    question2 = "Summarize the key points from the documents."
    answer2 = rag.answer_question(question2)
    print(f"Q: {question2}\nA: {wrapper.fill(answer2)}\n")


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0

Saving ragg.pdf to ragg.pdf


  self.embeddings = HuggingFaceEmbeddings(model_name=model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  self.llm = HuggingFacePipeline(pipeline=pipe)
  answer = self.qa_chain.run(question)
Token indices sequence length is longer than the specified maximum sequence length for this model (689 > 512). Running this sequence through the model will result in indexing errors
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Q: What is the main topic of these documents?
A: Jeopardy questions often contain two separate pieces of information, and RAG-
Token may perform best because it can generate responses that combine content
from several documents.

Q: Summarize the key points from the documents.
A: Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks Appendices for
Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks A
Implementation Details For Open-domain QA we report test numbers using 15
retrieved documents for RAG-Token models. For RAG-Sequence models, we report
test results using 50 retrieved documents, and we use the Thorough Decoding
approach since answers are generally short. We use greedy decoding for QA as we
did not find beam search improved results. For Open-MSMarco and Jeopardy
question generation, we report test numbers using ten retrieved documents for
both RAG-Token and RAG-Sequence, and we also train a BART-large model as a
baseline. We use a beam size of four, a