In [None]:
!pip install sentence-transformers faiss-cpu PyPDF2 python-docx numpy


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2, faiss-cpu
Successfully installed PyPDF2-3.0.1 faiss-cpu-1.12.0


In [None]:
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document


In [None]:
def extract_text_from_file(file_path):
    """
    Extracts text content from PDF, DOCX, and TXT files.

    The file type is selected from the file extension, compared
    case-insensitively (so "REPORT.PDF" is handled like "report.pdf").

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text with leading/trailing whitespace stripped, or an
        empty string when the extension is not .pdf, .docx or .txt.
    """
    lowered = str(file_path).lower()

    if lowered.endswith(".pdf"):
        reader = PdfReader(file_path)
        # Defensive: a page without extractable text may yield None or "";
        # treat both as empty instead of failing on `None + "\n"`.
        text = "\n".join((page.extract_text() or "") for page in reader.pages)
    elif lowered.endswith(".docx"):
        doc = Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    elif lowered.endswith(".txt"):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        # Unsupported extension: nothing to extract.
        text = ""
    return text.strip()


In [None]:
def chunk_text(text, chunk_size=300):
    """
    Splits a long text into smaller chunks for semantic embedding.

    The text is split on whitespace and regrouped into consecutive,
    non-overlapping chunks of at most `chunk_size` words, each re-joined
    with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings (empty when `text` contains no words).

    Raises:
        ValueError: If `chunk_size` is not a positive number.
    """
    # A non-positive size would either crash inside range() with an unrelated
    # message (0) or silently discard the whole text (negative step).
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


In [None]:
folder_path = "documents/"
documents = []    # one entry per text chunk
doc_sources = []  # doc_sources[i] is the file name that documents[i] came from
files_read = 0    # number of supported files actually processed

# os.listdir() returns entries in arbitrary order; sorting keeps the chunk
# order (and therefore the vector ids used later) stable across runs.
for file in sorted(os.listdir(folder_path)):
    if file.endswith((".pdf", ".docx", ".txt")):
        path = os.path.join(folder_path, file)
        print(f"📄 Reading file: {file}")
        content = extract_text_from_file(path)
        chunks = chunk_text(content)
        documents.extend(chunks)
        doc_sources.extend([file] * len(chunks))
        files_read += 1

# Report the files that were read, not every entry present in the folder.
print(f"\nLoaded {len(documents)} text chunks from {files_read} files.")


📄 Reading file: data_science.docx
📄 Reading file: db_basics.txt
📄 Reading file: ai_intro.pdf

Loaded 3 text chunks from 3 files.


In [None]:
# Load the sentence-embedding model used for both documents and queries.
model = SentenceTransformer('all-MiniLM-L6-v2')
print("\nGenerating embeddings... (this may take a minute)")

# Encode every chunk and cast the resulting matrix to float32 in one step.
embeddings = model.encode(
    documents,
    convert_to_numpy=True,
    show_progress_bar=True,
).astype('float32')

# In-place L2 normalization, so that inner product equals cosine similarity.
faiss.normalize_L2(embeddings)

print(f"Embeddings shape: {embeddings.shape}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Generating embeddings... (this may take a minute)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings shape: (3, 384)


In [None]:
# Build the similarity index over all chunk embeddings.
# The vector width is the second axis of the embedding matrix.
dimension = embeddings.shape[1]
# Inner-product index; this equals cosine similarity here only because the
# embeddings were L2-normalized before being added.
index = faiss.IndexFlatIP(dimension)  # Inner Product = Cosine similarity
# Vectors are added in `documents` order, so a result id indexes `documents`
# and `doc_sources` directly.
index.add(embeddings)
print(f"FAISS index created with {index.ntotal} vectors.")


FAISS index created with 3 vectors.


In [None]:
import re
import textwrap

def clean_text(text):
    """
    Cleans Markdown symbols and extra whitespace from extracted text.

    Removes heading, emphasis, code and rule markers while keeping hyphens
    and underscores that sit inside a word (e.g. "decision-making",
    "snake_case"), then collapses all whitespace runs to single spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-line string.
    """
    # Characters that are only ever markup here: headings (#), setext
    # underlines (=), bold/italic (*), inline code (`), strikethrough (~).
    # Stripping '*' runs also unwraps **bold** spans, so no separate pass
    # for bold markers is needed.
    text = re.sub(r'[#=*`~]+', '', text)
    # '-' and '_' are markup only when NOT joining two word characters:
    # drop runs that start without a word char before them (list bullets,
    # '---' rules, leading '_' of _emphasis_) or end without one after them
    # (trailing '_' of _emphasis_), but keep intra-word ones intact.
    text = re.sub(r'(?<![\w-])[-_]+|[-_]+(?![\w-])', '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # normalize spaces
    return text


def semantic_search_best(query, top_k=1, wrap_width=100, similarity_threshold=0.35, snippet_length=300):
    """
    Performs semantic search and returns the most relevant file(s)
    with a short, word-wrapped snippet preview.

    Results are printed to stdout; nothing is returned. The function relies
    on the notebook-level `model`, `index`, `documents` and `doc_sources`.

    Args:
        query: Free-text search query.
        top_k: Maximum number of nearest chunks to consider. Values larger
            than the number of indexed vectors are clamped to that number.
        wrap_width: Column width used when wrapping the preview snippet.
        similarity_threshold: Matches scoring below this value are skipped.
        snippet_length: Maximum number of characters of cleaned text shown.
    """
    print("\nTop Semantic Search Result(s):")
    print("=" * 120)

    results_shown = 0

    # Never request more neighbours than the index holds: FAISS pads the
    # missing slots with label -1, which would otherwise index documents[-1].
    k = min(top_k, index.ntotal)

    if k > 0:
        query_embedding = model.encode([query]).astype('float32')
        # Normalize the query in place so the inner-product score is
        # computed on a unit-length vector.
        faiss.normalize_L2(query_embedding)

        D, I = index.search(query_embedding, k)

        for rank, (score, idx) in enumerate(zip(D[0], I[0])):
            if idx < 0:
                continue  # padding slot, not a real document
            if score < similarity_threshold:
                continue  # skip weak matches

            snippet = clean_text(documents[idx])[:snippet_length]  # limit to snippet_length chars
            wrapped_snippet = textwrap.fill(snippet, width=wrap_width)

            print(f"\nRank {rank + 1}")
            print(f"Source File     : {doc_sources[idx]}")
            print(f"Similarity Score: {score:.4f}")
            print("-" * 120)
            print(f"Preview Snippet:\n{wrapped_snippet}")
            print("=" * 120)
            results_shown += 1

    if results_shown == 0:
        print("No strong semantic matches found for your query.")
""

''

In [None]:
semantic_search_best("applications of artificial intelligence")


Top Semantic Search Result(s):

Rank 1
Source File     : ai_intro.pdf
Similarity Score: 0.5430
------------------------------------------------------------------------------------------------------------------------
Preview Snippet:
Artificial Intelligence (AI) Introduction Artificial Intelligence refers to the simulation of human
intelligence in machines that are programmed to think and act like humans. The core idea is to
enable computers to perform tasks such as reasoning, learning, perception, and decisionmaking.
Branches o


In [None]:
semantic_search_best("database systems and AI", top_k=3)


Top Semantic Search Result(s):

Rank 1
Source File     : ai_intro.pdf
Similarity Score: 0.5042
------------------------------------------------------------------------------------------------------------------------
Preview Snippet:
Artificial Intelligence (AI) Introduction Artificial Intelligence refers to the simulation of human
intelligence in machines that are programmed to think and act like humans. The core idea is to
enable computers to perform tasks such as reasoning, learning, perception, and decisionmaking.
Branches o

Rank 2
Source File     : db_basics.txt
Similarity Score: 0.4668
------------------------------------------------------------------------------------------------------------------------
Preview Snippet:
Database Basics Databases are structured systems used to store, manage, and retrieve data
efficiently. They play a crucial role in modern applications ranging from web services to analytics
platforms. Types of Databases 1. Relational Databases (RDBMS) — Store da