# -*- coding: utf-8 -*-
"""
Refactored Code on Wed Mar 25 15:30:10 2026
Bible RAG - This program uses semantic search over the Bible and personal notes to answer queries about the Bible
@author: David DiPaola
"""
import os
import json
import sqlite3
from datetime import datetime
from typing import List, Dict, Any, Optional
import numpy as np
import torch
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
class BibleRAG:
"""
RAG system bridging the canonical Bible text and personal SQLite notes.
"""
def __init__(
self,
llm_model_path: str = "E:/Python/AI/gemma-2-9b-it-Q5_K_L.gguf",
#llm_model_path: str = "E:/Python/AI/Meta-Llama-3.1-8B-Instruct-Q4_K_L.gguf", # If you would like to use the Llama 3.1 8B model
embed_model_name: str = "all-mpnet-base-v2",
bible_path: str = 'E:/Python/bible/bible_web_embedding.json',
bible_embeddings_path: str = "E:/Python/bible/bible_embeddings.npy",
cross_encoder_name: str = "BAAI/bge-reranker-v2-m3",
device: Optional[str] = None,
        max_context_tokens: int = 4096,  # Limited to fit GPU memory and keep generation fast
        batch_size: int = 8,  # Kept small: VRAM fragmentation means llama-cpp needs contiguous blocks of GPU memory for matrix multiplication
):
self.llm_model_path = llm_model_path
self.embed_model_name = embed_model_name
self.bible_path = bible_path
self.bible_embeddings_path = bible_embeddings_path
self.cross_encoder_name = cross_encoder_name
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
self.max_context_tokens = max_context_tokens
self.batch_size = batch_size
self.llm = None
self.notes_index = None
self.notes_content = []
self._set_model_type()
self._load_bible_data()
self._load_embedding_models()
self._build_bible_faiss_index()
def _set_model_type(self):
path_lower = self.llm_model_path.lower()
if "Meta-Llama-3.1" in path_lower or "llama-3.2" in path_lower:
self.model_type = "llama"
elif "gemma-2" in path_lower:
self.model_type = "gemma"
else:
raise ValueError(f"Unknown or unsupported model type for path: {self.llm_model_path}")
def _load_bible_data(self):
with open(self.bible_path, 'r', encoding='utf-8') as bible:
self.bible_data = json.load(bible)
    # Load the embedding model and cross-encoder on CPU to leave GPU memory for the LLM.
    # This trades a few seconds of speed for roughly 35% VRAM savings.
    def _load_embedding_models(self):
        print("Loading SentenceTransformer on CPU...")
        self.embed_model = SentenceTransformer(self.embed_model_name, device="cpu", trust_remote_code=True)
        print(f"Loading CrossEncoder on CPU: {self.cross_encoder_name}")
        self.cross_encoder = CrossEncoder(self.cross_encoder_name, device="cpu")
def _build_bible_faiss_index(self):
# Extract only the book and verse wording for inference
bible_text = [f"{item['metadata']['book']} {item['metadata']['text_only']}" for item in self.bible_data]
if os.path.exists(self.bible_embeddings_path):
print("Loading cached Bible embeddings...")
embeddings = np.load(self.bible_embeddings_path)
else:
print("No cache found. Embedding Bible now...")
embeddings = self.embed_model.encode(
bible_text,
batch_size=self.batch_size,
show_progress_bar=True,
convert_to_numpy=True,
normalize_embeddings=True
)
np.save(self.bible_embeddings_path, embeddings)
dimension = embeddings.shape[1]
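        # Metric note (given normalize_embeddings=True above): for unit vectors,
        # squared L2 distance d and cosine similarity c satisfy d = 2 * (1 - c),
        # so nearest-by-L2 ranks identically to cosine similarity; faiss.IndexFlatIP
        # on normalized vectors would be an equivalent choice.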
self.index = faiss.IndexFlatL2(dimension)
self.index.add(embeddings.astype('float32'))
print(f"FAISS index built with {self.index.ntotal} vectors.")
def _embed_query(self, query: str) -> np.ndarray:
"""Shared utility to embed user queries."""
return self.embed_model.encode(
[query],
batch_size=1,
show_progress_bar=False,
convert_to_numpy=True,
normalize_embeddings=True
).astype("float32")
def retrieve_bible(self, query: str, k: int = 10, rerank: bool = True, bi_k: int = 40) -> List[Dict[str, Any]]:
"""
Two-stage retrieval pipeline (as defined in the MD specification):
1. Bi-Encoder (FAISS + all-mpnet-base-v2) → fast candidate retrieval
2. Cross-Encoder reranking → precision boost
Args:
query (str): User question / prompt
k (int): Final number of verses to return (spec recommends 5–8)
rerank (bool): Whether to apply cross-encoder reranking
bi_k (int): How many candidates to fetch from FAISS before reranking
(higher = better reranking quality, still very fast)
Returns:
list[dict]: Top-k results with content, scores, and full metadata
Example item:
{
"book": "John",
"chapter": 3,
"verse": 16,
"content": "For God so loved the world...",
"bi_score": 0.892,
"cross_score": 0.945,
"index": 1234
}
"""
query_emb = self._embed_query(query)
search_k = bi_k if rerank else k
distances, indices = self.index.search(query_emb, search_k)
candidates = []
for idx, dist in zip(indices[0], distances[0]):
doc = self.bible_data[idx].copy()
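            # bi_score is heuristic: IndexFlatL2 returns *squared* L2 distance, so for
            # normalized vectors 1.0 - dist equals 2*cos - 1. It is monotone in cosine
            # similarity, which is all the ranking below requires.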
doc.update({
"bi_score": float(1.0 - dist),
"faiss_distance": float(dist),
"index": int(idx)
})
candidates.append(doc)
if not rerank or len(candidates) <= k:
candidates.sort(key=lambda x: x["bi_score"], reverse=True)
return candidates[:k]
cross_inputs = [(query, doc["content"]) for doc in candidates]
cross_scores = self.cross_encoder.predict(cross_inputs)
for i, score in enumerate(cross_scores):
candidates[i]["cross_score"] = float(score)
candidates.sort(key=lambda x: x["cross_score"], reverse=True)
return candidates[:k]
def index_notes(self, notes_manager: 'NotesManager'):
"""
Fetches notes from DB, embeds them, and builds a secondary FAISS index.
"""
raw_notes = notes_manager.get_all_notes()
if not raw_notes:
print("No notes found to index.")
return
self.notes_content = [f"Date: {n[1]} | Content: {n[0]} | Verses: {n[3]}" for n in raw_notes]
note_embeddings = self.embed_model.encode(
self.notes_content,
convert_to_numpy=True,
normalize_embeddings=True
).astype('float32')
self.notes_index = faiss.IndexFlatL2(note_embeddings.shape[1])
self.notes_index.add(note_embeddings)
print(f"Notes Index built with {len(raw_notes)} personal reflections.")
def retrieve_notes(self, query: str, k_notes: int = 2, rerank: bool = True, k_notes_total: int = 10, threshold: float = 0.01) -> List[Dict[str, Any]]:
if not self.notes_index:
return []
query_emb = self._embed_query(query)
note_distances, note_indices = self.notes_index.search(query_emb, k_notes_total)
personal_notes = []
for i, idx in enumerate(note_indices[0]):
if idx != -1:
personal_notes.append({
"notes_content": self.notes_content[idx],
"note_score": float(1.0 - note_distances[0][i])
})
if not rerank or not personal_notes:
personal_notes.sort(key=lambda x: x["note_score"], reverse=True)
return personal_notes[:k_notes]
notes_cross_inputs = [(query, note["notes_content"]) for note in personal_notes]
notes_cross_scores = self.cross_encoder.predict(notes_cross_inputs)
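        # Note: CrossEncoder.predict applies a sigmoid by default when the model has a
        # single output logit, so these scores typically land in (0, 1); the small
        # threshold below drops only clearly irrelevant notes.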
final_notes = []
for i, score in enumerate(notes_cross_scores):
if score > threshold:
personal_notes[i]["note_cross_score"] = float(score)
final_notes.append(personal_notes[i])
final_notes.sort(key=lambda x: x["note_cross_score"], reverse=True)
return final_notes[:k_notes]
def initialize_llm(self):
print(f"Loading LLM from {self.llm_model_path}...")
self.llm = LlamaCpp(
model_path=self.llm_model_path,
n_ctx=self.max_context_tokens,
max_tokens=768,
n_batch=self.batch_size,
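            # n_gpu_layers=-1 offloads every model layer to the GPU (llama-cpp convention)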
n_gpu_layers=-1,
temperature=0.3,
repeat_penalty=1.1,
verbose=False,
            f16_kv=True,  # 16-bit KV cache halves cache memory vs the 32-bit fallback (f16_kv=False); an 8-bit cache would need llama.cpp's quantized type_k/type_v options
            use_mlock=True,  # Lock model weights in RAM so the OS cannot swap them to disk; with 64 GB of RAM (~24 GB used with everything loaded) this is no real constraint
top_p=0.95,
top_k=30,
model_kwargs={
"offload_kqv": False, # Moves the Key-Value cache to GPU true - left false to maximize GPU space (took speed hit)
"flash_attn": False, # Reduces VRAM usage (if your GPU supports it / not supported on my GPU) - left false
"context_shift": False # Experimental feature for long-context handling, may cause instability and not needed for my context window - left false
}
)
def _create_prompt_template(self) -> ChatPromptTemplate:
        # A dictionary of templates allows each model's prompt to be tuned independently.
        # Repeating the shared rules across templates is verbose, but keeps each one cleaner and easier to modify in isolation.
templates = {
"gemma": (
"<start_of_turn>user\n"
"You are a compassionate friend sharing wisdom from ONLY THE CONTEXT and my PERSONAL NOTES.\n\n"
"Rules:\n"
"1. Explain themes and connect passages.\n"
"2. Synthesize BIBLE CONTEXT with USER'S PERSONAL NOTES.\n"
"3. Cite references like (John 14:27).\n"
"4. Long-form, reflective essay style.\n"
"5. Conversational, heart-to-heart, respectful tone.\n"
"6. No lists. No 'therefore'. No 'Amen'. No repetition.\n"
"7. Stay within 768 tokens.\n\n"
"CONTEXT:\n{context}\n\n"
"PERSONAL NOTES:\n{personal_notes}\n\n"
"QUESTION:\n{question}<end_of_turn>\n"
"<start_of_turn>model "
),
"llama": (
"<|start_header_id|>system<|end_header_id|>\n"
"You are a compassionate friend sharing wisdom from ONLY THE CONTEXT and my PERSONAL NOTES.\n\n"
"Rules:\n"
"1. Explain themes and connect passages.\n"
"2. Synthesize BIBLE CONTEXT with USER'S PERSONAL NOTES.\n"
"3. Cite references like (John 14:27).\n"
"4. Long-form, reflective essay style.\n"
"5. Conversational, heart-to-heart, respectful tone.\n"
"6. No lists. No 'therefore'. No 'Amen'. No repetition.\n"
"7. Stay within 768 tokens.\n\n"
"<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
"CONTEXT:\n{context}\n\n"
"QUESTION:\n{question}\n\n"
"<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
)
}
print(f"Using prompt template for: {self.model_type}")
return ChatPromptTemplate.from_template(templates.get(self.model_type))
def _expand_and_merge_context(self, docs: List[Dict[str, Any]], window: int = 3) -> str:
        # Expand each main verse to the surrounding +/- `window` verses for contextual understanding of the theme
if not docs:
return "No relevant verses found."
# 1. Expand range of main verse to surrounding verses
ranges = []
for doc in docs:
idx = doc["index"]
start = max(0, idx - window)
end = min(len(self.bible_data) - 1, idx + window)
ranges.append([start, end, doc])
# 2. Merge overlapping intervals
ranges.sort(key=lambda x: x[0])
merged = []
for current in ranges:
if not merged or merged[-1][1] < current[0] - 1:
merged.append(current)
else:
merged[-1][1] = max(merged[-1][1], current[1])
curr_score = current[2].get("cross_score", current[2].get("bi_score", 0))
prev_score = merged[-1][2].get("cross_score", merged[-1][2].get("bi_score", 0))
if curr_score > prev_score:
merged[-1][2] = current[2]
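        # Worked example (hypothetical indices): hits at 500 and 503 with window=3 give
        # ranges [497, 503] and [500, 506]; these overlap, so they merge into [497, 506],
        # keeping whichever hit scored higher as the "main verse" anchor.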
        # 3. Format the merged ranges for the LLM and for display to the user (hybrid approach)
context_parts = []
for start, end, orig_doc in merged:
passage = []
meta_orig = orig_doc.get("metadata", {})
# Anchor the LLM with the "Main Verse"
ref = f"{meta_orig.get('book')} {meta_orig.get('chapter')}:{meta_orig.get('verse')}"
passage.append(f"** Main verse: {ref} **")
current_book, current_chapter = None, None
for i in range(start, end + 1):
v_meta = self.bible_data[i].get("metadata", {})
book, chap, vnum = v_meta.get("book", "Unknown"), v_meta.get("chapter", "?"), v_meta.get("verse", "?")
text = v_meta.get("text_only", "").strip()
if book != current_book or chap != current_chapter:
passage.append(f"\n[{book} {chap}]")
current_book, current_chapter = book, chap
passage.append(f"{vnum}: {text}")
context_parts.append("\n".join(passage))
return "\n\n----- \n\n".join(context_parts)
    @staticmethod
    def format_notes_for_llm(notes_list: List[Dict[str, Any]]) -> str:
        formatted_notes = []
        for note in notes_list:
            # Parse the "Date: ... | Content: ... | Verses: ..." string built in index_notes.
            # Splitting on the labeled delimiters keeps note content containing " | " intact.
            body = note['notes_content']
            date_part, _, rest = body.partition(" | Content: ")
            content, _, verses = rest.rpartition(" | Verses: ")
            date = date_part.replace("Date: ", "").strip()
            formatted_notes.append(f"On {date}, I reflected: '{content.strip()}' (Related to: {verses.strip()})")
        return "\n\n".join(formatted_notes)
def query(self, question: str, b_k: int = 7, n_k: int = 3) -> str:
if not self.llm:
self.initialize_llm()
bible_docs = self.retrieve_bible(query=question, k=b_k)
notes_docs = self.retrieve_notes(query=question, k_notes=n_k)
formatted_bible = self._expand_and_merge_context(bible_docs)
formatted_notes = self.format_notes_for_llm(notes_docs)
        # Print the verses and notes retrieved by the bi-/cross-encoder stages; interesting for personal study
print(f"Here is the Bible context:\n{formatted_bible}\n")
print(f"Here are the Personal Notes:\n{formatted_notes}\n")
print(f"Bible context extracted.\nPersonal notes extracted.\nReasoning for: '{question}'...\n")
prompt = self._create_prompt_template()
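        # LCEL pipeline: fill the prompt, pass the rendered string to the LLM, parse the completion to str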
chain = prompt | self.llm | StrOutputParser()
return chain.invoke({
"context": formatted_bible,
"personal_notes": formatted_notes,
"question": question
})
class NotesManager:
def __init__(self, db_path: str = "E:/Python/bible/sovereign_notes.db"):
self.db_path = db_path
self._init_db()
def _init_db(self):
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS personal_notes (
id INTEGER PRIMARY KEY AUTOINCREMENT,
date TEXT,
content TEXT,
tags TEXT,
associated_verses TEXT
)
''')
conn.commit()
def add_note(self, content: str, tags: str = "", verses: str = ""):
'''
Saves a new personal note.
        Example:
            # Add a sample note about peace to see if the RAG can find it later
            # (the date is added automatically)
            notes_db.add_note(
                content=("I realized today that peace isn't the absence of conflict, "
                         "but the presence of God in it. Like Jesus sleeping in the boat during the storm."),
                tags="peace, trials, personal-reflection",
                verses="Mark 4:38"
            )
'''
date_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute(
"INSERT INTO personal_notes (date, content, tags, associated_verses) VALUES (?, ?, ?, ?)",
(date_str, content, tags, verses)
)
conn.commit()
print(f"Note saved successfully at {date_str}")
def get_all_notes(self) -> List[Any]:
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute("SELECT content, date, tags, associated_verses FROM personal_notes")
return cursor.fetchall()
if __name__ == "__main__":
# Example usage
rag_system = BibleRAG()
notes_db = NotesManager()
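    # First run only: the notes table starts empty, so index_notes will report
    # "No notes found to index." Optionally seed a sample reflection first
    # (illustrative content; see NotesManager.add_note's docstring):
    # notes_db.add_note(
    #     content="Peace isn't the absence of conflict, but the presence of God in it.",
    #     tags="peace, trials, personal-reflection",
    #     verses="Mark 4:38",
    # )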
rag_system.index_notes(notes_db)
query_str = "What does the Bible and my personal notes say about finding peace in trials?"
response_text = rag_system.query(query_str)
print("\n=== Response ===\n")
print(response_text)