Document Search Template
RAG + Vector Database template for intelligent document search
Quick Start
# Clone template
git clone https://github.com/wearehybrid/search-template.git
cd search-template

# Install dependencies
pip install -r requirements.txt

# Set environment variables
cp .env.example .env
# Add your OPENAI_API_KEY and database credentials

# Initialize vector database
python scripts/setup_db.py

# Index sample documents
python scripts/index_documents.py --folder ./sample_docs

# Start API server
uvicorn main:app --reload
Architecture
📄 Documents (PDF, DOCX, TXT) → 🔍 Processing (chunking & embedding) → 🗄️ Vector DB (ChromaDB/Pinecone) → 🤖 RAG (retrieval + generation)
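In code, the same flow is only a few lines. A minimal sketch, assuming the DocumentProcessor and RAGEngine classes shown under Key Components below, imported via the paths from the Code Structure section:

```python
# Minimal end-to-end sketch: load + chunk documents, embed them into the
# vector store, then answer a question over them.
# Import paths follow the Code Structure section; adjust if your layout differs.
from pathlib import Path

from src.processors.document_processor import DocumentProcessor
from src.search.rag_engine import RAGEngine

processor = DocumentProcessor()
engine = RAGEngine(persist_directory="./chroma_db")

for path in Path("./sample_docs").glob("*"):            # folder from Quick Start
    if path.suffix.lower() in {".pdf", ".docx", ".txt"}:
        chunks = processor.process_file(str(path))       # chunking
        engine.vectorstore.add_documents(chunks)         # embedding + storage

print(engine.search("What is machine learning?")["answer"])   # retrieval + generation
```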
Features
🔍 Search Capabilities
- Semantic similarity search
- Keyword + vector hybrid search
- Multi-document search
- Filtered search by metadata (see the sketch after this list)
- Real-time indexing
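Metadata-filtered semantic search can be run directly against the Chroma store used later in this template. A minimal sketch, assuming a document_type field was stored in chunk metadata at indexing time:

```python
# Sketch: semantic search restricted by metadata (filter keys are illustrative).
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=OpenAIEmbeddings(),
)

# Top-5 chunks most similar to the query, limited to PDF-sourced chunks
hits = vectorstore.similarity_search(
    "What is machine learning?",
    k=5,
    filter={"document_type": "pdf"},   # assumes this key exists in chunk metadata
)
for doc in hits:
    print(doc.metadata, doc.page_content[:80])
```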
📚 Document Processing
- Multi-format support (PDF, DOCX, TXT)
- Intelligent chunking strategies
- Metadata extraction
- OCR for scanned documents (see the sketch after this list)
- Batch processing
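Scanned documents need an OCR pass before chunking. A minimal sketch of one approach, assuming pytesseract and pdf2image; neither library is prescribed by the template itself:

```python
# Sketch: OCR fallback for scanned PDFs (pytesseract/pdf2image are assumptions,
# not part of the template's documented stack).
import pytesseract
from pdf2image import convert_from_path
from langchain.schema import Document

def ocr_pdf(file_path: str) -> list[Document]:
    """Render each page to an image, OCR it, and wrap the text as a Document."""
    pages = convert_from_path(file_path)
    return [
        Document(
            page_content=pytesseract.image_to_string(page),
            metadata={"source": file_path, "page": i},
        )
        for i, page in enumerate(pages)
    ]

# The resulting Documents can be fed to the same text splitter as regular files.
```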
Code Structure
search-app/
├── src/
│   ├── processors/
│   │   ├── document_processor.py
│   │   ├── chunking.py
│   │   └── embeddings.py
│   ├── search/
│   │   ├── vector_store.py
│   │   ├── retriever.py
│   │   └── rag_engine.py
│   ├── api/
│   │   ├── search_api.py
│   │   └── upload_api.py
│   └── utils/
│       ├── file_utils.py
│       └── text_utils.py
├── scripts/
│   ├── setup_db.py
│   └── index_documents.py
├── frontend/
│   ├── components/
│   └── pages/
└── requirements.txt
Key Components
Document Processor
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class DocumentProcessor:
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )

    def process_file(self, file_path):
        # Pick a loader based on file type
        if file_path.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif file_path.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        else:
            # Plain-text files (.txt)
            loader = TextLoader(file_path)

        # Load, then split into overlapping chunks
        documents = loader.load()
        chunks = self.text_splitter.split_documents(documents)
        return chunks
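A quick usage sketch for the processor (the file name is just an example):

```python
# Hypothetical usage: chunk a single PDF from the sample folder
processor = DocumentProcessor()
chunks = processor.process_file("./sample_docs/ml_guide.pdf")
print(f"{len(chunks)} chunks created")
print(chunks[0].page_content[:200])   # preview the first chunk
```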
RAG Engine

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA


class RAGEngine:
    def __init__(self, persist_directory="./chroma_db"):
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embeddings
        )
        self.llm = OpenAI(temperature=0)
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(
                search_kwargs={"k": 5}
            ),
            return_source_documents=True
        )

    def search(self, query):
        result = self.qa_chain({"query": query})
        return {
            "answer": result["result"],
            "sources": [doc.metadata for doc in result["source_documents"]]
        }

API Endpoints
Search API
POST /api/search
{
  "query": "What is machine learning?",
  "filters": {
    "document_type": "pdf",
    "date_range": "2023-01-01:2023-12-31"
  },
  "limit": 10
}
Response:
{
  "answer": "Machine learning is...",
  "sources": [
    {
      "document": "ml_guide.pdf",
      "page": 5,
      "score": 0.95
    }
  ]
}
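For orientation, a minimal sketch of how src/api/search_api.py could expose this contract with FastAPI; the request model, wiring, and handling of filters/limit below are assumptions rather than the template's exact implementation:

```python
# Sketch only: a FastAPI handler matching the request/response shapes above.
# The RAGEngine import path follows the Code Structure section.
from typing import Optional
from fastapi import FastAPI
from pydantic import BaseModel

from src.search.rag_engine import RAGEngine

app = FastAPI()
engine = RAGEngine()

class SearchRequest(BaseModel):
    query: str
    filters: Optional[dict] = None
    limit: int = 10

@app.post("/api/search")
def search(request: SearchRequest):
    # A full implementation would forward filters/limit to the retriever's
    # search_kwargs; here they are accepted but not applied.
    result = engine.search(request.query)
    return {"answer": result["answer"], "sources": result["sources"]}
```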
Upload API

POST /api/upload
Content-Type: multipart/form-data
Response:
{
  "status": "success",
  "document_id": "doc_123",
  "chunks_created": 25,
  "processing_time": "2.3s"
}
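And a matching sketch for the upload endpoint; the temp-file handling and document ID scheme are assumptions:

```python
# Sketch only: accept a multipart upload, chunk it, and index the chunks.
import shutil
import time
import uuid
from pathlib import Path
from fastapi import FastAPI, UploadFile, File

from src.processors.document_processor import DocumentProcessor
from src.search.rag_engine import RAGEngine

app = FastAPI()
processor = DocumentProcessor()
engine = RAGEngine()

@app.post("/api/upload")
def upload(file: UploadFile = File(...)):
    start = time.time()
    # Persist the upload to a temp path so the file-type loaders can read it
    tmp_path = Path("/tmp") / file.filename
    with tmp_path.open("wb") as out:
        shutil.copyfileobj(file.file, out)

    chunks = processor.process_file(str(tmp_path))
    engine.vectorstore.add_documents(chunks)

    return {
        "status": "success",
        "document_id": f"doc_{uuid.uuid4().hex[:6]}",    # ID scheme is an assumption
        "chunks_created": len(chunks),
        "processing_time": f"{time.time() - start:.1f}s",
    }
```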
Vector Database Options

ChromaDB
Open-source, local-first
- Easy setup
- No external dependencies
- Good for development
- Built-in persistence
Pinecone
Managed vector database (see the swap-in sketch after these lists)
- High performance
- Auto-scaling
- Production-ready
- Real-time updates
Weaviate
Open-source vector database
- GraphQL API
- Multi-modal support
- Hybrid search
- Self-hosted option
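Because the retriever only depends on LangChain's common VectorStore interface, swapping backends is mostly a construction change. A sketch of pointing the engine at Pinecone instead of local ChromaDB, assuming the classic Pinecone client and an existing index (the index name and environment variables are examples; the Pinecone API has changed across client versions, so check current docs):

```python
# Sketch: use a Pinecone index in place of the local ChromaDB store.
import os
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings

pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
)

embeddings = OpenAIEmbeddings()
vectorstore = Pinecone.from_existing_index("documents", embeddings)  # index name is an example

# Drop-in replacement for the Chroma retriever used by RAGEngine
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
```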
Frontend Integration
React Search Component
import { useState } from 'react'

const SearchInterface = () => {
  const [query, setQuery] = useState('')
  const [results, setResults] = useState(null)
  const [loading, setLoading] = useState(false)

  const handleSearch = async () => {
    setLoading(true)
    try {
      const response = await fetch('/api/search', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ query })
      })
      const data = await response.json()
      setResults(data)
    } finally {
      setLoading(false)
    }
  }

  return (
    <div className="search-interface">
      <input
        value={query}
        onChange={(e) => setQuery(e.target.value)}
        placeholder="Ask a question about your documents..."
      />
      <button onClick={handleSearch} disabled={loading}>
        {loading ? 'Searching...' : 'Search'}
      </button>
      {results && <SearchResults results={results} />}
    </div>
  )
}

Build Your Search App
Get started with our production-ready document search template