# cookidooAI/CheckDB.py

import os
import random
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# --- CONFIGURATION ---
# Directory passed to Chroma as `persist_directory`; this is the database that gets inspected.
DB_DIR = "/home/pali112/db_build123d_reference"
# ---------------------

_PREVIEW_LEN = 300  # Maximum number of characters printed per sampled chunk.


def _section_labels(metadatas):
    """Return the set of unique "header_1 > header_2" labels found in *metadatas*.

    Entries that are ``None`` (records stored without metadata) are treated as
    empty dicts, so they contribute the "N/A > N/A" label instead of crashing.
    """
    labels = set()
    for meta in metadatas:
        meta = meta or {}
        h1 = meta.get('header_1', 'N/A')
        h2 = meta.get('header_2', 'N/A')
        labels.add(f"{h1} > {h2}")
    return labels


def _preview(text, limit=_PREVIEW_LEN):
    """Return at most *limit* characters of *text*, adding "..." only if it was cut.

    A missing document (``None``) is rendered as an empty string.
    """
    text = text or ""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def inspect_database():
    """Print a human-readable summary of the Chroma database stored in ``DB_DIR``.

    The report contains the total number of stored chunks, the sorted list of
    unique "header_1 > header_2" metadata pairs, and up to three randomly
    chosen chunks (id, metadata and a text preview).

    Returns:
        None. All output goes to stdout. The function returns early (after
        printing a message) when ``DB_DIR`` is not a directory or when the
        collection holds no records.
    """
    # isdir (not exists): a regular file at this path is not a usable database.
    if not os.path.isdir(DB_DIR):
        print(f"❌ Błąd: Nie znaleziono katalogu bazy: {DB_DIR}")
        return

    print(f"📂 Otwieram bazę z: {DB_DIR} ...")

    # The embedding model is loaded even though only stored text is read.
    # NOTE(review): the original author describes this as a requirement of the
    # LangChain Chroma wrapper; only `db.get()` is called below, so confirm
    # whether the embedding function is actually needed here.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)

    # Fetch every record. Per the original note, db.get() returns a dict shaped
    # like {'ids': [], 'embeddings': None, 'documents': [], 'metadatas': []}.
    data = db.get()

    ids = data['ids']
    metadatas = data['metadatas']
    documents = data['documents']

    count = len(ids)

    if count == 0:
        print("⚠️  Baza jest PUSTA!")
        return

    print("\n📊 STATYSTYKI BAZY:")
    print(f"   Liczba chunków (fragmentów): {count}")

    # --- SECTION ANALYSIS ---
    # Unique header_1 > header_2 pairs let the user verify that filtering
    # kept only the intended sections.
    print("\n🔍 ZAWARTE SEKCJE (Sprawdź czy to tylko te, które chciałeś):")
    for section in sorted(_section_labels(metadatas)):
        print(f"   ✅ {section}")

    # --- CONTENT PREVIEW ---
    print("\n📝 PRZYKŁADOWE FRAGMENTY (Losowe 3):")

    # Sample three distinct indices, or fewer when the database is smaller.
    sample_indices = random.sample(range(count), min(3, count))

    for i in sample_indices:
        print("-" * 50)
        print(f"ID: {ids[i]}")
        print(f"METADATA: {metadatas[i]}")
        print(f"TREŚĆ (pierwsze {_PREVIEW_LEN} znaków):\n")
        print(_preview(documents[i]))
        print("-" * 50)

# Run the inspection only when executed as a script, not when imported.
if __name__ == "__main__":
    inspect_database()