PDF + FAISS 사용하기

실행 준비

!pip install openai
!pip install gradio
!pip install faiss-cpu sentence-transformers
!pip install pdfplumber

PDF + FAISS + KoSBERT 전체 예제 코드

import faiss
import numpy as np
import pdfplumber
from sentence_transformers import SentenceTransformer
from typing import List
import re

# 1. ✅ Extract the text of a PDF file
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of the PDF at *file_path*.

    Each page is read with ``page.extract_text()``. Pages for which that call
    returns a falsy value (``None`` or an empty string) are skipped; every
    remaining page's text is followed by a single newline.

    Args:
        file_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The page texts concatenated in page order, or ``""`` when no page
        yields any text.
    """
    with pdfplumber.open(file_path) as pdf:
        # Read the pages while the file is still open.
        page_texts = [page.extract_text() for page in pdf.pages]
    # One join instead of repeated `+=` keeps the concatenation linear.
    return "".join(text + "\n" for text in page_texts if text)

# 2. ✅ Split text into sentences (simple rule: sentence punctuation + line breaks)
def split_into_sentences(text: str) -> List[str]:
    """Split *text* into trimmed, non-empty sentences.

    A boundary is either
      * whitespace that directly follows ``.``, ``?`` or ``!``, or
      * a run of one or more newlines.

    The previous pattern kept ``\\n`` inside the lookbehind character class,
    so a newline only acted as a boundary when *more* whitespace followed it;
    a single line break (e.g. ``"abc\\ndef"``) was never split. Newlines are
    now an explicit alternative in the pattern.

    Args:
        text: The raw text to split.

    Returns:
        The sentences in their original order, each stripped of surrounding
        whitespace; pieces that are empty after stripping are dropped.
    """
    pieces = re.split(r"(?<=[.?!])\s+|\n+", text)
    stripped = (piece.strip() for piece in pieces)
    return [sentence for sentence in stripped if sentence]

# 3. ✅ Load the Korean-specific KoSBERT sentence-embedding model
model = SentenceTransformer("snunlp/KR-SBERT-V40K-klueNLI-augSTS")

# 4. ✅ PDF file path (e.g. /content/sample.pdf)
pdf_path = "/content/sample.pdf"  # <-- put the path to your own PDF here

# 5. ✅ Extract the text and split it into sentences
pdf_text = extract_text_from_pdf(pdf_path)
sentences = split_into_sentences(pdf_text)

# Stop early with a clear message when nothing was extracted (e.g. an
# image-only PDF): there is nothing to embed, and the `shape[1]` lookup
# below needs a 2-D (n_sentences, dimension) embedding matrix.
if not sentences:
    raise ValueError(f"No text could be extracted from {pdf_path!r}; nothing to index.")

# 6. ✅ Embed every sentence; the vectors are cast to float32 before being
#       handed to FAISS
sentence_embeddings = model.encode(sentences, convert_to_numpy=True).astype("float32")

# 7. ✅ Build the FAISS index (flat L2 index over all sentence vectors)
dimension = sentence_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(sentence_embeddings)

# 8. ✅ User query, embedded with the same model as the sentences
query = "이 문서에서 FAISS에 대한 설명이 있는 부분을 알려줘"
query_embedding = model.encode([query], convert_to_numpy=True).astype("float32")

# 9. ✅ Retrieve the k most similar sentences
k = 5
distances, indices = index.search(query_embedding, k)
# FAISS pads the result with label -1 when the index holds fewer than k
# vectors. Indexing `sentences[-1]` would silently return the last sentence
# instead, so negative labels are skipped.
retrieved_sentences = [sentences[i] for i in indices[0] if i >= 0]

# 10. ✅ Build the prompt that will be sent to ChatGPT
context = "\n".join(retrieved_sentences)
final_prompt = f"""다음 문맥을 참고해서 질문에 답해줘:

[문맥]
{context}

[질문]
{query}
"""

# ✅ Print the result
print("📌 ChatGPT에 보낼 최종 프롬프트:\n")
print(final_prompt)

PDF 파일 업로드 (Colab에서 사용 시)

Colab에서 직접 PDF 파일을 업로드하려면 아래 코드를 추가로 실행하세요:

from google.colab import files
# Upload a file in Colab; afterwards refer to it by the path
# /content/<filename>.pdf.
uploaded = files.upload()