kss + 청크로 문단 나누기

실행 준비

!pip install openai
!pip install gradio
!pip install faiss-cpu sentence-transformers
!pip install pdfplumber
!pip install tiktoken
!pip install kss

문장 분리 방법의 비교

항목	kss + 청크 조합	LangChain 분리기
한글 문장 유지	✅ 매우 좋음	❌ 보장 안 됨 (문장 중간 자를 수 있음)
설정 유연성	✅ 높음 (직접 구현)	✅ 간편 설정
성능/속도	빠름	약간 느릴 수 있음
권장 대상	한국어 중심 문서	로컬/다양한 구조 문서

문장 분리 방법

청크당 300토큰 기준
50토큰 겹침(sliding window)

kss + 청크 조합 방식 (직접 구현)

from typing import Callable, List, Optional

import kss
import pdfplumber
import tiktoken

# Shared tokenizer: tiktoken's "cl100k_base" encoding (noted by the author as
# the GPT-3.5/4 tokenizer). Token counts in this file are measured with it.
tokenizer = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string: str) -> int:
    """Return how many tokens the module-level ``tokenizer`` yields for *string*."""
    token_ids = tokenizer.encode(string)
    return len(token_ids)

# Extract the text of a PDF file.
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page in the PDF at *file_path*.

    Pages for which ``extract_text()`` returns a falsy value (``None`` or an
    empty string) are skipped. Each remaining page's text is followed by a
    newline, so the result is ``""`` when no page yields any text.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Joined inside the ``with`` block so pages are read while the file is open.
        return "".join(f"{body}\n" for body in page_texts if body)

# Split text into sentences (kss).
def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences by delegating to ``kss.split_sentences``."""
    sentences = kss.split_sentences(text)
    return sentences

# Build chunks (300-token budget, 50-token sentence-level overlap by default).
def chunk_sentences_by_token(
    sentences: List[str],
    chunk_size: int = 300,
    overlap: int = 50,
    token_counter: Optional[Callable[[str], int]] = None,
) -> List[str]:
    """Group sentences into space-joined chunks bounded by a token budget.

    Sentences are never split. They are appended to the current chunk until
    the next sentence would push the total above ``chunk_size``; the chunk is
    then emitted and the next one is seeded with trailing sentences of the
    emitted chunk (a sliding window).

    Args:
        sentences: Sentences in document order.
        chunk_size: Maximum number of tokens per chunk. A single sentence
            that is longer than this is emitted as a chunk of its own
            instead of being dropped or split.
        overlap: Minimum number of tokens to carry over from the end of one
            chunk into the next. Whole sentences are carried, so the actual
            overlap may be larger. The carry-over is shortened (possibly to
            nothing) when it would not leave room for the next sentence.
            Values <= 0 disable overlap.
        token_counter: Optional callable returning the token count of a
            string. Defaults to ``num_tokens_from_string``.

    Returns:
        The chunks, each a string of sentences joined by single spaces.
        Empty input yields an empty list.
    """
    count_tokens = num_tokens_from_string if token_counter is None else token_counter

    chunks: List[str] = []
    window = []  # (sentence, token_count) pairs of the chunk being built
    window_tokens = 0

    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)

        # Only flush a non-empty window: flushing an empty one would emit an
        # empty chunk and make no progress on an oversized sentence.
        if window and window_tokens + sentence_tokens > chunk_size:
            chunks.append(" ".join(text for text, _ in window))

            # Walk backwards collecting whole sentences until at least
            # `overlap` tokens have been gathered.
            carried = []
            carried_tokens = 0
            for pair in reversed(window):
                if carried_tokens >= overlap:
                    break
                carried.append(pair)
                carried_tokens += pair[1]
            carried.reverse()

            # Drop the oldest carried sentences until the incoming sentence
            # fits. Without this the same chunk would be flushed again and
            # again whenever carry-over + sentence exceeds the budget.
            room = chunk_size - sentence_tokens
            drop = 0
            while drop < len(carried) and carried_tokens > room:
                carried_tokens -= carried[drop][1]
                drop += 1

            window = carried[drop:]
            window_tokens = carried_tokens

        # Every iteration consumes its sentence, so the loop always terminates.
        window.append((sentence, sentence_tokens))
        window_tokens += sentence_tokens

    # Emit whatever is left as the final chunk.
    if window:
        chunks.append(" ".join(text for text, _ in window))
    return chunks

# Example run: extract the PDF text, split it into sentences, then chunk it.
pdf_path = "/content/sample.pdf"  # path to your own PDF file
text = extract_text_from_pdf(pdf_path)
sentences = split_sentences(text)
chunks = chunk_sentences_by_token(sentences)

# Report the result. The first chunk is only shown when one exists: a PDF
# with no extractable text produces no chunks, and chunks[0] would raise
# IndexError.
print(f"총 {len(chunks)}개의 청크 생성됨 (각 청크는 약 300토큰)")
if chunks:
    print("\n🔹 첫 번째 청크 예시:\n", chunks[0])
else:
    print("\n⚠️ 추출된 텍스트가 없어 첫 번째 청크를 표시할 수 없습니다.")