1. 개요
- RAG system을 구축하려고 할 때 PDF 문서의 형태가 아닐 수 있다.
- 예를 들어 csv, xlsx 형태일 수 있는데, 이를 PDF documents로 만드는 방법을 정리한다.
2. 필수패키지
!pip install reportlab fpdf2
3. 코드
- 한글이기 때문에 encoding에 주의해야 하며, PDF에는 한글 글리프를 지원하는 TTF 폰트를 등록해서 사용해야 함
import pandas as pd
from langchain.schema import Document
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from reportlab.lib import fonts
import os
# Load the dataset. The workbook is expected to provide the columns
# 'define', 'disease_name', 'animal' and 'cause', which are read below.
# file_path = 'data/disease.xlsx'
df = pd.read_excel("disease.xlsx")

# Clean the dataset: drop the escaped carriage-return marker (and the newline
# that follows it) from every text column.
# regex=False pins literal matching, because the default of `regex` differs
# between pandas versions. The is_string_dtype check additionally covers
# pandas' dedicated string dtypes, which do not compare equal to "object".
df = df.apply(
    lambda x: x.str.replace('_x000d_\n', '', regex=False)
    if x.dtype == "object" or pd.api.types.is_string_dtype(x)
    else x
)

# Create a list of Document objects, one per spreadsheet row.
# An empty 'define' cell is read as a missing value (not a string), so it is
# mapped to '' to keep page_content a string in every document.
documents = [
    Document(
        page_content='' if pd.isna(row['define']) else str(row['define']),
        metadata={
            'disease_name': row['disease_name'],
            'animal': row['animal'],
            'cause': row['cause'],
        },
    )
    for _, row in df.iterrows()
]
# Register a TrueType font that contains Hangul glyphs; the text drawn below
# is Korean, so the font must be able to render it.
font_name = 'MalgunGothic'
font_size = 12
pdfmetrics.registerFont(TTFont(font_name, 'C:/Windows/Fonts/malgun.ttf'))  # Update path based on your environment

# Create a PDF document
pdf_output_path = 'disease_documents.pdf'
c = canvas.Canvas(pdf_output_path, pagesize=A4)

# Page geometry (all values in points)
width, height = A4
margin = 40
line_height = 20
max_text_width = width - 2 * margin
y_position = height - margin

# Set the font
c.setFont(font_name, font_size)


def _wrap_to_width(text, max_width, face, size):
    """Split *text* into consecutive pieces that each fit in *max_width* points.

    Breaks are made between characters, so words may be split; this keeps
    text without spaces from running past the right margin. An empty
    string yields a single empty piece so blank lines are preserved.
    """
    pieces = []
    current = ""
    current_width = 0.0
    for ch in text:
        ch_width = pdfmetrics.stringWidth(ch, face, size)
        # Start a new piece when this character would overflow the line;
        # a piece always holds at least one character.
        if current and current_width + ch_width > max_width:
            pieces.append(current)
            current = ""
            current_width = 0.0
        current += ch
        current_width += ch_width
    pieces.append(current)
    return pieces


# Iterate over the documents and add them to the PDF
for document in documents:
    # Metadata header lines first, then the content, one entry per text line.
    entry_lines = []
    for text in (
        f"Disease Name: {document.metadata['disease_name']}",
        f"Animal: {document.metadata['animal']}",
        f"Cause: {document.metadata['cause']}",
        document.page_content,
    ):
        entry_lines.extend(text.splitlines())

    for entry_line in entry_lines:
        for piece in _wrap_to_width(entry_line, max_text_width, font_name, font_size):
            # Check for a page break BEFORE drawing, so that neither the
            # header lines nor the content are placed below the bottom margin.
            if y_position < margin:
                c.showPage()
                y_position = height - margin
                # showPage resets the canvas font, so select it again.
                c.setFont(font_name, font_size)
            c.drawString(margin, y_position, piece)
            y_position -= line_height

    # Add a line break between documents; a resulting overflow is handled by
    # the page-break check before the next line is drawn.
    y_position -= line_height

# Save the PDF
c.save()
print(f"PDF saved to {pdf_output_path}")
4. 예제 파일