Initial commit of document-service
This commit is contained in:
0
app/__init__.py
Normal file
0
app/__init__.py
Normal file
31
app/config.py
Normal file
31
app/config.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration for the document service.

    Values are read from environment variables (and a local .env file via the
    inner Config), falling back to the defaults below, which target a local
    MinIO instance.
    """

    # S3 settings (defaults match a local MinIO dev setup)
    s3_endpoint: str = "http://localhost:9000"
    s3_access_key: str = "minioadmin"
    s3_secret_key: str = "minioadmin"
    s3_bucket: str = "document-bucket"
    s3_region: str = "us-east-1"

    # Service settings
    host: str = "0.0.0.0"
    port: int = 8082

    # File size limits (bytes), one per supported document type
    max_file_size_pdf: int = 50 * 1024 * 1024  # 50MB
    max_file_size_docx: int = 25 * 1024 * 1024  # 25MB
    max_file_size_xlsx: int = 25 * 1024 * 1024  # 25MB
    max_file_size_jpg: int = 10 * 1024 * 1024  # 10MB
    max_file_size_jpeg: int = 10 * 1024 * 1024  # 10MB
    max_file_size_png: int = 10 * 1024 * 1024  # 10MB
    max_file_size_gif: int = 10 * 1024 * 1024  # 10MB
    max_file_size_default: int = 10 * 1024 * 1024  # 10MB

    # Logging
    log_level: str = "INFO"

    # NOTE(review): pydantic-settings v2 prefers
    # model_config = SettingsConfigDict(env_file=".env"); the inner Config
    # class still works but is the legacy form.
    class Config:
        env_file = ".env"

# Module-level singleton imported throughout the app (instantiated at import
# time, so env/.env values are read once on startup).
settings = Settings()
|
||||
38
app/enums.py
Normal file
38
app/enums.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from enum import Enum
|
||||
|
||||
class DocumentType(str, Enum):
    """Supported document types, with values equal to canonical extensions."""

    PDF = "pdf"
    DOCX = "docx"
    XLSX = "xlsx"
    JPG = "jpg"
    JPEG = "jpeg"
    PNG = "png"
    GIF = "gif"

    @classmethod
    def from_mime_type(cls, mime_type: str) -> "DocumentType | None":
        """Map a MIME type (case-insensitive) to a DocumentType.

        Returns:
            The matching DocumentType, or None when the MIME type is not
            supported. (Annotation fixed: the dict lookup falls through to
            None rather than always returning a DocumentType.)
        """
        mapping = {
            "application/pdf": cls.PDF,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": cls.DOCX,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": cls.XLSX,
            # image/jpeg covers both .jpg and .jpeg uploads
            "image/jpeg": cls.JPG,
            "image/png": cls.PNG,
            "image/gif": cls.GIF,
        }
        return mapping.get(mime_type.lower())

    @classmethod
    def from_extension(cls, filename: str) -> "DocumentType | None":
        """Map a filename's extension (case-insensitive) to a DocumentType.

        Returns:
            The matching DocumentType, or None for unsupported extensions
            (including filenames with no dot, whose whole name is treated
            as the extension and won't match).
        """
        ext = filename.split(".")[-1].lower()
        mapping = {
            "pdf": cls.PDF,
            "docx": cls.DOCX,
            "xlsx": cls.XLSX,
            "jpg": cls.JPG,
            "jpeg": cls.JPEG,
            "png": cls.PNG,
            "gif": cls.GIF,
        }
        return mapping.get(ext)
|
||||
13
app/logger.py
Normal file
13
app/logger.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import logging
|
||||
from app.config import settings
|
||||
|
||||
def setup_logging():
    """Configure the root logger once at service startup.

    The level name comes from settings.log_level (e.g. "INFO"); getattr on
    the logging module will raise AttributeError if the configured name is
    not a valid logging level.
    """
    logging.basicConfig(
        level=getattr(logging, settings.log_level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """Return the logger registered under *name* in the logging hierarchy."""
    named_logger = logging.getLogger(name)
    return named_logger
|
||||
82
app/main.py
Normal file
82
app/main.py
Normal file
@@ -0,0 +1,82 @@
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.openapi.utils import get_openapi
|
||||
from app.routers import documents
|
||||
from app.config import settings
|
||||
from app.logger import setup_logging
|
||||
from app.middleware.auth import AuthMiddleware
|
||||
|
||||
# Setup logging before anything else emits log records
setup_logging()

# FastAPI application; interactive docs at /docs, schema served at /openapi3.json
app = FastAPI(
    title="Document Service",
    version="1.0.0",
    description="Generic document management service with S3 storage and PDF field discovery",
    openapi_url="/openapi3.json",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Add auth middleware (sets request.state.org_id for the routers)
app.add_middleware(AuthMiddleware)

# CORS: only the local frontend dev origin is allowed
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],
    allow_methods=["*"],
    allow_headers=["*"]
)

# Mount the document CRUD endpoints (prefixed /api/documents in the router)
app.include_router(documents.router)
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event():
|
||||
"""Run startup tasks.
|
||||
|
||||
Raises:
|
||||
Exception: If S3 bucket initialization fails (service will fail to start)
|
||||
"""
|
||||
from app import s3
|
||||
from app.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
logger.info("Starting up document service...")
|
||||
|
||||
try:
|
||||
s3.ensure_bucket_exists()
|
||||
logger.info("S3 bucket initialization complete")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize S3 bucket: {e}")
|
||||
# Re-raise to fail startup
|
||||
raise
|
||||
|
||||
@app.get("/health", tags=["health"])
|
||||
def health():
|
||||
return {"status": "ok"}
|
||||
|
||||
@app.get("/health/ready", tags=["health"])
|
||||
def health_ready():
|
||||
"""Health check for Kubernetes readiness probes."""
|
||||
return {"status": "ready"}
|
||||
|
||||
def custom_openapi():
    """Build (and memoize) the OpenAPI schema with an explicit servers list."""
    # Serve the cached schema after the first build
    if app.openapi_schema:
        return app.openapi_schema

    schema = get_openapi(
        title="Document Service",
        version="1.0.0",
        openapi_version="3.1.0",
        description="Generic document management service with S3 storage and PDF field discovery",
        routes=app.routes
    )

    # Advertise the local dev server URL in the generated schema
    schema["servers"] = [
        {"url": "http://localhost:8082", "description": "Local dev"}
    ]

    app.openapi_schema = schema
    return app.openapi_schema

# Replace FastAPI's default schema generator with the customized one
app.openapi = custom_openapi
|
||||
0
app/middleware/__init__.py
Normal file
0
app/middleware/__init__.py
Normal file
16
app/middleware/auth.py
Normal file
16
app/middleware/auth.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from fastapi import Request
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
from starlette.responses import JSONResponse
|
||||
from app.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class AuthMiddleware(BaseHTTPMiddleware):
    """Attach the caller's organization id to request.state for the routers.

    NOTE(review): this is a stub — org_id is hard-coded to "test". Replace
    with real credential validation before production use.
    """

    async def dispatch(self, request: Request, call_next):
        # Skip auth for all health endpoints. Fixed: the original exempted
        # only the exact path "/health", leaving /health/ready (the k8s
        # readiness probe) to pass through the auth path.
        if request.url.path.startswith("/health"):
            return await call_next(request)
        # Stub identity: every authenticated request belongs to org "test".
        request.state.org_id = "test"
        response = await call_next(request)
        return response
|
||||
|
||||
30
app/models.py
Normal file
30
app/models.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from pydantic import BaseModel, Field
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from app.enums import DocumentType
|
||||
|
||||
class DocumentMetadata(BaseModel):
    """Metadata describing a stored document (mirrors the S3 object metadata)."""

    document_id: str = Field(..., description="UUID of the document")
    org_id: str = Field(..., description="Organization ID")
    document_type: DocumentType = Field(..., description="Type of document")
    filename: str = Field(..., description="Original filename")
    content_type: str = Field(..., description="MIME type")
    file_size: int = Field(..., description="File size in bytes")
    s3_key: str = Field(..., description="S3 key for the document")
    # NOTE(review): datetime.utcnow produces naive timestamps and is
    # deprecated in Python 3.12; switching to datetime.now(timezone.utc)
    # would change serialized values — coordinate with the routers first.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
|
||||
|
||||
class UploadResponse(BaseModel):
    """Response body for the upload and rewrite endpoints."""

    document_id: str  # UUID of the stored document
    metadata: DocumentMetadata  # full metadata snapshot
    download_url: str  # presigned S3 GET URL
|
||||
|
||||
class DownloadUrlResponse(BaseModel):
    """Response body for the download-url endpoint."""

    download_url: str  # presigned S3 GET URL
    s3_key: str  # key the URL points at
    expires_in: int  # URL lifetime in seconds
|
||||
|
||||
class FieldsResponse(BaseModel):
    """Response body for PDF form-field discovery."""

    document_id: str  # UUID of the inspected document
    document_type: DocumentType  # always PDF for this endpoint
    fields: list[dict]  # entries with field/label/type/required/options keys
|
||||
105
app/pdf.py
Normal file
105
app/pdf.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import os
|
||||
from pypdf import PdfReader
|
||||
from typing import Any
|
||||
|
||||
def discover_fields(pdf_path: str) -> list[dict]:
    """Introspect a PDF and return all fillable AcroForm fields.

    Tries three strategies in order — pypdf's get_fields(), the document's
    /AcroForm dictionary, and per-page /Widget annotations — so any form of
    AcroForm structure is handled.

    Args:
        pdf_path: filesystem path to the PDF to inspect.

    Returns:
        A list of dicts with keys field, label, type, required, options;
        empty list when no form fields are discoverable.
    """
    # Consistency fix: report failures through logging (like the rest of the
    # service) instead of print(). Imported locally to avoid touching the
    # module's import block.
    import logging
    log = logging.getLogger(__name__)

    reader = PdfReader(pdf_path)

    fields = None

    # Method 1: pypdf's high-level accessor
    try:
        fields = reader.get_fields()
    except Exception as e:
        log.warning(f"get_fields() failed: {e}")
        fields = None

    # Method 2: read /AcroForm /Fields from the document catalog directly
    if not fields:
        try:
            if "/AcroForm" in reader.trailer["/Root"]:
                acroform = reader.trailer["/Root"]["/AcroForm"]
                if "/Fields" in acroform:
                    fields = {}
                    field_array = acroform["/Fields"]
                    for field_ref in field_array:
                        try:
                            field_obj = field_ref.get_object()
                            field_name = field_obj.get("/T", "")
                            if field_name:
                                fields[field_name] = field_obj
                        except Exception as e:
                            log.warning(f"Error processing field: {e}")
                            continue
        except Exception as e:
            log.warning(f"Direct AcroForm access failed: {e}")
            fields = None

    # Method 3: scan page annotations for /Widget entries
    if not fields:
        try:
            fields = {}
            for page in reader.pages:
                if "/Annots" in page:
                    for annot in page["/Annots"]:
                        try:
                            annot_obj = annot.get_object()
                            if "/Subtype" in annot_obj and annot_obj["/Subtype"] == "/Widget":
                                field_name = annot_obj.get("/T", "")
                                if field_name and field_name not in fields:
                                    fields[field_name] = annot_obj
                        except Exception as e:
                            log.warning(f"Error processing annotation: {e}")
                            continue
        except Exception as e:
            log.warning(f"Page annotation access failed: {e}")
            fields = None

    if not fields:
        return []

    # Normalize each raw field object into the service's field schema
    result = []
    for field_name, field_obj in fields.items():
        try:
            field_type = field_obj.get("/FT", "")
            options = []

            # /Ch = choice field (select/dropdown); /Opt entries are either
            # plain strings or [export, display] pairs — keep the display one
            if field_type == "/Ch":
                opt = field_obj.get("/Opt", [])
                if opt:
                    options = [o if isinstance(o, str) else o[1] for o in opt]

            result.append({
                "field": field_name,
                "label": field_name.replace("_", " ").title(),
                "type": _map_field_type(field_type, field_obj),
                "required": False,
                "options": options if options else None
            })
        except Exception as e:
            log.warning(f"Error processing field {field_name}: {e}")
            continue

    return result
|
||||
|
||||
def _map_field_type(ft: str, field_obj: dict) -> str:
|
||||
mapping = {
|
||||
"/Tx": "string",
|
||||
"/Btn": "boolean",
|
||||
"/Ch": "select",
|
||||
"/Sig": "string"
|
||||
}
|
||||
base = mapping.get(ft, "string")
|
||||
|
||||
# Check if it's a date field by name hint
|
||||
field_name = field_obj.get("/T", "").lower()
|
||||
if any(hint in field_name for hint in ["date", "fecha", "birth", "nacimiento"]):
|
||||
return "date"
|
||||
|
||||
return base
|
||||
1
app/routers/__init__.py
Normal file
1
app/routers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from app.routers import documents
|
||||
355
app/routers/documents.py
Normal file
355
app/routers/documents.py
Normal file
@@ -0,0 +1,355 @@
|
||||
import os
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Request
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from app import s3, pdf, utils
|
||||
from app.config import settings
|
||||
from app.enums import DocumentType
|
||||
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
|
||||
from app.logger import get_logger
|
||||
|
||||
router = APIRouter(prefix="/api/documents", tags=["documents"])
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@router.post("/upload", response_model=UploadResponse)
|
||||
async def upload_document(
|
||||
request: Request,
|
||||
file: UploadFile = File(...)
|
||||
):
|
||||
"""Upload a new document"""
|
||||
org_id = request.state.org_id
|
||||
user_id = getattr(request.state, "user_id", "system")
|
||||
logger.info(f"Upload request - org_id: {org_id}, user_id: {user_id}, filename: {file.filename}")
|
||||
|
||||
# Detect content type
|
||||
detected_content_type = utils.detect_content_type(file)
|
||||
logger.info(f"Detected content type: {detected_content_type}")
|
||||
|
||||
# Detect document type
|
||||
document_type = utils.detect_document_type(file.filename, detected_content_type)
|
||||
if not document_type:
|
||||
logger.error(f"Unsupported document type: {file.filename}")
|
||||
raise HTTPException(status_code=415, detail="Unsupported document type")
|
||||
|
||||
# Get file size
|
||||
file.file.seek(0, os.SEEK_END)
|
||||
file_size = file.file.tell()
|
||||
file.file.seek(0)
|
||||
|
||||
# Validate file size
|
||||
utils.validate_file_size(file_size, document_type)
|
||||
|
||||
# Generate document ID and S3 key
|
||||
document_id = utils.generate_document_id()
|
||||
sanitized_filename = utils.sanitize_filename(file.filename)
|
||||
s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)
|
||||
|
||||
# Prepare metadata
|
||||
metadata_dict = {
|
||||
"org_id": org_id,
|
||||
"document_type": document_type.value,
|
||||
"filename": file.filename,
|
||||
"file_size": str(file_size),
|
||||
"created_at": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
# Upload to S3
|
||||
try:
|
||||
s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
|
||||
logger.info(f"File uploaded successfully: {s3_key}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to upload file: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to upload file: {e}")
|
||||
|
||||
# Generate download URL
|
||||
download_url = s3.presigned_download_url(s3_key)
|
||||
|
||||
# Create metadata response
|
||||
metadata = DocumentMetadata(
|
||||
document_id=document_id,
|
||||
org_id=org_id,
|
||||
document_type=document_type,
|
||||
filename=file.filename,
|
||||
content_type=detected_content_type,
|
||||
file_size=file_size,
|
||||
s3_key=s3_key,
|
||||
created_at=datetime.utcnow(),
|
||||
updated_at=datetime.utcnow()
|
||||
)
|
||||
|
||||
logger.info(f"Upload completed - document_id: {document_id}")
|
||||
return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
||||
|
||||
@router.put("/{document_id}", response_model=UploadResponse)
|
||||
async def rewrite_document(
|
||||
request: Request,
|
||||
document_id: str,
|
||||
file: UploadFile = File(...)
|
||||
):
|
||||
"""Rewrite/replace an existing document"""
|
||||
org_id = request.state.org_id
|
||||
user_id = getattr(request.state, "user_id", "system")
|
||||
logger.info(f"Rewrite request - document_id: {document_id}, org_id: {org_id}, user_id: {user_id}")
|
||||
|
||||
# Detect content type
|
||||
detected_content_type = utils.detect_content_type(file)
|
||||
|
||||
# Detect document type
|
||||
document_type = utils.detect_document_type(file.filename, detected_content_type)
|
||||
if not document_type:
|
||||
raise HTTPException(status_code=415, detail="Unsupported document type")
|
||||
|
||||
# Get file size
|
||||
file.file.seek(0, os.SEEK_END)
|
||||
file_size = file.file.tell()
|
||||
file.file.seek(0)
|
||||
|
||||
# Validate file size
|
||||
utils.validate_file_size(file_size, document_type)
|
||||
|
||||
# Generate S3 key
|
||||
sanitized_filename = utils.sanitize_filename(file.filename)
|
||||
s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)
|
||||
|
||||
# Check if document exists
|
||||
if not s3.file_exists(s3_key):
|
||||
logger.error(f"Document not found: {document_id}")
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
# Verify org_id matches
|
||||
existing_metadata = s3.get_file_metadata(s3_key)
|
||||
if existing_metadata.get("org_id") != org_id:
|
||||
logger.error(f"Organization mismatch for document: {document_id}")
|
||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||
|
||||
# Prepare metadata
|
||||
metadata_dict = {
|
||||
"org_id": org_id,
|
||||
"document_type": document_type.value,
|
||||
"filename": file.filename,
|
||||
"file_size": str(file_size),
|
||||
"updated_at": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
# Upload to S3 (overwrites existing)
|
||||
try:
|
||||
s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
|
||||
logger.info(f"File rewritten successfully: {s3_key}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to rewrite file: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to rewrite file: {e}")
|
||||
|
||||
# Generate download URL
|
||||
download_url = s3.presigned_download_url(s3_key)
|
||||
|
||||
# Create metadata response
|
||||
metadata = DocumentMetadata(
|
||||
document_id=document_id,
|
||||
org_id=org_id,
|
||||
document_type=document_type,
|
||||
filename=file.filename,
|
||||
content_type=detected_content_type,
|
||||
file_size=file_size,
|
||||
s3_key=s3_key,
|
||||
created_at=datetime.fromisoformat(existing_metadata.get("created_at", datetime.utcnow().isoformat())),
|
||||
updated_at=datetime.utcnow()
|
||||
)
|
||||
|
||||
logger.info(f"Rewrite completed - document_id: {document_id}")
|
||||
return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
||||
|
||||
@router.get("/{document_id}", response_model=DocumentMetadata)
|
||||
async def get_document(request: Request, document_id: str):
|
||||
"""Get document metadata"""
|
||||
org_id = request.state.org_id
|
||||
logger.info(f"Get document request - document_id: {document_id}, org_id: {org_id}")
|
||||
|
||||
# List objects to find the document
|
||||
client = s3.get_client()
|
||||
prefix = utils.s3_path_prefix(org_id, document_id)
|
||||
|
||||
try:
|
||||
response = client.list_objects_v2(
|
||||
Bucket=settings.s3_bucket,
|
||||
Prefix=prefix,
|
||||
MaxKeys=1
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list objects: {e}")
|
||||
raise HTTPException(status_code=500, detail="Failed to retrieve document")
|
||||
|
||||
if not response.get("Contents"):
|
||||
logger.error(f"Document not found: {document_id}")
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
s3_key = response["Contents"][0]["Key"]
|
||||
|
||||
# Get metadata from S3
|
||||
s3_metadata = s3.get_file_metadata(s3_key)
|
||||
|
||||
# Verify org_id matches
|
||||
if s3_metadata.get("org_id") != org_id:
|
||||
logger.error(f"Organization mismatch for document: {document_id}")
|
||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||
|
||||
# Get object info
|
||||
try:
|
||||
object_info = client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get object info: {e}")
|
||||
raise HTTPException(status_code=500, detail="Failed to retrieve document")
|
||||
|
||||
# Create metadata response
|
||||
metadata = DocumentMetadata(
|
||||
document_id=document_id,
|
||||
org_id=s3_metadata.get("org_id"),
|
||||
document_type=DocumentType(s3_metadata.get("document_type")),
|
||||
filename=s3_metadata.get("filename"),
|
||||
content_type=object_info.get("ContentType"),
|
||||
file_size=int(s3_metadata.get("file_size", object_info.get("ContentLength", 0))),
|
||||
s3_key=s3_key,
|
||||
created_at=datetime.fromisoformat(s3_metadata.get("created_at", datetime.utcnow().isoformat())),
|
||||
updated_at=datetime.fromisoformat(s3_metadata.get("updated_at", datetime.utcnow().isoformat()))
|
||||
)
|
||||
|
||||
logger.info(f"Get document completed - document_id: {document_id}")
|
||||
return metadata
|
||||
|
||||
@router.get("/{document_id}/download-url", response_model=DownloadUrlResponse)
|
||||
async def get_download_url(request: Request, document_id: str, expires_in: int = 3600):
|
||||
"""Get presigned download URL"""
|
||||
org_id = request.state.org_id
|
||||
logger.info(f"Download URL request - document_id: {document_id}, org_id: {org_id}")
|
||||
|
||||
# List objects to find the document
|
||||
client = s3.get_client()
|
||||
prefix = utils.s3_path_prefix(org_id, document_id)
|
||||
|
||||
try:
|
||||
response = client.list_objects_v2(
|
||||
Bucket=settings.s3_bucket,
|
||||
Prefix=prefix,
|
||||
MaxKeys=1
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list objects: {e}")
|
||||
raise HTTPException(status_code=500, detail="Failed to retrieve document")
|
||||
|
||||
if not response.get("Contents"):
|
||||
logger.error(f"Document not found: {document_id}")
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
s3_key = response["Contents"][0]["Key"]
|
||||
|
||||
# Verify org_id matches
|
||||
s3_metadata = s3.get_file_metadata(s3_key)
|
||||
if s3_metadata.get("org_id") != org_id:
|
||||
logger.error(f"Organization mismatch for document: {document_id}")
|
||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||
|
||||
# Generate download URL
|
||||
download_url = s3.presigned_download_url(s3_key, expires_in)
|
||||
|
||||
logger.info(f"Download URL generated - document_id: {document_id}")
|
||||
return DownloadUrlResponse(download_url=download_url, s3_key=s3_key, expires_in=expires_in)
|
||||
|
||||
@router.get("/{document_id}/fields", response_model=FieldsResponse)
|
||||
async def get_document_fields(request: Request, document_id: str):
|
||||
"""Get PDF form fields (PDF only)"""
|
||||
org_id = request.state.org_id
|
||||
logger.info(f"Fields request - document_id: {document_id}, org_id: {org_id}")
|
||||
|
||||
# List objects to find the document
|
||||
client = s3.get_client()
|
||||
prefix = utils.s3_path_prefix(org_id, document_id)
|
||||
|
||||
try:
|
||||
response = client.list_objects_v2(
|
||||
Bucket=settings.s3_bucket,
|
||||
Prefix=prefix,
|
||||
MaxKeys=1
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list objects: {e}")
|
||||
raise HTTPException(status_code=500, detail="Failed to retrieve document")
|
||||
|
||||
if not response.get("Contents"):
|
||||
logger.error(f"Document not found: {document_id}")
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
s3_key = response["Contents"][0]["Key"]
|
||||
|
||||
# Get metadata
|
||||
s3_metadata = s3.get_file_metadata(s3_key)
|
||||
|
||||
# Verify org_id matches
|
||||
if s3_metadata.get("org_id") != org_id:
|
||||
logger.error(f"Organization mismatch for document: {document_id}")
|
||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||
|
||||
# Check if PDF
|
||||
document_type = s3_metadata.get("document_type")
|
||||
if document_type != DocumentType.PDF.value:
|
||||
logger.error(f"Document is not PDF: {document_type}")
|
||||
raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")
|
||||
|
||||
# Download and discover fields
|
||||
try:
|
||||
pdf_path = s3.download_to_temp(s3_key)
|
||||
fields = pdf.discover_fields(pdf_path)
|
||||
logger.info(f"Fields discovered - document_id: {document_id}, count: {len(fields)}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to discover fields: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to discover fields: {e}")
|
||||
finally:
|
||||
if os.path.exists(pdf_path):
|
||||
os.unlink(pdf_path)
|
||||
|
||||
return FieldsResponse(
|
||||
document_id=document_id,
|
||||
document_type=DocumentType.PDF,
|
||||
fields=fields
|
||||
)
|
||||
|
||||
@router.delete("/{document_id}")
|
||||
async def delete_document(request: Request, document_id: str):
|
||||
"""Delete document"""
|
||||
org_id = request.state.org_id
|
||||
logger.info(f"Delete request - document_id: {document_id}, org_id: {org_id}")
|
||||
|
||||
# List objects to find the document
|
||||
client = s3.get_client()
|
||||
prefix = utils.s3_path_prefix(org_id, document_id)
|
||||
|
||||
try:
|
||||
response = client.list_objects_v2(
|
||||
Bucket=settings.s3_bucket,
|
||||
Prefix=prefix,
|
||||
MaxKeys=1
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list objects: {e}")
|
||||
raise HTTPException(status_code=500, detail="Failed to retrieve document")
|
||||
|
||||
if not response.get("Contents"):
|
||||
logger.error(f"Document not found: {document_id}")
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
s3_key = response["Contents"][0]["Key"]
|
||||
|
||||
# Verify org_id matches
|
||||
s3_metadata = s3.get_file_metadata(s3_key)
|
||||
if s3_metadata.get("org_id") != org_id:
|
||||
logger.error(f"Organization mismatch for document: {document_id}")
|
||||
raise HTTPException(status_code=403, detail="Organization mismatch")
|
||||
|
||||
# Delete from S3
|
||||
try:
|
||||
s3.delete_file(s3_key)
|
||||
logger.info(f"Document deleted - document_id: {document_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete document: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}")
|
||||
|
||||
return {"message": "Document deleted successfully"}
|
||||
101
app/s3.py
Normal file
101
app/s3.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import boto3
|
||||
import tempfile
|
||||
import os
|
||||
from botocore.client import Config
|
||||
from fastapi import UploadFile
|
||||
from app.config import settings
|
||||
from app.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
def get_client():
    """Build a boto3 S3 client from app settings (MinIO-compatible endpoint)."""
    client_kwargs = {
        "endpoint_url": settings.s3_endpoint,
        "aws_access_key_id": settings.s3_access_key,
        "aws_secret_access_key": settings.s3_secret_key,
        "region_name": settings.s3_region,
        # SigV4 signing, required for the presigned URLs generated below
        "config": Config(signature_version="s3v4"),
    }
    return boto3.client("s3", **client_kwargs)
|
||||
|
||||
def ensure_bucket_exists() -> None:
    """Ensure the S3 bucket exists, create it if it doesn't exist.

    Raises:
        Exception: If bucket creation fails (service will fail to start)
    """
    client = get_client()
    try:
        client.head_bucket(Bucket=settings.s3_bucket)
        logger.info(f"Bucket '{settings.s3_bucket}' already exists")
    except client.exceptions.ClientError as e:
        # head_bucket returns 404 when the bucket is missing (403 would mean
        # it exists but belongs to someone else).
        error_code = e.response['Error']['Code']
        if error_code == '404':
            try:
                # Bug fix: AWS rejects CreateBucketConfiguration with
                # LocationConstraint "us-east-1" (the configured default);
                # the constraint must be omitted for that region.
                create_kwargs = {"Bucket": settings.s3_bucket}
                if settings.s3_region != "us-east-1":
                    create_kwargs["CreateBucketConfiguration"] = {
                        'LocationConstraint': settings.s3_region
                    }
                client.create_bucket(**create_kwargs)
                logger.info(f"Created bucket '{settings.s3_bucket}'")
            except Exception as create_error:
                logger.error(f"Failed to create bucket '{settings.s3_bucket}': {create_error}")
                raise
        else:
            logger.error(f"Error checking bucket: {e}")
            raise
|
||||
|
||||
def upload_file(file: UploadFile, s3_key: str, content_type: str, metadata: dict = None) -> str:
    """Stream an uploaded file into S3 and return the key it was stored under.

    Args:
        file: incoming upload whose underlying file object is streamed.
        s3_key: destination object key.
        content_type: MIME type recorded as the object's ContentType.
        metadata: optional user metadata stored with the object.
    """
    put_args = {"ContentType": content_type}
    if metadata:
        put_args["Metadata"] = metadata

    s3_client = get_client()
    s3_client.upload_fileobj(file.file, settings.s3_bucket, s3_key, ExtraArgs=put_args)
    return s3_key
|
||||
|
||||
def delete_file(s3_key: str) -> None:
    """Remove the object stored at *s3_key* from the configured bucket."""
    get_client().delete_object(Bucket=settings.s3_bucket, Key=s3_key)
|
||||
|
||||
def file_exists(s3_key: str) -> bool:
    """Return True if an object exists at *s3_key* (via a HEAD request)."""
    s3_client = get_client()
    try:
        s3_client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except s3_client.exceptions.ClientError:
        # HEAD failed — treat any client error as "not found"
        return False
    return True
|
||||
|
||||
def get_file_metadata(s3_key: str) -> dict:
    """Return the user-defined metadata dict stored with the object at *s3_key*."""
    head = get_client().head_object(Bucket=settings.s3_bucket, Key=s3_key)
    return head.get("Metadata", {})
|
||||
|
||||
def download_to_temp(s3_key: str) -> str:
    """Download the object at *s3_key* to a named temp file and return its path.

    The caller owns the temp file and must delete it when done.

    Raises:
        Exception: whatever the S3 download raises; the temp file is removed
            before re-raising so a failed download doesn't leak it.
    """
    client = get_client()
    # Preserve the object's extension so downstream parsers can use it
    suffix = os.path.splitext(s3_key)[-1] or ".tmp"
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        client.download_fileobj(settings.s3_bucket, s3_key, tmp)
    except Exception:
        # Robustness fix: don't leak the temp file when the download fails
        tmp.close()
        os.unlink(tmp.name)
        raise
    tmp.close()
    return tmp.name
|
||||
|
||||
def presigned_download_url(s3_key: str, expires_in: int = 3600) -> str:
    """Create a time-limited GET URL for the object at *s3_key*.

    Args:
        s3_key: object key inside the configured bucket.
        expires_in: URL lifetime in seconds (default one hour).
    """
    url_params = {"Bucket": settings.s3_bucket, "Key": s3_key}
    return get_client().generate_presigned_url(
        "get_object", Params=url_params, ExpiresIn=expires_in
    )
|
||||
66
app/utils.py
Normal file
66
app/utils.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import uuid
|
||||
import magic
|
||||
from fastapi import HTTPException, UploadFile
|
||||
from app.config import settings
|
||||
from app.enums import DocumentType
|
||||
|
||||
def generate_document_id() -> str:
    """Return a fresh random UUID4 string identifying a new document."""
    return f"{uuid.uuid4()}"
|
||||
|
||||
def s3_path_prefix(org_id: str, document_id: str) -> str:
    """Build the bucket prefix under which a document's object is stored."""
    return "/".join(("documents", org_id, document_id)) + "/"
|
||||
|
||||
def detect_content_type(file: UploadFile) -> str:
    """Sniff the MIME type from the upload's leading bytes via libmagic.

    The stream is rewound afterwards so later readers see the full file.
    """
    stream = file.file
    stream.seek(0)
    header_sample = stream.read(2048)
    stream.seek(0)

    return magic.Magic(mime=True).from_buffer(header_sample)
|
||||
|
||||
def detect_document_type(filename: str, content_type: str) -> DocumentType:
    """Resolve the DocumentType, preferring MIME type over file extension.

    Returns None when neither the MIME type nor the extension is supported.
    """
    # Both lookups yield None on miss, so `or` falls through to the extension.
    return DocumentType.from_mime_type(content_type) or DocumentType.from_extension(filename)
|
||||
|
||||
def get_file_size_limit(document_type: DocumentType) -> int:
    """Return the configured byte limit for *document_type* (or the default)."""
    per_type = {
        DocumentType.PDF: settings.max_file_size_pdf,
        DocumentType.DOCX: settings.max_file_size_docx,
        DocumentType.XLSX: settings.max_file_size_xlsx,
        DocumentType.JPG: settings.max_file_size_jpg,
        DocumentType.JPEG: settings.max_file_size_jpeg,
        DocumentType.PNG: settings.max_file_size_png,
        DocumentType.GIF: settings.max_file_size_gif,
    }
    try:
        return per_type[document_type]
    except KeyError:
        return settings.max_file_size_default
|
||||
|
||||
def validate_file_size(file_size: int, document_type: DocumentType) -> None:
    """Raise HTTP 413 when *file_size* exceeds the type's configured limit."""
    limit = get_file_size_limit(document_type)
    if file_size <= limit:
        return
    raise HTTPException(
        status_code=413,
        detail=f"File size {file_size} exceeds maximum {limit} for {document_type.value}"
    )
|
||||
|
||||
def document_s3_key(org_id: str, document_id: str, filename: str) -> str:
    """Build the full S3 object key for a document's file.

    Layout: documents/{org_id}/{document_id}/{filename}.

    Bug fix: the filename parameter was previously ignored and the literal
    placeholder "(unknown)" written into every key — callers sanitize the
    original filename specifically so it can form the key's last segment.
    """
    return f"{s3_path_prefix(org_id, document_id)}{filename}"
|
||||
|
||||
def sanitize_filename(filename: str) -> str:
    """Strip characters unsafe in S3 keys from *filename*.

    Path separators become underscores; any other character outside
    [A-Za-z0-9.-_] is dropped entirely.
    """
    allowed = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_"
    normalized = filename.replace("/", "_").replace("\\", "_")
    return "".join(ch for ch in normalized if ch in allowed)
|
||||
Reference in New Issue
Block a user