Initial commit of document-service

2026-04-23 16:20:58 -05:00
commit 51d60f0032
30 changed files with 4357 additions and 0 deletions
--- a/.envrc
+++ b/.envrc
@@ -0,0 +1 @@
 # direnv entry point: load the development environment from this repository's Nix flake.
 use flake
--- a/.gitea/workflows/build-and-publish.yaml
+++ b/.gitea/workflows/build-and-publish.yaml
@@ -0,0 +1,68 @@
 # Build the container image from the Nix flake, push it to the Gitea container
 # registry, then package the Helm chart and upload it to the Gitea Helm registry.
 # The image tag and the chart version both come from the workflow run number.
 name: Build and Publish
 on:
  push:
    branches:
      - main
 env:
  # Workflow-level env is exported to every step, so the shell scripts below
  # read these as $CHART_NAME / $IMAGE_NAME.
  CHART_NAME: ${{ github.event.repository.name }}
  IMAGE_NAME: ${{ github.event.repository.name }}
 jobs:
  build-release:
    runs-on: nix
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Build Docker Image via Nix Flake
        run: |
          nix build .#dockerImage --print-build-logs
          docker load < result
      - name: Log in to Gitea Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ github.server_url }}
          username: ${{ secrets.CI_USER }}
          password: ${{ secrets.CI_PASSWORD }}
      - name: Tag and Push Docker Image
        # Context values are passed through env instead of being spliced into
        # the script text, so unusual characters cannot alter the commands.
        env:
          VERSION: ${{ github.run_number }}
          OWNER: ${{ github.repository_owner }}
        run: |
          # Strip the URL scheme from the server URL to get the registry host
          REGISTRY="${GITHUB_SERVER_URL#*://}"
          TARGET_IMAGE="$REGISTRY/$OWNER/$IMAGE_NAME"
          # Auto-detect the built image reference from the `docker load` output
          # ("Loaded image: name:tag" or "Loaded image ID: sha256:...").
          # Capturing the output first makes a failing `docker load` abort the
          # step instead of being hidden behind the pipe into awk.
          LOAD_OUTPUT=$(docker load < result)
          SOURCE_IMAGE=$(printf '%s\n' "$LOAD_OUTPUT" | awk '/^Loaded image/ {ref = $NF} END {print ref}')
          if [ -z "$SOURCE_IMAGE" ]; then
            echo "Could not determine the image loaded from ./result" >&2
            exit 1
          fi
          docker tag "$SOURCE_IMAGE" "$TARGET_IMAGE:$VERSION"
          docker tag "$SOURCE_IMAGE" "$TARGET_IMAGE:latest"
          docker push "$TARGET_IMAGE:$VERSION"
          docker push "$TARGET_IMAGE:latest"
      - name: Setup Helm
        uses: azure/setup-helm@v4
        with:
          version: v3.14.0
      - name: Package Helm Chart
        env:
          VERSION: ${{ github.run_number }}
        run: |
          helm repo add bjw-s https://bjw-s-labs.github.io/helm-charts
          helm dependency build ops/chart
          helm package ops/chart --version "$VERSION" --app-version "$VERSION"
      - name: Push Helm Chart to Gitea Registry
        # Credentials reach the script as environment variables only; they are
        # never expanded into the script source.
        env:
          VERSION: ${{ github.run_number }}
          OWNER: ${{ github.repository_owner }}
          CI_USER: ${{ secrets.CI_USER }}
          CI_PASSWORD: ${{ secrets.CI_PASSWORD }}
        run: |
          CHART_FILE="${CHART_NAME}-${VERSION}.tgz"
          curl -f --user "${CI_USER}:${CI_PASSWORD}" \
               -X POST \
               --upload-file "./${CHART_FILE}" \
               "${GITHUB_SERVER_URL}/api/packages/${OWNER}/helm/api/charts"
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,54 @@
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 # Virtual environments
 .venv/
 venv/
 ENV/
 env/
 # IDE
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 # Environment variables
 # `.env` is what app/config.py loads (Settings.Config.env_file) and may hold
 # S3 credentials, so it must not be committed.
 .env
 .env.local
 .env.*.local
 # Logs
 *.log
 # OS
 .DS_Store
 Thumbs.db
 # Testing
 .pytest_cache/
 .coverage
 htmlcov/
 # Nix
 .direnv/
 result
--- a/README.md
+++ b/README.md
@@ -0,0 +1,286 @@
 # Document Service
 Generic document management service with S3 storage and PDF field discovery.
 ## Features
 - **Multi-format support**: PDF, DOCX, XLSX, JPG, JPEG, PNG, GIF
 - **S3 storage**: Configurable S3-compatible storage (MinIO, AWS S3, etc.)
 - **PDF field discovery**: Extract form fields from PDF documents
 - **Organization-based access control**: Documents scoped to organizations
 - **File size limits**: Configurable per document type
 - **Content type detection**: Automatic detection using python-magic
 - **Comprehensive logging**: All operations logged for audit trail
 ## API Endpoints
 ### Upload Document
 ```
 POST /api/documents/upload
 Content-Type: multipart/form-data
 Authorization: Bearer <token>
 Form data:
 - file: (required) Document file
 - uploaded_by: (optional) User who uploaded the document (not read by the current handler, which accepts only the file field)
 Response:
 {
  "document_id": "uuid",
  "metadata": {...},
  "download_url": "presigned-url"
 }
 ```
 ### Rewrite Document
 ```
 PUT /api/documents/{document_id}
 Content-Type: multipart/form-data
 Authorization: Bearer <token>
 Form data:
 - file: (required) New document file
 - uploaded_by: (optional) User who uploaded the document (not read by the current handler, which accepts only the file field)
 Response:
 {
  "document_id": "uuid",
  "metadata": {...},
  "download_url": "presigned-url"
 }
 ```
 ### Get Document Metadata
 ```
 GET /api/documents/{document_id}
 Authorization: Bearer <token>
 Response:
 {
  "document_id": "uuid",
  "org_id": "org-id",
  "uploaded_by": "user",
  "document_type": "pdf",
  "filename": "document.pdf",
  "content_type": "application/pdf",
  "file_size": 12345,
  "s3_key": "documents/org-id/uuid/document.pdf",
  "created_at": "2024-01-01T00:00:00",
  "updated_at": "2024-01-01T00:00:00"
 }
 ```
 ### Get Download URL
 ```
 GET /api/documents/{document_id}/download-url?expires_in=3600
 Authorization: Bearer <token>
 Response:
 {
  "download_url": "presigned-url",
  "s3_key": "documents/org-id/uuid/document.pdf",
  "expires_in": 3600
 }
 ```
 ### Get PDF Fields
 ```
 GET /api/documents/{document_id}/fields
 Authorization: Bearer <token>
 Response:
 {
  "document_id": "uuid",
  "document_type": "pdf",
  "fields": [
    {
      "field": "field_name",
      "label": "Field Name",
      "type": "string",
      "required": false,
      "options": null
    }
  ]
 }
 ```
 ### Delete Document
 ```
 DELETE /api/documents/{document_id}
 Authorization: Bearer <token>
 Response:
 {
  "message": "Document deleted successfully"
 }
 ```
 ## Configuration
 ### Environment Variables
 | Variable | Description | Default |
 |----------|-------------|---------|
 | `S3_ENDPOINT` | S3 endpoint URL | `http://localhost:9000` |
 | `S3_ACCESS_KEY` | S3 access key | `minioadmin` |
 | `S3_SECRET_KEY` | S3 secret key | `minioadmin` |
 | `S3_BUCKET` | S3 bucket name | `document-bucket` |
 | `S3_REGION` | S3 region | `us-east-1` |
 | `HOST` | Service host | `0.0.0.0` |
 | `PORT` | Service port | `8082` |
 | `TEST_UPLOADER` | Default uploader for testing (not currently defined in `app/config.py`, so setting it has no effect) | `test-user` |
 | `LOG_LEVEL` | Logging level | `INFO` |
 ### File Size Limits
 | Document Type | Default Limit |
 |---------------|---------------|
 | PDF | 50MB |
 | DOCX | 25MB |
 | XLSX | 25MB |
 | JPG/JPEG | 10MB |
 | PNG | 10MB |
 | GIF | 10MB |
 | Other | 10MB |
 ## Authentication
 The service is designed to use JWT tokens for authentication, with the `org_id` extracted from the token claims and used for organization-based access control.
 **Note**: The auth middleware is currently a mock for testing: it does not validate tokens and assigns the fixed organization ID `test` to every request. In production, it must be replaced with proper Zitadel integration.
 ## Development
 ### Setup
 This project uses [uv2nix](https://pyproject-nix.github.io/uv2nix/) for reproducible Python dependency management with Nix.
 ```bash
 # Enter the development shell (uses uv2nix)
 nix develop
 # The development shell includes:
 # - Python with all dependencies from uv.lock
 # - uv tool for package management
 # - pyright for type checking
 # - file package (provides libmagic for content type detection)
 ```
 ### Running the Service
 ```bash
 # Start the development server
 uvicorn app.main:app --reload --host 0.0.0.0 --port 8082
 # Access API documentation
 open http://localhost:8082/docs
 ```
 ### Adding Dependencies
 ```bash
 # Add a new dependency
 uv add <package-name>
 # Add a development dependency
 uv add --dev <package-name>
 # Update the lock file
 uv lock
 ```
 ### Testing
 ```bash
 # Run tests
 pytest
 # Run with coverage
 pytest --cov=app
 ```
 ### Linting
 ```bash
 # Run ruff
 ruff check app/
 # Format code
 ruff format app/
 ```
 ### Building Production Package
 ```bash
 # Build the production package
 nix build
 # The package will be available at ./result
 ```
 ## Deployment
 ### Using Helm
 ```bash
 # Install chart
 helm install document-service ./ops/chart
 # Upgrade chart
 helm upgrade document-service ./ops/chart
 # Uninstall
 helm uninstall document-service
 ```
 ### Configuration
 Edit `ops/chart/values.yaml` to customize deployment settings.
 ## S3 Path Structure
 Documents are stored in S3 using the following path structure:
 ```
 documents/{org_id}/{document_id}/{filename}
 ```
 Example:
 ```
 documents/org-123/abc-456-def-789/policy_document.pdf
 ```
 ## Logging
 All operations are logged with the following information:
 - Operation type (upload, download, delete, etc.)
 - Document ID
 - Organization ID
 - User ID
 - Timestamp
 - Success/failure status
 ## Error Handling
 The service returns appropriate HTTP status codes:
 - `200` - Success
 - `201` - Created
 - `400` - Bad Request
 - `401` - Unauthorized
 - `403` - Forbidden
 - `404` - Not Found
 - `413` - Payload Too Large (file size exceeded)
 - `415` - Unsupported Media Type
 - `500` - Internal Server Error
 ## TODO
 - [ ] Implement proper Zitadel authentication
 - [ ] Add document listing endpoint
 - [ ] Add document search functionality
 - [ ] Add document versioning support
 - [ ] Add document conversion capabilities
 - [ ] Add comprehensive test coverage
 - [ ] Add API rate limiting
 - [ ] Add metrics and monitoring
--- a/app/init.py
+++ b/app/init.py
--- a/app/config.py
+++ b/app/config.py
@@ -0,0 +1,31 @@
 from pydantic_settings import BaseSettings
 class Settings(BaseSettings):
    # Service configuration with in-code defaults. The inner Config class
    # points the settings loader at a ".env" file for overrides.

    # --- S3 connection ---
    s3_endpoint: str = "http://localhost:9000"
    s3_access_key: str = "minioadmin"
    s3_secret_key: str = "minioadmin"
    s3_bucket: str = "document-bucket"
    s3_region: str = "us-east-1"

    # --- HTTP listener ---
    host: str = "0.0.0.0"
    port: int = 8082

    # --- Upload size ceilings, in bytes, one per document type ---
    max_file_size_pdf: int = 50 * 1024 ** 2  # 50MB
    max_file_size_docx: int = 25 * 1024 ** 2  # 25MB
    max_file_size_xlsx: int = 25 * 1024 ** 2  # 25MB
    max_file_size_jpg: int = 10 * 1024 ** 2  # 10MB
    max_file_size_jpeg: int = 10 * 1024 ** 2  # 10MB
    max_file_size_png: int = 10 * 1024 ** 2  # 10MB
    max_file_size_gif: int = 10 * 1024 ** 2  # 10MB
    max_file_size_default: int = 10 * 1024 ** 2  # 10MB

    # --- Logging ---
    # Name of a level constant on the logging module (see app/logger.py).
    log_level: str = "INFO"

    class Config:
        env_file = ".env"
 # Module-level instance imported by the rest of the application.
 settings = Settings()
--- a/app/enums.py
+++ b/app/enums.py
@@ -0,0 +1,38 @@
 from enum import Enum
 class DocumentType(str, Enum):
    # Supported document formats. Each member's value is the lower-case
    # extension string used for that format.
    PDF = "pdf"
    DOCX = "docx"
    XLSX = "xlsx"
    JPG = "jpg"
    JPEG = "jpeg"
    PNG = "png"
    GIF = "gif"
    @classmethod
    def from_mime_type(cls, mime_type: str) -> "DocumentType | None":
        """Map a MIME type to a DocumentType.

        Args:
            mime_type: MIME type, optionally followed by parameters
                (e.g. ``"application/pdf; charset=binary"``). Case-insensitive.

        Returns:
            The matching member, or None when the type is empty or unsupported.
            Note that ``image/jpeg`` always maps to JPG, never JPEG.
        """
        if not mime_type:
            return None
        # Only the "type/subtype" part identifies the format; drop parameters.
        essence = mime_type.split(";", 1)[0].strip().lower()
        mapping = {
            "application/pdf": cls.PDF,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": cls.DOCX,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": cls.XLSX,
            "image/jpeg": cls.JPG,
            "image/png": cls.PNG,
            "image/gif": cls.GIF,
        }
        return mapping.get(essence)
    @classmethod
    def from_extension(cls, filename: str) -> "DocumentType | None":
        """Map a filename's extension to a DocumentType.

        Args:
            filename: File name; the text after the last "." is the extension
                (case-insensitive).

        Returns:
            The matching member, or None when the name is empty, has no "."
            at all, or carries an unsupported extension.
        """
        if not filename:
            return None
        # A name without any dot has no extension; without this guard a file
        # literally called "pdf" would be classified as a PDF.
        _, dot, ext = filename.rpartition(".")
        if not dot:
            return None
        mapping = {
            "pdf": cls.PDF,
            "docx": cls.DOCX,
            "xlsx": cls.XLSX,
            "jpg": cls.JPG,
            "jpeg": cls.JPEG,
            "png": cls.PNG,
            "gif": cls.GIF,
        }
        return mapping.get(ext.lower())
--- a/app/logger.py
+++ b/app/logger.py
@@ -0,0 +1,13 @@
 import logging
 from app.config import settings
 def setup_logging():
    """Configure logging via basicConfig using the level named in settings.log_level."""
    level_name = settings.log_level.upper()
    # The upper-cased name is looked up as an attribute of the logging module.
    chosen_level = getattr(logging, level_name)
    record_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=chosen_level, format=record_format)
 def get_logger(name: str) -> logging.Logger:
    """Return the logger registered under ``name``."""
    named_logger = logging.getLogger(name)
    return named_logger
--- a/app/main.py
+++ b/app/main.py
@@ -0,0 +1,82 @@
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.openapi.utils import get_openapi
 from app.routers import documents
 from app.config import settings
 from app.logger import setup_logging
 from app.middleware.auth import AuthMiddleware
 # Configure logging before the application object is created.
 setup_logging()
 # Application object; the OpenAPI document is served at /openapi3.json and the
 # interactive docs at /docs and /redoc.
 app = FastAPI(
    title="Document Service",
    version="1.0.0",
    description="Generic document management service with S3 storage and PDF field discovery",
    openapi_url="/openapi3.json",
    docs_url="/docs",
    redoc_url="/redoc"
 )
 # Auth middleware: sets request.state.org_id, which the document routes read.
 app.add_middleware(AuthMiddleware)
 # CORS: only the local frontend origin is allowed.
 # NOTE(review): registration order matters for middleware; presumably the
 # later-added CORS layer wraps the auth layer — confirm before reordering.
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],
    allow_methods=["*"],
    allow_headers=["*"]
 )
 # Mount the /api/documents routes.
 app.include_router(documents.router)
@app.on_event("startup")
 async def startup_event():
    """Run startup tasks.
    Raises:
        Exception: If S3 bucket initialization fails (service will fail to start)
    """
    from app import s3
    from app.logger import get_logger
    logger = get_logger(__name__)
    logger.info("Starting up document service...")
    try:
        s3.ensure_bucket_exists()
        logger.info("S3 bucket initialization complete")
    except Exception as e:
        logger.error(f"Failed to initialize S3 bucket: {e}")
        # Re-raise to fail startup
        raise
@app.get("/health", tags=["health"])
 def health():
    return {"status": "ok"}
@app.get("/health/ready", tags=["health"])
 def health_ready():
    """Health check for Kubernetes readiness probes."""
    return {"status": "ready"}
 def custom_openapi():
    """Build the OpenAPI schema once, add the local server entry, and cache it on the app."""
    if not app.openapi_schema:
        generated = get_openapi(
            title="Document Service",
            version="1.0.0",
            openapi_version="3.1.0",
            description="Generic document management service with S3 storage and PDF field discovery",
            routes=app.routes
        )
        local_server = {"url": "http://localhost:8082", "description": "Local dev"}
        generated["servers"] = [local_server]
        # Cache so later calls return the same object without regenerating.
        app.openapi_schema = generated
    return app.openapi_schema
 # Replace the app's schema generator with the customized one.
 app.openapi = custom_openapi
--- a/app/middleware/init.py
+++ b/app/middleware/init.py
--- a/app/middleware/auth.py
+++ b/app/middleware/auth.py
@@ -0,0 +1,16 @@
 from fastapi import Request
 from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.responses import JSONResponse
 from app.logger import get_logger
 logger = get_logger(__name__)
 class AuthMiddleware(BaseHTTPMiddleware):
    # Placeholder authentication layer. It validates nothing: every
    # non-health request is assigned the fixed organization ID "test".
    # NOTE(review): must be replaced by real token validation before
    # production use.
    async def dispatch(self, request: Request, call_next):
        """Attach an organization ID to the request, then pass it on.

        Args:
            request: Incoming request.
            call_next: Callable that forwards the request to the next layer.

        Returns:
            The downstream response, unmodified.
        """
        # Skip auth for all health endpoints ("/health" and "/health/ready"),
        # so probes keep working once real authentication is enforced.
        path = request.url.path
        if path == "/health" or path.startswith("/health/"):
            return await call_next(request)
        request.state.org_id = "test"
        response = await call_next(request)
        return response
--- a/app/models.py
+++ b/app/models.py
@@ -0,0 +1,30 @@
 from pydantic import BaseModel, Field
 from datetime import datetime
 from typing import Optional
 from app.enums import DocumentType
 # Metadata describing one stored document; returned by the metadata endpoint
 # and embedded in upload/rewrite responses.
 class DocumentMetadata(BaseModel):
    document_id: str = Field(..., description="UUID of the document")
    org_id: str = Field(..., description="Organization ID")
    document_type: DocumentType = Field(..., description="Type of document")
    filename: str = Field(..., description="Original filename")
    content_type: str = Field(..., description="MIME type")
    file_size: int = Field(..., description="File size in bytes")
    s3_key: str = Field(..., description="S3 key for the document")
    # Both timestamps default to the current time from datetime.utcnow()
    # (a naive datetime) when the caller does not supply them.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
 # Response body of the upload (POST) and rewrite (PUT) endpoints.
 class UploadResponse(BaseModel):
    document_id: str
    metadata: DocumentMetadata
    # Presigned URL produced by s3.presigned_download_url for the stored object.
    download_url: str
 # Response body of the download-url endpoint.
 class DownloadUrlResponse(BaseModel):
    download_url: str
    s3_key: str
    # Echo of the expires_in value the caller requested.
    expires_in: int
 # Response body of the PDF field-discovery endpoint.
 class FieldsResponse(BaseModel):
    document_id: str
    document_type: DocumentType
    # One dict per discovered form field, as built by pdf.discover_fields
    # (keys: field, label, type, required, options).
    fields: list[dict]
--- a/app/pdf.py
+++ b/app/pdf.py
@@ -0,0 +1,105 @@
 import os
 from pypdf import PdfReader
 from typing import Any
 def discover_fields(pdf_path: str) -> list[dict]:
    """
    Introspect a PDF and return its fillable form fields.

    Three lookup strategies are tried in order; the first one that yields any
    fields wins:
      1. ``PdfReader.get_fields()``
      2. the ``/Fields`` array of the document catalog's ``/AcroForm``
      3. ``/Widget`` annotations found on each page

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        One dict per field with keys ``field``, ``label``, ``type``,
        ``required`` and ``options``. Empty list when no fields are found.
        Individual fields that cannot be processed are logged and skipped.
    """
    # Function-scope import: diagnostics go through logging (like the rest of
    # the service) instead of print, so they follow the configured level/format.
    import logging
    log = logging.getLogger(__name__)
    reader = PdfReader(pdf_path)
    fields = None
    # Method 1: library helper
    try:
        fields = reader.get_fields()
    except Exception as e:
        log.warning("get_fields() failed: %s", e)
        fields = None
    # Method 2: AcroForm /Fields array
    if not fields:
        fields = _fields_from_acroform(reader, log)
    # Method 3: page-level widget annotations
    if not fields:
        fields = _fields_from_annotations(reader, log)
    if not fields:
        return []
    result = []
    for field_name, field_obj in fields.items():
        try:
            field_type = field_obj.get("/FT", "")
            options = []
            # /Ch = choice field (select/dropdown)
            if field_type == "/Ch":
                opt = field_obj.get("/Opt", [])
                if opt:
                    # An option is either a plain string or a pair whose
                    # second element is used as the option text.
                    options = [o if isinstance(o, str) else o[1] for o in opt]
            result.append({
                "field": field_name,
                "label": field_name.replace("_", " ").title(),
                "type": _map_field_type(field_type, field_obj),
                # NOTE(review): always False; the field's own flags are not
                # inspected here.
                "required": False,
                "options": options if options else None
            })
        except Exception as e:
            log.warning("Error processing field %s: %s", field_name, e)
            continue
    return result
 def _fields_from_acroform(reader, log) -> dict:
    """Collect named fields from the catalog's /AcroForm /Fields array (best effort)."""
    found = {}
    try:
        root = reader.trailer["/Root"]
        if "/AcroForm" not in root:
            return found
        acroform = root["/AcroForm"]
        if "/Fields" not in acroform:
            return found
        for field_ref in acroform["/Fields"]:
            try:
                field_obj = field_ref.get_object()
                field_name = field_obj.get("/T", "")
                if field_name:
                    found[field_name] = field_obj
            except Exception as e:
                log.warning("Error processing field: %s", e)
                continue
    except Exception as e:
        log.warning("Direct AcroForm access failed: %s", e)
        # Discard partial results so the next strategy is tried.
        return {}
    return found
 def _fields_from_annotations(reader, log) -> dict:
    """Collect named /Widget annotations from every page (best effort)."""
    found = {}
    try:
        for page in reader.pages:
            if "/Annots" not in page:
                continue
            for annot in page["/Annots"]:
                try:
                    annot_obj = annot.get_object()
                    if "/Subtype" in annot_obj and annot_obj["/Subtype"] == "/Widget":
                        field_name = annot_obj.get("/T", "")
                        # First widget seen for a given name wins.
                        if field_name and field_name not in found:
                            found[field_name] = annot_obj
                except Exception as e:
                    log.warning("Error processing annotation: %s", e)
                    continue
    except Exception as e:
        log.warning("Page annotation access failed: %s", e)
        return {}
    return found
 def _map_field_type(ft: str, field_obj: dict) -> str:
    mapping = {
        "/Tx": "string",
        "/Btn": "boolean",
        "/Ch": "select",
        "/Sig": "string"
    }
    base = mapping.get(ft, "string")
    # Check if it's a date field by name hint
    field_name = field_obj.get("/T", "").lower()
    if any(hint in field_name for hint in ["date", "fecha", "birth", "nacimiento"]):
        return "date"
    return base
--- a/app/routers/init.py
+++ b/app/routers/init.py
@@ -0,0 +1 @@
 from app.routers import documents
--- a/app/routers/documents.py
+++ b/app/routers/documents.py
@@ -0,0 +1,355 @@
 import os
 from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Request
 from typing import Optional
 from datetime import datetime
 from app import s3, pdf, utils
 from app.config import settings
 from app.enums import DocumentType
 from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
 from app.logger import get_logger
 router = APIRouter(prefix="/api/documents", tags=["documents"])
 logger = get_logger(__name__)
@router.post("/upload", response_model=UploadResponse)
 async def upload_document(
    request: Request,
    file: UploadFile = File(...)
 ):
    """Upload a new document"""
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Upload request - org_id: {org_id}, user_id: {user_id}, filename: {file.filename}")
    # Detect content type
    detected_content_type = utils.detect_content_type(file)
    logger.info(f"Detected content type: {detected_content_type}")
    # Detect document type
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        logger.error(f"Unsupported document type: {file.filename}")
        raise HTTPException(status_code=415, detail="Unsupported document type")
    # Get file size
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)
    # Validate file size
    utils.validate_file_size(file_size, document_type)
    # Generate document ID and S3 key
    document_id = utils.generate_document_id()
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)
    # Prepare metadata
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": file.filename,
        "file_size": str(file_size),
        "created_at": datetime.utcnow().isoformat()
    }
    # Upload to S3
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File uploaded successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to upload file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to upload file: {e}")
    # Generate download URL
    download_url = s3.presigned_download_url(s3_key)
    # Create metadata response
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=file.filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.utcnow(),
        updated_at=datetime.utcnow()
    )
    logger.info(f"Upload completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
@router.put("/{document_id}", response_model=UploadResponse)
 async def rewrite_document(
    request: Request,
    document_id: str,
    file: UploadFile = File(...)
 ):
    """Rewrite/replace an existing document"""
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Rewrite request - document_id: {document_id}, org_id: {org_id}, user_id: {user_id}")
    # Detect content type
    detected_content_type = utils.detect_content_type(file)
    # Detect document type
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        raise HTTPException(status_code=415, detail="Unsupported document type")
    # Get file size
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)
    # Validate file size
    utils.validate_file_size(file_size, document_type)
    # Generate S3 key
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)
    # Check if document exists
    if not s3.file_exists(s3_key):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    # Verify org_id matches
    existing_metadata = s3.get_file_metadata(s3_key)
    if existing_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    # Prepare metadata
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": file.filename,
        "file_size": str(file_size),
        "updated_at": datetime.utcnow().isoformat()
    }
    # Upload to S3 (overwrites existing)
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File rewritten successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to rewrite file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to rewrite file: {e}")
    # Generate download URL
    download_url = s3.presigned_download_url(s3_key)
    # Create metadata response
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=file.filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.fromisoformat(existing_metadata.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.utcnow()
    )
    logger.info(f"Rewrite completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
@router.get("/{document_id}", response_model=DocumentMetadata)
 async def get_document(request: Request, document_id: str):
    """Get document metadata"""
    org_id = request.state.org_id
    logger.info(f"Get document request - document_id: {document_id}, org_id: {org_id}")
    # List objects to find the document
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = response["Contents"][0]["Key"]
    # Get metadata from S3
    s3_metadata = s3.get_file_metadata(s3_key)
    # Verify org_id matches
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    # Get object info
    try:
        object_info = client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except Exception as e:
        logger.error(f"Failed to get object info: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    # Create metadata response
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=s3_metadata.get("org_id"),
        document_type=DocumentType(s3_metadata.get("document_type")),
        filename=s3_metadata.get("filename"),
        content_type=object_info.get("ContentType"),
        file_size=int(s3_metadata.get("file_size", object_info.get("ContentLength", 0))),
        s3_key=s3_key,
        created_at=datetime.fromisoformat(s3_metadata.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.fromisoformat(s3_metadata.get("updated_at", datetime.utcnow().isoformat()))
    )
    logger.info(f"Get document completed - document_id: {document_id}")
    return metadata
@router.get("/{document_id}/download-url", response_model=DownloadUrlResponse)
 async def get_download_url(request: Request, document_id: str, expires_in: int = 3600):
    """Get presigned download URL"""
    org_id = request.state.org_id
    logger.info(f"Download URL request - document_id: {document_id}, org_id: {org_id}")
    # List objects to find the document
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = response["Contents"][0]["Key"]
    # Verify org_id matches
    s3_metadata = s3.get_file_metadata(s3_key)
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    # Generate download URL
    download_url = s3.presigned_download_url(s3_key, expires_in)
    logger.info(f"Download URL generated - document_id: {document_id}")
    return DownloadUrlResponse(download_url=download_url, s3_key=s3_key, expires_in=expires_in)
@router.get("/{document_id}/fields", response_model=FieldsResponse)
 async def get_document_fields(request: Request, document_id: str):
    """Get PDF form fields (PDF only)"""
    org_id = request.state.org_id
    logger.info(f"Fields request - document_id: {document_id}, org_id: {org_id}")
    # List objects to find the document
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = response["Contents"][0]["Key"]
    # Get metadata
    s3_metadata = s3.get_file_metadata(s3_key)
    # Verify org_id matches
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    # Check if PDF
    document_type = s3_metadata.get("document_type")
    if document_type != DocumentType.PDF.value:
        logger.error(f"Document is not PDF: {document_type}")
        raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")
    # Download and discover fields
    try:
        pdf_path = s3.download_to_temp(s3_key)
        fields = pdf.discover_fields(pdf_path)
        logger.info(f"Fields discovered - document_id: {document_id}, count: {len(fields)}")
    except Exception as e:
        logger.error(f"Failed to discover fields: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to discover fields: {e}")
    finally:
        if os.path.exists(pdf_path):
            os.unlink(pdf_path)
    return FieldsResponse(
        document_id=document_id,
        document_type=DocumentType.PDF,
        fields=fields
    )
@router.delete("/{document_id}")
 async def delete_document(request: Request, document_id: str):
    """Delete document"""
    org_id = request.state.org_id
    logger.info(f"Delete request - document_id: {document_id}, org_id: {org_id}")
    # List objects to find the document
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")
    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = response["Contents"][0]["Key"]
    # Verify org_id matches
    s3_metadata = s3.get_file_metadata(s3_key)
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")
    # Delete from S3
    try:
        s3.delete_file(s3_key)
        logger.info(f"Document deleted - document_id: {document_id}")
    except Exception as e:
        logger.error(f"Failed to delete document: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}")
    return {"message": "Document deleted successfully"}
--- a/app/s3.py
+++ b/app/s3.py
@@ -0,0 +1,101 @@
 import boto3
 import tempfile
 import os
 from botocore.client import Config
 from fastapi import UploadFile
 from app.config import settings
 from app.logger import get_logger
 logger = get_logger(__name__)
 def get_client():
    """Build a boto3 S3 client from the service settings.

    Endpoint, credentials and region come from ``settings``; requests are
    signed with signature version ``s3v4``.
    """
    client_options = {
        "endpoint_url": settings.s3_endpoint,
        "aws_access_key_id": settings.s3_access_key,
        "aws_secret_access_key": settings.s3_secret_key,
        "config": Config(signature_version="s3v4"),
        "region_name": settings.s3_region,
    }
    return boto3.client("s3", **client_options)
 def ensure_bucket_exists() -> None:
    """Ensure the configured S3 bucket exists, creating it when it is missing.

    The bucket is probed with ``head_bucket``. A "not found" answer triggers
    ``create_bucket``; any other error is logged and re-raised.

    Raises:
        Exception: If the existence check fails for a reason other than
            "not found", or if bucket creation fails (service will fail to start).
    """
    client = get_client()
    try:
        client.head_bucket(Bucket=settings.s3_bucket)
    except client.exceptions.ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code not in ('404', 'NoSuchBucket'):
            logger.error(f"Error checking bucket: {e}")
            raise
    else:
        logger.info(f"Bucket '{settings.s3_bucket}' already exists")
        return

    create_kwargs = {"Bucket": settings.s3_bucket}
    # us-east-1 is the default location: S3 rejects an explicit
    # LocationConstraint of "us-east-1", so the configuration block is only
    # sent for other regions.
    if settings.s3_region and settings.s3_region != "us-east-1":
        create_kwargs["CreateBucketConfiguration"] = {
            'LocationConstraint': settings.s3_region
        }
    try:
        client.create_bucket(**create_kwargs)
        logger.info(f"Created bucket '{settings.s3_bucket}'")
    except Exception as create_error:
        logger.error(f"Failed to create bucket '{settings.s3_bucket}': {create_error}")
        raise
 def upload_file(file: UploadFile, s3_key: str, content_type: str, metadata: dict = None) -> str:
    """Stream an uploaded file into the configured bucket.

    Args:
        file: Upload whose underlying ``file`` object is sent to S3.
        s3_key: Destination object key.
        content_type: Value stored as the object's ``ContentType``.
        metadata: Optional user metadata; attached only when non-empty.

    Returns:
        The ``s3_key`` that was written.
    """
    s3_client = get_client()
    upload_options = {"ContentType": content_type}
    if metadata:
        upload_options["Metadata"] = metadata
    s3_client.upload_fileobj(file.file, settings.s3_bucket, s3_key, ExtraArgs=upload_options)
    return s3_key
 def delete_file(s3_key: str) -> None:
    """Remove the object stored under ``s3_key`` from the configured bucket."""
    get_client().delete_object(Bucket=settings.s3_bucket, Key=s3_key)
 def file_exists(s3_key: str) -> bool:
    """Report whether ``head_object`` succeeds for ``s3_key``.

    Any ``ClientError`` from the probe is treated as "does not exist".
    """
    s3_client = get_client()
    try:
        s3_client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except s3_client.exceptions.ClientError:
        return False
    return True
 def get_file_metadata(s3_key: str) -> dict:
    """Return the ``Metadata`` mapping of the object at ``s3_key`` (empty dict when absent)."""
    head = get_client().head_object(Bucket=settings.s3_bucket, Key=s3_key)
    return head.get("Metadata", {})
 def download_to_temp(s3_key: str) -> str:
    """Download an S3 object into a new temporary file.

    The temp file keeps the key's extension (``.tmp`` when the key has none)
    and is created with ``delete=False``, so the caller owns the returned path
    and is responsible for removing it.

    Args:
        s3_key: Key of the object to download.

    Returns:
        Filesystem path of the downloaded copy.

    Raises:
        Exception: Whatever the download raises; the partially written temp
            file is removed before the error propagates.
    """
    client = get_client()
    suffix = os.path.splitext(s3_key)[-1] or ".tmp"
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        # The context manager closes the handle on both success and failure.
        with tmp:
            client.download_fileobj(settings.s3_bucket, s3_key, tmp)
    except Exception:
        # delete=False means nothing else will clean this file up.
        if os.path.exists(tmp.name):
            os.unlink(tmp.name)
        raise
    return tmp.name
 def presigned_download_url(s3_key: str, expires_in: int = 3600) -> str:
    """Create a presigned ``get_object`` URL for ``s3_key``.

    Args:
        s3_key: Key of the object to expose.
        expires_in: Value passed as ``ExpiresIn`` (default 3600).
    """
    object_ref = {"Bucket": settings.s3_bucket, "Key": s3_key}
    s3_client = get_client()
    return s3_client.generate_presigned_url("get_object", Params=object_ref, ExpiresIn=expires_in)
--- a/app/utils.py
+++ b/app/utils.py
@@ -0,0 +1,66 @@
 import uuid
 import magic
 from fastapi import HTTPException, UploadFile
 from app.config import settings
 from app.enums import DocumentType
 def generate_document_id() -> str:
    """Return a fresh random (version 4) UUID in its string form."""
    document_uuid = uuid.uuid4()
    return str(document_uuid)
 def s3_path_prefix(org_id: str, document_id: str) -> str:
    """Build the ``documents/<org_id>/<document_id>/`` key prefix for a document."""
    return "documents/{}/{}/".format(org_id, document_id)
 def detect_content_type(file: UploadFile) -> str:
    """Sniff the MIME type from the first 2048 bytes of the upload.

    The stream is rewound before and after the read, so the caller gets the
    file back positioned at its start.
    """
    stream = file.file
    stream.seek(0)
    header = stream.read(2048)
    stream.seek(0)
    return magic.Magic(mime=True).from_buffer(header)
 def detect_document_type(filename: str, content_type: str) -> DocumentType:
    """Resolve the document type, preferring the MIME type over the file extension.

    The extension lookup is only consulted when the MIME lookup yields a falsy result.
    """
    return DocumentType.from_mime_type(content_type) or DocumentType.from_extension(filename)
 def get_file_size_limit(document_type: DocumentType) -> int:
    """Look up the configured maximum upload size for a document type.

    Types without a dedicated setting fall back to ``settings.max_file_size_default``.
    """
    per_type_limits = {
        DocumentType.PDF: settings.max_file_size_pdf,
        DocumentType.DOCX: settings.max_file_size_docx,
        DocumentType.XLSX: settings.max_file_size_xlsx,
        DocumentType.JPG: settings.max_file_size_jpg,
        DocumentType.JPEG: settings.max_file_size_jpeg,
        DocumentType.PNG: settings.max_file_size_png,
        DocumentType.GIF: settings.max_file_size_gif,
    }
    fallback = settings.max_file_size_default
    return per_type_limits.get(document_type, fallback)
 def validate_file_size(file_size: int, document_type: DocumentType) -> None:
    """Reject files larger than the limit configured for their document type.

    Raises:
        HTTPException: 413 when ``file_size`` exceeds the limit.
    """
    limit = get_file_size_limit(document_type)
    if file_size > limit:
        message = f"File size {file_size} exceeds maximum {limit} for {document_type.value}"
        raise HTTPException(status_code=413, detail=message)
 def document_s3_key(org_id: str, document_id: str, filename: str) -> str:
    """Build the full object key: the document's path prefix followed by ``filename``."""
    prefix = s3_path_prefix(org_id, document_id)
    return f"{prefix}{filename}"
 def sanitize_filename(filename: str) -> str:
    """Reduce a filename to characters that are safe inside an S3 key.

    Forward and back slashes become underscores; every remaining character
    outside ASCII letters, digits, ``.``, ``-`` and ``_`` is dropped.
    """
    allowed = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_")
    # Path separators are mapped first so they survive as "_" instead of vanishing.
    flattened = filename.replace("/", "_").replace("\\", "_")
    return "".join(filter(allowed.__contains__, flattened))
--- a/flake.lock
+++ b/flake.lock
@@ -0,0 +1,99 @@
 {
  "nodes": {
    "nixpkgs": {
      "locked": {
        "lastModified": 1776548001,
        "narHash": "sha256-ZSK0NL4a1BwVbbTBoSnWgbJy9HeZFXLYQizjb2DPF24=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "b12141ef619e0a9c1c84dc8c684040326f27cdcc",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "pyproject-build-systems": {
      "inputs": {
        "nixpkgs": [
          "nixpkgs"
        ],
        "pyproject-nix": [
          "pyproject-nix"
        ],
        "uv2nix": [
          "uv2nix"
        ]
      },
      "locked": {
        "lastModified": 1776659114,
        "narHash": "sha256-qapCOQmR++yZSY43dzrp3wCrkOTLpod+ONtJWBk6iKU=",
        "owner": "pyproject-nix",
        "repo": "build-system-pkgs",
        "rev": "ffaa2161dd5d63e0e94591f86b54fc239660fb2e",
        "type": "github"
      },
      "original": {
        "owner": "pyproject-nix",
        "repo": "build-system-pkgs",
        "type": "github"
      }
    },
    "pyproject-nix": {
      "inputs": {
        "nixpkgs": [
          "nixpkgs"
        ]
      },
      "locked": {
        "lastModified": 1776715674,
        "narHash": "sha256-Gs1VnEkCkkRZxJQAC/Dhz0Jbfi22mFXChbtNg9w/Ybg=",
        "owner": "pyproject-nix",
        "repo": "pyproject.nix",
        "rev": "69f57f27e52a87c54e28138a75ec741cd46663c9",
        "type": "github"
      },
      "original": {
        "owner": "pyproject-nix",
        "repo": "pyproject.nix",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "nixpkgs": "nixpkgs",
        "pyproject-build-systems": "pyproject-build-systems",
        "pyproject-nix": "pyproject-nix",
        "uv2nix": "uv2nix"
      }
    },
    "uv2nix": {
      "inputs": {
        "nixpkgs": [
          "nixpkgs"
        ],
        "pyproject-nix": [
          "pyproject-nix"
        ]
      },
      "locked": {
        "lastModified": 1776718528,
        "narHash": "sha256-XeGmo/BhkFXd8vVyendr3X4mQmw7CEkeQcpy7AHbVcg=",
        "owner": "pyproject-nix",
        "repo": "uv2nix",
        "rev": "60982c30e16db3e0cba6c0ed13f0894b06ab2bf1",
        "type": "github"
      },
      "original": {
        "owner": "pyproject-nix",
        "repo": "uv2nix",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
 }
--- a/flake.nix
+++ b/flake.nix
@@ -0,0 +1,143 @@
 {
  description = "document-service using uv2nix";
  # Flake inputs. Every helper input follows the top-level nixpkgs (and each
  # other where applicable) so only one copy of each is locked.
  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
    pyproject-nix = {
      url = "github:pyproject-nix/pyproject.nix";
      inputs.nixpkgs.follows = "nixpkgs";
    };
    uv2nix = {
      url = "github:pyproject-nix/uv2nix";
      inputs.pyproject-nix.follows = "pyproject-nix";
      inputs.nixpkgs.follows = "nixpkgs";
    };
    pyproject-build-systems = {
      url = "github:pyproject-nix/build-system-pkgs";
      inputs.pyproject-nix.follows = "pyproject-nix";
      inputs.uv2nix.follows = "uv2nix";
      inputs.nixpkgs.follows = "nixpkgs";
    };
  };
  outputs =
    {
      nixpkgs,
      pyproject-nix,
      uv2nix,
      pyproject-build-systems,
      ...
    }:
    let
      inherit (nixpkgs) lib;
      # Build one attribute per system listed in lib.systems.flakeExposed.
      forAllSystems = lib.genAttrs lib.systems.flakeExposed;
      # Load the uv workspace rooted at this flake's directory.
      workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; };
      # Overlay generated from the workspace, preferring wheels over sdists.
      overlay = workspace.mkPyprojectOverlay {
        sourcePreference = "wheel";
      };
      # Editable-install overlay; its root is the literal "$REPO_ROOT", which
      # the dev shell's shellHook exports.
      editableOverlay = workspace.mkEditablePyprojectOverlay {
        root = "$REPO_ROOT";
      };
      # Per-system Python package set: the pyproject-nix base packages built on
      # pkgs.python3, extended with the build-system wheel overlay and then the
      # workspace overlay.
      pythonSets = forAllSystems (
        system:
        let
          pkgs = nixpkgs.legacyPackages.${system};
          python = pkgs.python3;
        in
        (pkgs.callPackage pyproject-nix.build.packages {
          inherit python;
        }).overrideScope
          (
            lib.composeManyExtensions [
              pyproject-build-systems.overlays.wheel
              overlay
            ]
          )
      );
    in
    {
      # Development shell per system: an editable virtualenv containing all
      # dependency groups, plus uv, pyright and file.
      devShells = forAllSystems (
        system:
        let
          pkgs = nixpkgs.legacyPackages.${system};
          # Apply the editable overlay so local packages resolve via $REPO_ROOT.
          pythonSet = pythonSets.${system}.overrideScope editableOverlay;
          virtualenv = pythonSet.mkVirtualEnv "document-service-dev-env" workspace.deps.all;
        in
        {
          default = pkgs.mkShell {
            packages = [
              virtualenv
              pkgs.uv
              pkgs.pyright
              pkgs.file
            ];
            env = {
              # Keep uv from syncing or downloading interpreters; it is pinned
              # to the interpreter of the Nix-built package set.
              UV_NO_SYNC = "1";
              UV_PYTHON = pythonSet.python.interpreter;
              UV_PYTHON_DOWNLOADS = "never";
              # Exposes the lib directory of pkgs.file — presumably so
              # python-magic can load libmagic; confirm.
              LD_LIBRARY_PATH = "${pkgs.file.out}/lib:$LD_LIBRARY_PATH";
            };
            shellHook = ''
              unset PYTHONPATH
              export REPO_ROOT=$(git rev-parse --show-toplevel)
            '';
          };
        }
      );
       # Release artifacts per system: the runtime virtualenv (default) and a
       # layered Docker image that starts the service with uvicorn on port 8082.
       packages = forAllSystems (system: let
         pkgs = nixpkgs.legacyPackages.${system};
         # Release builds use the plain package set. The editable overlay
         # resolves local packages through $REPO_ROOT, which only the dev
         # shell exports, so an editable virtualenv cannot find the project
         # inside the image.
         pythonSet = pythonSets.${system};
         # Runtime virtualenv with only the default (non-dev) dependencies.
         virtualenv = pythonSet.mkVirtualEnv "document-service-env" workspace.deps.default;
         # Create a derivation that includes the application code
         appCode = pkgs.stdenv.mkDerivation {
           name = "document-service-code";
           src = ./.;
           installPhase = ''
             mkdir -p $out/app
             cp -r app/* $out/app/
             cp pyproject.toml $out/
           '';
         };
       in {
         default = virtualenv;
         dockerImage = pkgs.dockerTools.buildLayeredImage {
           name = "document-service";
           contents = [
             virtualenv
             pkgs.bashInteractive
             pkgs.busybox
             pkgs.shadow
             pkgs.file
             pkgs.git  # Include git for version info
             appCode  # Include application code
           ];
           config = {
             Cmd = ["/bin/python" "-m" "uvicorn" "app.main:app" "--host" "0.0.0.0" "--port" "8082"];
             # NOTE(review): the entries without "=" carry no value; image
             # config Env entries are normally NAME=VALUE. They look like
             # documentation of the variables expected at runtime — confirm
             # the container runtime accepts them.
             Env = [
               "PYTHONUNBUFFERED=1"
               "PYTHONPATH=/app"
               "S3_ENDPOINT"
               "S3_ACCESS_KEY"
               "S3_SECRET_KEY"
               "S3_BUCKET"
               "S3_REGION"
               "HOST"
               "PORT"
               "LOG_LEVEL"
             ];
             WorkingDir = "/app";
           };
         };
       });
    };
 }
--- a/ops/chart/Chart.yaml
+++ b/ops/chart/Chart.yaml
@@ -0,0 +1,14 @@
 # Helm chart metadata for document-service.
 apiVersion: v2
 name: document-service
 description: Generic document management service
 type: application
 # The CI workflow passes --version and --app-version to `helm package`, so
 # the two values below are what a local, un-overridden package uses.
 version: 1.0.0
 appVersion: "1.0.0"
 keywords:
  - python
  - fastapi
  - document-management
 # The values file is written against this library chart's schema.
 dependencies:
  - name: common
    version: "4.6.2"
    repository: https://bjw-s-labs.github.io/helm-charts/
--- a/ops/chart/values.yaml
+++ b/ops/chart/values.yaml
@@ -0,0 +1,78 @@
 # Workload definition: a single-replica Deployment running the service container.
 controllers:
  main:
    enabled: true
    type: deployment
    replicas: 1
    containers:
      main:
        image:
          repository: gitea.corredorconect.com/software-engineering/document-service
          # Image tag tracks the chart's appVersion.
          tag: '{{ $.Chart.AppVersion }}'
        env:
          LOG_LEVEL: info
          PORT: "8082"
          S3_ENDPOINT:
            value: "http://minio:9000"
          # Both S3 credentials are read from the "<fullname>-secrets" Secret.
          S3_ACCESS_KEY:
            valueFrom:
              secretKeyRef:
                name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-secrets'
                key: s3AccessKey
          S3_SECRET_KEY:
            valueFrom:
              secretKeyRef:
                name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-secrets'
                key: s3SecretKey
          S3_BUCKET:
            value: "document-bucket"
          S3_REGION:
            value: "us-east-1"
        # custom/spec belong to each probe, as siblings of its "enabled" flag.
        probes:
          liveness:
            enabled: true
            custom: true
            spec:
              httpGet:
                path: /health
                port: 8082
              initialDelaySeconds: 10
              periodSeconds: 10
          readiness:
            enabled: true
            custom: true
            spec:
              httpGet:
                path: /health/ready
                port: 8082
              initialDelaySeconds: 5
              periodSeconds: 5
 # ClusterIP service for the "main" controller; the port matches the PORT
 # env value and the probe port (8082).
 service:
  main:
    controller: main
    type: ClusterIP
    ports:
      http:
        port: 8082
        protocol: HTTP
 # ExternalSecret that materialises the "<fullname>-secrets" Secret referenced
 # by the container's S3 credential env vars.
 # NOTE(review): the container reads keys s3AccessKey / s3SecretKey from that
 # Secret; confirm the Password generator below produces those keys.
 external-secret:
  enabled: true
  apiVersion: external-secrets.io/v1
  kind: ExternalSecret
  suffix: secrets
  spec:
    spec:
      refreshInterval: 0s
      secretStoreRef:
        name: cluster-secrets-store
        kind: ClusterSecretStore
      target:
        name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-secrets'
        creationPolicy: Owner
      dataFrom:
        - sourceRef:
            generatorRef:
              apiVersion: generators.external-secrets.io/v1alpha1
              kind: Password
              name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-password-generator'
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,32 @@
 # Project metadata and runtime dependencies.
 [project]
 name = "document-service"
 version = "1.0.0"
 requires-python = ">=3.12"
 dependencies = [
    "fastapi>=0.115.0",
    "uvicorn[standard]>=0.30.0",
    "pypdf>=4.3.1",
    "boto3>=1.35.0",
    "python-multipart>=0.0.9",
    "pydantic>=2.8.0",
    "pydantic-settings>=2.4.0",
    "python-magic>=0.4.27",
 ]
 # Built with hatchling; the wheel ships the "app" package.
 [build-system]
 requires = ["hatchling", "editables"]
 build-backend = "hatchling.build"
 [tool.hatch.build.targets.wheel]
 packages = ["app"]
 # Development-only tooling and test dependencies.
 [dependency-groups]
 dev = [
    "ruff>=0.6.0",
    "pytest>=8.0.0",
    "pytest-asyncio>=0.23.0",
    "httpx>=0.27.0",
    "reportlab>=4.0.0",
    # pypdf is already a runtime dependency above; listed again here.
    "pypdf>=4.3.1",
    "moto>=5.0.0",
 ]
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,68 @@
 """
 Test configuration and fixtures for document-service tests.
 """
 import pytest
 import os
 from fastapi.testclient import TestClient
 from unittest.mock import Mock, patch
 from moto import mock_aws
 import boto3
 from app.main import app
 # Test data paths
 FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
@pytest.fixture
 def test_client():
    """Create a test client with auth bypass."""
    return TestClient(app)
@pytest.fixture
 def sample_org_id():
    """Sample organization ID for testing."""
    return "test-org-123"
@pytest.fixture
 def sample_document_id():
    """Sample document ID for testing."""
    return "test-doc-456"
@pytest.fixture
 def test_pdf_files():
    """Paths to test PDF files."""
    return {
        "simple_form": os.path.join(FIXTURES_DIR, "simple_form.pdf"),
        "complex_form": os.path.join(FIXTURES_DIR, "complex_form.pdf"),
        "no_form": os.path.join(FIXTURES_DIR, "no_form.pdf"),
        "large_form": os.path.join(FIXTURES_DIR, "large_form.pdf"),
    }
@pytest.fixture
 def mock_s3_client():
    """Create a mock S3 client for testing."""
    with mock_aws():
        client = boto3.client(
            "s3",
            region_name="us-east-1",
            aws_access_key_id="minioadmin",
            aws_secret_access_key="minioadmin",
        )
        # Create test bucket
        client.create_bucket(Bucket="document-bucket")
        yield client
@pytest.fixture
 def auth_bypass_middleware():
    """Fixture to bypass auth middleware in tests."""
    def bypass_auth(request):
        request.state.org_id = "test-org-123"
        return request
    return bypass_auth
@pytest.fixture
 def sample_auth_token():
    """Sample auth token for testing."""
    return "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJvcmdfaWQiOiJ0ZXN0LW9yZy0xMjMifQ.test"
--- a/tests/fixtures/complex_form.pdf
+++ b/tests/fixtures/complex_form.pdf
@@ -0,0 +1,304 @@
 %PDF-1.3
 %âãÏÓ
 1 0 obj
 <<
 /Producer (pypdf)
 >>
 endobj
 2 0 obj
 <<
 /Type /Pages
 /Count 1
 /Kids [ 4 0 R ]
 >>
 endobj
 3 0 obj
 <<
 /Type /Catalog
 /Pages 2 0 R
 /AcroForm <<
 /Fields [ <<
 /FT /Tx
 /T (first\137name)
 /V ()
 /Rect [ 200 690 400 710 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (last\137name)
 /V ()
 /Rect [ 200 640 400 660 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (email)
 /V ()
 /Rect [ 200 590 400 610 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (phone)
 /V ()
 /Rect [ 200 540 400 560 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (address)
 /V ()
 /Rect [ 200 490 400 510 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (city)
 /V ()
 /Rect [ 200 440 400 460 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (state)
 /V ()
 /Rect [ 200 390 400 410 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (zip\137code)
 /V ()
 /Rect [ 200 340 400 360 ]
 /Ff 0
 >> <<
 /FT /Ch
 /T (country)
 /V ()
 /Opt [ (USA) (Canada) (UK) (Germany) (France) ]
 /Rect [ 200 290 400 310 ]
 /Ff 0
 >> <<
 /FT /Btn
 /T (gender)
 /V (male)
 /Rect [ 200 240 220 260 ]
 /Ff 0
 >> <<
 /FT /Btn
 /T (gender)
 /V (female)
 /Rect [ 300 240 320 260 ]
 /Ff 0
 >> <<
 /FT /Btn
 /T (reading)
 /V /Off
 /Rect [ 200 190 220 210 ]
 /Ff 0
 >> <<
 /FT /Btn
 /T (sports)
 /V /Off
 /Rect [ 200 160 220 180 ]
 /Ff 0
 >> <<
 /FT /Btn
 /T (music)
 /V /Off
 /Rect [ 200 130 220 150 ]
 /Ff 0
 >> <<
 /FT /Btn
 /T (travel)
 /V /Off
 /Rect [ 200 100 220 120 ]
 /Ff 0
 >> <<
 /FT /Btn
 /T (agree\137terms)
 /V /Off
 /Rect [ 200 140 220 160 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (signature)
 /V ()
 /Rect [ 200 90 400 110 ]
 /Ff 0
 >> ]
 >>
 >>
 endobj
 4 0 obj
 <<
 /Contents 5 0 R
 /MediaBox [ 0 0 612 792 ]
 /Resources <<
 /Font 6 0 R
 /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
 >>
 /Rotate 0
 /Trans <<
 >>
 /Type /Page
 /Parent 2 0 R
 /Annots [ <<
 /Subtype /Widget
 /FT /Tx
 /T (first\137name)
 /V ()
 /Rect [ 200 690 400 710 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (last\137name)
 /V ()
 /Rect [ 200 640 400 660 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (email)
 /V ()
 /Rect [ 200 590 400 610 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (phone)
 /V ()
 /Rect [ 200 540 400 560 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (address)
 /V ()
 /Rect [ 200 490 400 510 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (city)
 /V ()
 /Rect [ 200 440 400 460 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (state)
 /V ()
 /Rect [ 200 390 400 410 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (zip\137code)
 /V ()
 /Rect [ 200 340 400 360 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Ch
 /T (country)
 /V ()
 /Rect [ 200 290 400 310 ]
 /Ff 0
 /Opt [ (USA) (Canada) (UK) (Germany) (France) ]
 >> <<
 /Subtype /Widget
 /FT /Btn
 /T (gender)
 /V (male)
 /Rect [ 200 240 220 260 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Btn
 /T (gender)
 /V (female)
 /Rect [ 300 240 320 260 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Btn
 /T (reading)
 /V /Off
 /Rect [ 200 190 220 210 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Btn
 /T (sports)
 /V /Off
 /Rect [ 200 160 220 180 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Btn
 /T (music)
 /V /Off
 /Rect [ 200 130 220 150 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Btn
 /T (travel)
 /V /Off
 /Rect [ 200 100 220 120 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Btn
 /T (agree\137terms)
 /V /Off
 /Rect [ 200 140 220 160 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (signature)
 /V ()
 /Rect [ 200 90 400 110 ]
 /Ff 0
 >> ]
 >>
 endobj
 5 0 obj
 <<
 /Filter [ /ASCII85Decode /FlateDecode ]
 /Length 291
 >>
 stream
 GasbV_+Fea&;KY%MZ9UrC9m8.oN"UdKHc".Gmj%B,>D(A;p`!tWO(4\)'k<]nE'P8R95j8f]2oKJNJY1f"tI,Dm8oIL>-,'An-7/XP_7&hmsPV2$VZlJVuKljga3q-e_fL*;+[hpAoJXWqmrLU,"s52O'g'kTenY-)^6!E]<t>XGGKULRl:>id?'u8b4h!>BX;G^/rC%S5.uq%27\VHe*eP7/%>f=QN:Hc+'*-ihD-.,/'o(;:.X+4s[#!Dq5i9,$f'o&NC;.U."[j3.eA/Se#D\)eRtd.%ou~>
 endstream
 endobj
 6 0 obj
 <<
 /F1 7 0 R
 >>
 endobj
 7 0 obj
 <<
 /BaseFont /Helvetica
 /Encoding /WinAnsiEncoding
 /Name /F1
 /Subtype /Type1
 /Type /Font
 >>
 endobj
 xref
 0 8
 0000000000 65535 f 
 0000000015 00000 n 
 0000000054 00000 n 
 0000000113 00000 n 
 0000001378 00000 n 
 0000003056 00000 n 
 0000003438 00000 n 
 0000003469 00000 n 
 trailer
 <<
 /Size 8
 /Root 3 0 R
 /Info 1 0 R
 >>
 startxref
 3576
 %%EOF
--- a/tests/fixtures/generate_test_pdfs.py
+++ b/tests/fixtures/generate_test_pdfs.py
@@ -0,0 +1,371 @@
 """
 Generate test PDF files for document-service testing.
 This script creates various test PDFs with actual AcroForm fields:
 - Simple form PDF with basic form fields
 - Complex form PDF with multiple field types
 - No form PDF without form fields
 - Large form PDF for size validation testing
 """
 import os
 from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import letter
 from reportlab.lib import colors
 from pypdf import PdfReader, PdfWriter
 from pypdf.generic import (
    NameObject,
    create_string_object,
    NumberObject,
    ArrayObject,
    DictionaryObject,
    BooleanObject,
 )
 # Output directory
 OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
 def create_simple_form_pdf():
    """Write ``simple_form.pdf``: one labelled page carrying six form fields.

    The page is drawn with reportlab, re-read with pypdf, and the field
    dictionaries are attached twice: as widget annotations on the page and as
    the ``/Fields`` array of the document's ``/AcroForm``.
    """
    output_path = os.path.join(OUTPUT_DIR, "simple_form.pdf")

    def rect(x0, y0, x1, y1):
        """Build a /Rect array from four integer coordinates."""
        return ArrayObject([NumberObject(x0), NumberObject(y0), NumberObject(x1), NumberObject(y1)])

    def text_field(title, bottom):
        """Build an empty text field spanning x 200-400, 20 units tall from ``bottom``."""
        return DictionaryObject({
            NameObject("/FT"): NameObject("/Tx"),
            NameObject("/T"): create_string_object(title),
            NameObject("/V"): create_string_object(""),
            NameObject("/Rect"): rect(200, bottom, 400, bottom + 20),
            NameObject("/Ff"): NumberObject(0),
        })

    def as_widget(field):
        """Copy a field dictionary into a /Widget annotation dictionary."""
        widget = DictionaryObject({
            NameObject("/Subtype"): NameObject("/Widget"),
            NameObject("/FT"): field[NameObject("/FT")],
            NameObject("/T"): field[NameObject("/T")],
            NameObject("/V"): field.get(NameObject("/V"), NameObject("")),
            NameObject("/Rect"): field[NameObject("/Rect")],
            NameObject("/Ff"): field.get(NameObject("/Ff"), NumberObject(0)),
        })
        if NameObject("/Opt") in field:
            widget[NameObject("/Opt")] = field[NameObject("/Opt")]
        return widget

    # Base page: title plus one label per field, drawn with reportlab.
    c = canvas.Canvas(output_path, pagesize=letter)
    c.setFont("Helvetica", 16)
    c.drawString(100, 750, "Simple Form Test")
    c.setFont("Helvetica", 12)
    labels = ("Name:", "Email:", "Phone:", "Country:", "Birth Date:", "Agree to Terms:")
    for row, label in enumerate(labels):
        c.drawString(100, 700 - 50 * row, label)
    c.save()

    # Re-open the page with pypdf so form fields can be attached.
    reader = PdfReader(output_path)
    writer = PdfWriter()
    writer.add_page(reader.pages[0])

    # Choice field with its option list.
    country_field = DictionaryObject({
        NameObject("/FT"): NameObject("/Ch"),
        NameObject("/T"): create_string_object("country"),
        NameObject("/V"): create_string_object(""),
        NameObject("/Opt"): ArrayObject([
            create_string_object(option)
            for option in ("USA", "Canada", "UK", "Germany", "France")
        ]),
        NameObject("/Rect"): rect(200, 540, 400, 560),
        NameObject("/Ff"): NumberObject(0),
    })
    # Checkbox, initially off.
    agree_field = DictionaryObject({
        NameObject("/FT"): NameObject("/Btn"),
        NameObject("/T"): create_string_object("agree_terms"),
        NameObject("/V"): NameObject("/Off"),
        NameObject("/Rect"): rect(200, 440, 220, 460),
        NameObject("/Ff"): NumberObject(0),
    })
    fields = [
        text_field("name", 690),
        text_field("email", 640),
        text_field("phone", 590),
        country_field,
        text_field("birth_date", 490),
        agree_field,
    ]

    # Attach one widget annotation per field to the page.
    page = writer.pages[0]
    if "/Annots" not in page:
        page[NameObject("/Annots")] = ArrayObject()
    for field in fields:
        page[NameObject("/Annots")].append(as_widget(field))

    # Register the fields in the document-level AcroForm.
    writer._root_object[NameObject("/AcroForm")] = DictionaryObject({
        NameObject("/Fields"): ArrayObject(fields),
    })

    with open(output_path, "wb") as f:
        writer.write(f)
    print(f"Created: {output_path}")
 def create_complex_form_pdf():
    """Write ``complex_form.pdf``: one labelled page carrying seventeen form fields.

    Fields are eight text inputs, a country choice, two ``gender`` buttons,
    four interest checkboxes, an ``agree_terms`` checkbox and a ``signature``
    text input. Each is attached as a page widget annotation and listed in the
    document's ``/AcroForm`` ``/Fields`` array.
    """
    output_path = os.path.join(OUTPUT_DIR, "complex_form.pdf")

    def rect(x0, y0, x1, y1):
        """Build a /Rect array from four integer coordinates."""
        return ArrayObject([NumberObject(x0), NumberObject(y0), NumberObject(x1), NumberObject(y1)])

    def text_field(title, left, bottom):
        """Build an empty 200x20 text field anchored at (left, bottom)."""
        return DictionaryObject({
            NameObject("/FT"): NameObject("/Tx"),
            NameObject("/T"): create_string_object(title),
            NameObject("/V"): create_string_object(""),
            NameObject("/Rect"): rect(left, bottom, left + 200, bottom + 20),
            NameObject("/Ff"): NumberObject(0),
        })

    def button_field(title, value, left, bottom):
        """Build a 20x20 button field anchored at (left, bottom) with the given /V."""
        return DictionaryObject({
            NameObject("/FT"): NameObject("/Btn"),
            NameObject("/T"): create_string_object(title),
            NameObject("/V"): value,
            NameObject("/Rect"): rect(left, bottom, left + 20, bottom + 20),
            NameObject("/Ff"): NumberObject(0),
        })

    def as_widget(field):
        """Copy a field dictionary into a /Widget annotation dictionary."""
        widget = DictionaryObject({
            NameObject("/Subtype"): NameObject("/Widget"),
            NameObject("/FT"): field[NameObject("/FT")],
            NameObject("/T"): field[NameObject("/T")],
            NameObject("/V"): field.get(NameObject("/V"), NameObject("")),
            NameObject("/Rect"): field[NameObject("/Rect")],
            NameObject("/Ff"): field.get(NameObject("/Ff"), NumberObject(0)),
        })
        if NameObject("/Opt") in field:
            widget[NameObject("/Opt")] = field[NameObject("/Opt")]
        return widget

    # Base page: title plus one label per row, drawn with reportlab.
    c = canvas.Canvas(output_path, pagesize=letter)
    c.setFont("Helvetica", 16)
    c.drawString(100, 750, "Complex Form Test")
    c.setFont("Helvetica", 12)
    labels = (
        "First Name:", "Last Name:", "Email:", "Phone:", "Address:", "City:",
        "State:", "Zip Code:", "Country:", "Gender:", "Interests:",
        "Agree to Terms:", "Signature:",
    )
    for row, label in enumerate(labels):
        c.drawString(100, 700 - 50 * row, label)
    c.save()

    # Re-open the page with pypdf so form fields can be attached.
    reader = PdfReader(output_path)
    writer = PdfWriter()
    writer.add_page(reader.pages[0])

    # Text inputs, top to bottom.
    fields = [
        text_field(title, left, bottom)
        for title, left, bottom in (
            ('first_name', 200, 690),
            ('last_name', 200, 640),
            ('email', 200, 590),
            ('phone', 200, 540),
            ('address', 200, 490),
            ('city', 200, 440),
            ('state', 200, 390),
            ('zip_code', 200, 340),
        )
    ]

    # Country dropdown with its option list.
    fields.append(DictionaryObject({
        NameObject("/FT"): NameObject("/Ch"),
        NameObject("/T"): create_string_object("country"),
        NameObject("/V"): create_string_object(""),
        NameObject("/Opt"): ArrayObject([
            create_string_object(option)
            for option in ("USA", "Canada", "UK", "Germany", "France")
        ]),
        NameObject("/Rect"): rect(200, 290, 400, 310),
        NameObject("/Ff"): NumberObject(0),
    }))

    # Two buttons sharing the "gender" title, one per value.
    fields.append(button_field("gender", create_string_object("male"), 200, 240))
    fields.append(button_field("gender", create_string_object("female"), 300, 240))

    # Interest checkboxes, stacked 30 units apart starting at y=190.
    interests = ['reading', 'sports', 'music', 'travel']
    for drop, interest in zip(range(0, 120, 30), interests):
        fields.append(button_field(interest, NameObject("/Off"), 200, 190 - drop))

    # Terms checkbox and the signature text input.
    fields.append(button_field("agree_terms", NameObject("/Off"), 200, 140))
    fields.append(text_field("signature", 200, 90))

    # Attach one widget annotation per field to the page.
    page = writer.pages[0]
    if "/Annots" not in page:
        page[NameObject("/Annots")] = ArrayObject()
    for field in fields:
        page[NameObject("/Annots")].append(as_widget(field))

    # Register the fields in the document-level AcroForm.
    writer._root_object[NameObject("/AcroForm")] = DictionaryObject({
        NameObject("/Fields"): ArrayObject(fields),
    })

    with open(output_path, "wb") as f:
        writer.write(f)
    print(f"Created: {output_path}")
 def create_no_form_pdf():
    """Write ``no_form.pdf``: a single text-only page with no form fields."""
    output_path = os.path.join(OUTPUT_DIR, "no_form.pdf")
    c = canvas.Canvas(output_path, pagesize=letter)
    c.setFont("Helvetica", 16)
    c.drawString(100, 750, "No Form Test")
    c.setFont("Helvetica", 12)
    body_lines = (
        "This PDF has no form fields.",
        "It is used for testing field discovery",
        "on documents without AcroForm fields.",
    )
    # Body lines start at y=700 and step down 50 units each.
    for row, text in enumerate(body_lines):
        c.drawString(100, 700 - 50 * row, text)
    c.save()
    print(f"Created: {output_path}")
 def create_large_form_pdf():
    """Write ``large_form.pdf``: fifty "Field N:" labels spread over several pages.

    Labels are placed 50 units apart from y=700; whenever the next position
    would fall below y=50 a new page is started and the cursor resets to 700.
    """
    output_path = os.path.join(OUTPUT_DIR, "large_form.pdf")
    c = canvas.Canvas(output_path, pagesize=letter)
    c.setFont("Helvetica", 16)
    c.drawString(100, 750, "Large Form Test")
    c.setFont("Helvetica", 12)
    cursor_y = 700
    for number in range(1, 51):
        c.drawString(100, cursor_y, f"Field {number}:")
        cursor_y -= 50
        if cursor_y < 50:
            c.showPage()
            cursor_y = 700
    c.save()
    print(f"Created: {output_path}")
 def main():
    """Run every fixture generator in order, printing progress around them."""
    print("Generating test PDF files...")
    print(f"Output directory: {OUTPUT_DIR}")
    print()
    generators = (
        create_simple_form_pdf,
        create_complex_form_pdf,
        create_no_form_pdf,
        create_large_form_pdf,
    )
    for generate in generators:
        generate()
    print()
    print("All test PDF files generated successfully!")
 if __name__ == "__main__":
    main()
--- a/tests/fixtures/large_form.pdf
+++ b/tests/fixtures/large_form.pdf
@@ -0,0 +1,125 @@
 %PDF-1.3
 %“Œ‹ž ReportLab Generated PDF document (opensource)
 1 0 obj
 <<
 /F1 2 0 R
 >>
 endobj
 2 0 obj
 <<
 /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
 >>
 endobj
 3 0 obj
 <<
 /Contents 10 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
 /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
 >> /Rotate 0 /Trans <<
 >> 
  /Type /Page
 >>
 endobj
 4 0 obj
 <<
 /Contents 11 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
 /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
 >> /Rotate 0 /Trans <<
 >> 
  /Type /Page
 >>
 endobj
 5 0 obj
 <<
 /Contents 12 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
 /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
 >> /Rotate 0 /Trans <<
 >> 
  /Type /Page
 >>
 endobj
 6 0 obj
 <<
 /Contents 13 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
 /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
 >> /Rotate 0 /Trans <<
 >> 
  /Type /Page
 >>
 endobj
 7 0 obj
 <<
 /PageMode /UseNone /Pages 9 0 R /Type /Catalog
 >>
 endobj
 8 0 obj
 <<
 /Author (anonymous) /CreationDate (D:19800101000000+00'00') /Creator (anonymous) /Keywords () /ModDate (D:19800101000000+00'00') /Producer (ReportLab PDF Library - \(opensource\)) 
  /Subject (unspecified) /Title (untitled) /Trapped /False
 >>
 endobj
 9 0 obj
 <<
 /Count 4 /Kids [ 3 0 R 4 0 R 5 0 R 6 0 R ] /Type /Pages
 >>
 endobj
 10 0 obj
 <<
 /Filter [ /ASCII85Decode /FlateDecode ] /Length 217
 >>
 stream
 Gas30YmS?5&;9"+:GJ\L`7rI@@Oq[]V;)ju4[h(2dJ$.fMDlYNi/6XZ9/-MBqIFpH"0bWR4+VY?&JE4dmBP4$H`s>o>Pd5_5(knN-9C@@=hbnO$/KG<T]uHC6SHeT%fQ2(61,2)kB&jPeh#ln*V7]`-(1#q7P]TrOr967OBGd6R>k'EA?N"sbgn1*RGt<48$Z/.<iqdC<HBN;BdXTjQboF?~>endstream
 endobj
 11 0 obj
 <<
 /Filter [ /ASCII85Decode /FlateDecode ] /Length 179
 >>
 stream
 Gas30^C%h3*5qB\:N<.Pcs3$Hl<(9Sj6mHT",_O,eK?ILEeIs/+25o1W?$HFlO(jerB`1_*amY9`!,>fg-:(O.:HsM<c")brI"e6WCOT4gHTe]6:XPR3Z2,/H>lia7mi26F)k6[R>)2Tc&QO]0JmRQ33#uf(:EGYU/pYb,%W<I+0;`+EW~>endstream
 endobj
 12 0 obj
 <<
 /Filter [ /ASCII85Decode /FlateDecode ] /Length 182
 >>
 stream
 Gas30YmS?%'SYMZ:N8jHd+m]ZXcA"(*:Fj!$As93eK>>CO@)QnnF80POP6tcHWu&Bi%Q$",OR8C45u,jFR@u"e5F01DQMJaO6&5D+&?+Z'=%F%qt`rY;O"3#"KbqRMK6*1l<JI#\QT.g>jW9fl6'd&lDQ+4eQPFB=)/[R?*6VZ`^9D([>Kog~>endstream
 endobj
 13 0 obj
 <<
 /Filter [ /ASCII85Decode /FlateDecode ] /Length 147
 >>
 stream
 Gas3+3spL'$q8S#<P23]FJ9Y&V4a)bG2NT>h1+`5('Z%;U^2`KE+.t@o*+c<HmDMhfg)&^AATHdpsVmX3RhL!69O]%\U_jUJK0dDLK7_Y[]$?TK6gh*/?5bY6!78.Ms>%mcr*lWqbfg@lpOeX~>endstream
 endobj
 xref
 0 14
 0000000000 65535 f 
 0000000061 00000 n 
 0000000092 00000 n 
 0000000199 00000 n 
 0000000393 00000 n 
 0000000587 00000 n 
 0000000781 00000 n 
 0000000975 00000 n 
 0000001043 00000 n 
 0000001304 00000 n 
 0000001381 00000 n 
 0000001689 00000 n 
 0000001959 00000 n 
 0000002232 00000 n 
 trailer
 <<
 /ID 
 [<30157dc3b9cf65b8d1eaf3493559908e><30157dc3b9cf65b8d1eaf3493559908e>]
 % ReportLab generated PDF document -- digest (opensource)
 /Info 8 0 R
 /Root 7 0 R
 /Size 14
 >>
 startxref
 2470
 %%EOF
--- a/tests/fixtures/no_form.pdf
+++ b/tests/fixtures/no_form.pdf
@@ -0,0 +1,68 @@
 %PDF-1.3
 %“Œ‹ž ReportLab Generated PDF document (opensource)
 1 0 obj
 <<
 /F1 2 0 R
 >>
 endobj
 2 0 obj
 <<
 /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
 >>
 endobj
 3 0 obj
 <<
 /Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources <<
 /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
 >> /Rotate 0 /Trans <<
 >> 
  /Type /Page
 >>
 endobj
 4 0 obj
 <<
 /PageMode /UseNone /Pages 6 0 R /Type /Catalog
 >>
 endobj
 5 0 obj
 <<
 /Author (anonymous) /CreationDate (D:19800101000000+00'00') /Creator (anonymous) /Keywords () /ModDate (D:19800101000000+00'00') /Producer (ReportLab PDF Library - \(opensource\)) 
  /Subject (unspecified) /Title (untitled) /Trapped /False
 >>
 endobj
 6 0 obj
 <<
 /Count 1 /Kids [ 3 0 R ] /Type /Pages
 >>
 endobj
 7 0 obj
 <<
 /Filter [ /ASCII85Decode /FlateDecode ] /Length 226
 >>
 stream
 Gas2Bb6l*?&4Q?lMRuh(2(>rm;UL(=iaR@%P12s;!_o]ip\#oA:h3rL(XCuYYkiVA702`\bERWLTF<pmA'bMe$GLl8m[Gp,mCZM>`irc(:k@<Q,.1t_;U3TSGL0f4RBV`'XKta+*A74'q:3;`A;r@nl60Fm[LVPtD`E'mGib0+5kmB/Rp3p#C+&@HQ1$r/^;:dZ/#koRn*nah\!>!7PW#)X61=m`OB9!~>endstream
 endobj
 xref
 0 8
 0000000000 65535 f 
 0000000061 00000 n 
 0000000092 00000 n 
 0000000199 00000 n 
 0000000392 00000 n 
 0000000460 00000 n 
 0000000721 00000 n 
 0000000780 00000 n 
 trailer
 <<
 /ID 
 [<30157dc3b9cf65b8d1eaf3493559908e><30157dc3b9cf65b8d1eaf3493559908e>]
 % ReportLab generated PDF document -- digest (opensource)
 /Info 5 0 R
 /Root 4 0 R
 /Size 8
 >>
 startxref
 1096
 %%EOF
--- a/tests/fixtures/simple_form.pdf
+++ b/tests/fixtures/simple_form.pdf
@@ -0,0 +1,161 @@
 %PDF-1.3
 %âãÏÓ
 1 0 obj
 <<
 /Producer (pypdf)
 >>
 endobj
 2 0 obj
 <<
 /Type /Pages
 /Count 1
 /Kids [ 4 0 R ]
 >>
 endobj
 3 0 obj
 <<
 /Type /Catalog
 /Pages 2 0 R
 /AcroForm <<
 /Fields [ <<
 /FT /Tx
 /T (name)
 /V ()
 /Rect [ 200 690 400 710 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (email)
 /V ()
 /Rect [ 200 640 400 660 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (phone)
 /V ()
 /Rect [ 200 590 400 610 ]
 /Ff 0
 >> <<
 /FT /Ch
 /T (country)
 /V ()
 /Opt [ (USA) (Canada) (UK) (Germany) (France) ]
 /Rect [ 200 540 400 560 ]
 /Ff 0
 >> <<
 /FT /Tx
 /T (birth\137date)
 /V ()
 /Rect [ 200 490 400 510 ]
 /Ff 0
 >> <<
 /FT /Btn
 /T (agree\137terms)
 /V /Off
 /Rect [ 200 440 220 460 ]
 /Ff 0
 >> ]
 >>
 >>
 endobj
 4 0 obj
 <<
 /Contents 5 0 R
 /MediaBox [ 0 0 612 792 ]
 /Resources <<
 /Font 6 0 R
 /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
 >>
 /Rotate 0
 /Trans <<
 >>
 /Type /Page
 /Parent 2 0 R
 /Annots [ <<
 /Subtype /Widget
 /FT /Tx
 /T (name)
 /V ()
 /Rect [ 200 690 400 710 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (email)
 /V ()
 /Rect [ 200 640 400 660 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (phone)
 /V ()
 /Rect [ 200 590 400 610 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Ch
 /T (country)
 /V ()
 /Rect [ 200 540 400 560 ]
 /Ff 0
 /Opt [ (USA) (Canada) (UK) (Germany) (France) ]
 >> <<
 /Subtype /Widget
 /FT /Tx
 /T (birth\137date)
 /V ()
 /Rect [ 200 490 400 510 ]
 /Ff 0
 >> <<
 /Subtype /Widget
 /FT /Btn
 /T (agree\137terms)
 /V /Off
 /Rect [ 200 440 220 460 ]
 /Ff 0
 >> ]
 >>
 endobj
 5 0 obj
 <<
 /Filter [ /ASCII85Decode /FlateDecode ]
 /Length 214
 >>
 stream
 Gas3/_$YcZ&-h():[oO-KC+O7Fj&337*rSs`0Q/<`k!1:qntBjLh1!*5Q?*5,9cn2L]>4V7T^E=1'1`)j"LZXOAkYndii(Rd4^iHO@!??#S:KhY5-Hn'\Y63F`n8+K,.t]c\@9%516]H[@*&9CT1O*F'1H9T&WS2DLGjN]UaM[f"?B)-YBck(&"KsZ*@fJ2kq(gmZ1he)\4'9")1e>M#~>
 endstream
 endobj
 6 0 obj
 <<
 /F1 7 0 R
 >>
 endobj
 7 0 obj
 <<
 /BaseFont /Helvetica
 /Encoding /WinAnsiEncoding
 /Name /F1
 /Subtype /Type1
 /Type /Font
 >>
 endobj
 xref
 0 8
 0000000000 65535 f 
 0000000015 00000 n 
 0000000054 00000 n 
 0000000113 00000 n 
 0000000637 00000 n 
 0000001387 00000 n 
 0000001692 00000 n 
 0000001723 00000 n 
 trailer
 <<
 /Size 8
 /Root 3 0 R
 /Info 1 0 R
 >>
 startxref
 1830
 %%EOF
--- a/tests/test_documents.py
+++ b/tests/test_documents.py
@@ -0,0 +1,464 @@
 """
 Comprehensive test suite for document-service.
 Tests document upload, retrieval, field discovery, and complete workflows.
 """
 import pytest
 import os
 from fastapi.testclient import TestClient
 from unittest.mock import Mock, patch
 from moto import mock_aws
 import boto3
 from app.main import app
 from app.pdf import discover_fields
 # Test data paths
 FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
 class TestHealthEndpoint:
    """Test health endpoint functionality."""
    def test_health_endpoint(self, test_client):
        """Test health endpoint returns 200 OK."""
        response = test_client.get("/health")
        assert response.status_code == 200
        assert response.json() == {"status": "ok"}
 class TestDocumentUpload:
    """Test document upload functionality."""
    def test_upload_simple_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a simple PDF with form fields."""
        with open(test_pdf_files["simple_form"], "rb") as f:
            files = {"file": ("simple_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            headers = {"Authorization": sample_auth_token}
            response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data,
                headers=headers
            )
        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert "metadata" in result
        assert "download_url" in result
        assert result["metadata"]["document_type"] == "pdf"
        assert result["metadata"]["filename"] == "simple_form.pdf"
    def test_upload_complex_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a complex PDF with multiple field types."""
        with open(test_pdf_files["complex_form"], "rb") as f:
            files = {"file": ("complex_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            headers = {"Authorization": sample_auth_token}
            response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data,
                headers=headers
            )
        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert result["metadata"]["document_type"] == "pdf"
    def test_upload_no_form_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a PDF without form fields."""
        with open(test_pdf_files["no_form"], "rb") as f:
            files = {"file": ("no_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            headers = {"Authorization": sample_auth_token}
            response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data,
                headers=headers
            )
        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
    def test_upload_without_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload without auth returns 401."""
        with open(test_pdf_files["simple_form"], "rb") as f:
            files = {"file": ("simple_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data
            )
        assert response.status_code == 401
        assert "detail" in response.json()
    def test_upload_with_invalid_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload with invalid auth returns 401."""
        with open(test_pdf_files["simple_form"], "rb") as f:
            files = {"file": ("simple_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            headers = {"Authorization": "Invalid token"}
            response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data,
                headers=headers
            )
        assert response.status_code == 401
    def test_upload_missing_file_returns_400(self, test_client, sample_auth_token):
        """Test upload without file returns 400."""
        data = {"org_id": "test-org-123"}
        headers = {"Authorization": sample_auth_token}
        response = test_client.post(
            "/api/documents/upload",
            data=data,
            headers=headers
        )
        assert response.status_code == 422  # FastAPI validation error
 class TestDocumentMetadata:
    """Test document metadata retrieval."""
    def test_get_document_metadata_success(self, test_client, sample_auth_token):
        """Test getting document metadata successfully."""
        # This test would require a document to be uploaded first
        # For now, we'll test the endpoint structure
        headers = {"Authorization": sample_auth_token}
        response = test_client.get(
            "/api/documents/test-doc-456",
            params={"org_id": "test-org-123"},
            headers=headers
        )
        # Will return 404 since document doesn't exist, but endpoint is accessible
        assert response.status_code in [404, 403]
    def test_get_document_without_auth_returns_401(self, test_client):
        """Test getting document without auth returns 401."""
        response = test_client.get("/api/documents/test-doc-456")
        assert response.status_code == 401
 class TestDownloadUrl:
    """Test download URL generation."""
    def test_get_download_url_success(self, test_client, sample_auth_token):
        """Test getting download URL successfully."""
        headers = {"Authorization": sample_auth_token}
        response = test_client.get(
            "/api/documents/test-doc-456/download-url",
            params={"org_id": "test-org-123"},
            headers=headers
        )
        # Will return 404 since document doesn't exist, but endpoint is accessible
        assert response.status_code in [404, 403]
    def test_get_download_url_without_auth_returns_401(self, test_client):
        """Test getting download URL without auth returns 401."""
        response = test_client.get("/api/documents/test-doc-456/download-url")
        assert response.status_code == 401
 class TestPDFFieldDiscovery:
    """Test PDF field discovery functionality."""
    def test_get_pdf_fields_simple_form(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from simple form."""
        # First upload the document
        with open(test_pdf_files["simple_form"], "rb") as f:
            files = {"file": ("simple_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            headers = {"Authorization": sample_auth_token}
            upload_response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data,
                headers=headers
            )
        if upload_response.status_code == 201:
            document_id = upload_response.json()["document_id"]
            # Get fields
            headers = {"Authorization": sample_auth_token}
            response = test_client.get(
                f"/api/documents/{document_id}/fields",
                params={"org_id": "test-org-123"},
                headers=headers
            )
            assert response.status_code == 200
            result = response.json()
            assert "fields" in result
            assert len(result["fields"]) == 6  # name, email, phone, country, birth_date, agree_terms
            # Check field types
            field_types = {f["field"]: f["type"] for f in result["fields"]}
            assert field_types["name"] == "string"
            assert field_types["email"] == "string"
            assert field_types["phone"] == "string"
            assert field_types["country"] == "select"
            assert field_types["birth_date"] == "date"
            assert field_types["agree_terms"] == "boolean"
    def test_get_pdf_fields_complex_form(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from complex form."""
        # First upload the document
        with open(test_pdf_files["complex_form"], "rb") as f:
            files = {"file": ("complex_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            headers = {"Authorization": sample_auth_token}
            upload_response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data,
                headers=headers
            )
        if upload_response.status_code == 201:
            document_id = upload_response.json()["document_id"]
            # Get fields
            headers = {"Authorization": sample_auth_token}
            response = test_client.get(
                f"/api/documents/{document_id}/fields",
                params={"org_id": "test-org-123"},
                headers=headers
            )
            assert response.status_code == 200
            result = response.json()
            assert "fields" in result
            assert len(result["fields"]) == 16  # All fields from complex form
    def test_get_pdf_fields_no_form_returns_empty_list(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from PDF without form fields."""
        # First upload the document
        with open(test_pdf_files["no_form"], "rb") as f:
            files = {"file": ("no_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            headers = {"Authorization": sample_auth_token}
            upload_response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data,
                headers=headers
            )
        if upload_response.status_code == 201:
            document_id = upload_response.json()["document_id"]
            # Get fields
            headers = {"Authorization": sample_auth_token}
            response = test_client.get(
                f"/api/documents/{document_id}/fields",
                params={"org_id": "test-org-123"},
                headers=headers
            )
            assert response.status_code == 200
            result = response.json()
            assert "fields" in result
            assert len(result["fields"]) == 0
    def test_get_pdf_fields_without_auth_returns_401(self, test_client):
        """Test getting PDF fields without auth returns 401."""
        response = test_client.get("/api/documents/test-doc-456/fields")
        assert response.status_code == 401
 class TestDocumentDeletion:
    """Test document deletion functionality."""
    def test_delete_document_success(self, test_client, sample_auth_token):
        """Test deleting document successfully."""
        headers = {"Authorization": sample_auth_token}
        response = test_client.delete(
            "/api/documents/test-doc-456",
            params={"org_id": "test-org-123"},
            headers=headers
        )
        # Will return 404 since document doesn't exist, but endpoint is accessible
        assert response.status_code in [404, 403]
    def test_delete_document_without_auth_returns_401(self, test_client):
        """Test deleting document without auth returns 401."""
        response = test_client.delete("/api/documents/test-doc-456")
        assert response.status_code == 401
 class TestPDFFieldDiscoveryDirect:
     """Exercise ``discover_fields`` on the fixture PDFs directly, bypassing the HTTP API."""
     def test_simple_form_pdf_fields(self, test_pdf_files):
         """The simple form yields its six named fields with the expected types."""
         discovered = discover_fields(test_pdf_files["simple_form"])
         assert len(discovered) == 6
         names = [entry["field"] for entry in discovered]
         for wanted in ("name", "email", "phone", "country", "birth_date", "agree_terms"):
             assert wanted in names
         # Check field types
         type_of = {entry["field"]: entry["type"] for entry in discovered}
         expected_types = {
             "name": "string",
             "email": "string",
             "phone": "string",
             "country": "select",
             "birth_date": "date",
             "agree_terms": "boolean",
         }
         for field_name, field_type in expected_types.items():
             assert type_of[field_name] == field_type
     def test_complex_form_pdf_fields(self, test_pdf_files):
         """The complex form yields 16 fields, including a known subset with known types."""
         discovered = discover_fields(test_pdf_files["complex_form"])
         assert len(discovered) == 16
         names = [entry["field"] for entry in discovered]
         # Check for expected fields
         for wanted in (
             "first_name",
             "last_name",
             "email",
             "country",
             "gender",
             "agree_terms",
             "signature",
         ):
             assert wanted in names
         # Check field types
         type_of = {entry["field"]: entry["type"] for entry in discovered}
         expected_types = {
             "first_name": "string",
             "country": "select",
             "gender": "boolean",
             "agree_terms": "boolean",
             "signature": "string",
         }
         for field_name, field_type in expected_types.items():
             assert type_of[field_name] == field_type
     def test_no_form_pdf_fields(self, test_pdf_files):
         """A PDF without form fields yields an empty result."""
         discovered = discover_fields(test_pdf_files["no_form"])
         assert len(discovered) == 0
     def test_large_form_pdf_fields(self, test_pdf_files):
         """The large multi-page PDF, which has no form fields, yields an empty result."""
         discovered = discover_fields(test_pdf_files["large_form"])
         assert len(discovered) == 0
     def test_pdf_field_labels_generated_correctly(self, test_pdf_files):
         """Each simple-form field carries the expected human-readable label."""
         discovered = discover_fields(test_pdf_files["simple_form"])
         label_of = {entry["field"]: entry["label"] for entry in discovered}
         expected_labels = {
             "name": "Name",
             "email": "Email",
             "phone": "Phone",
             "country": "Country",
             "birth_date": "Birth Date",
             "agree_terms": "Agree Terms",
         }
         for field_name, label in expected_labels.items():
             assert label_of[field_name] == label
     def test_pdf_field_options_extracted_correctly(self, test_pdf_files):
         """The ``country`` dropdown exposes exactly its five options."""
         discovered = discover_fields(test_pdf_files["simple_form"])
         country = next(entry for entry in discovered if entry["field"] == "country")
         assert country["type"] == "select"
         options = country["options"]
         assert options is not None
         assert len(options) == 5
         for wanted in ("USA", "Canada", "UK", "Germany", "France"):
             assert wanted in options
 class TestCompleteWorkflow:
    """Test complete document lifecycle workflows."""
    def test_complete_document_lifecycle(self, test_client, test_pdf_files, sample_auth_token):
        """Test complete document lifecycle: upload, get metadata, get fields, delete."""
        # Upload document
        with open(test_pdf_files["simple_form"], "rb") as f:
            files = {"file": ("simple_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            headers = {"Authorization": sample_auth_token}
            upload_response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data,
                headers=headers
            )
        if upload_response.status_code == 201:
            document_id = upload_response.json()["document_id"]
            # Get metadata
            headers = {"Authorization": sample_auth_token}
            metadata_response = test_client.get(
                f"/api/documents/{document_id}",
                params={"org_id": "test-org-123"},
                headers=headers
            )
            # Get fields
            fields_response = test_client.get(
                f"/api/documents/{document_id}/fields",
                params={"org_id": "test-org-123"},
                headers=headers
            )
            # Get download URL
            download_response = test_client.get(
                f"/api/documents/{document_id}/download-url",
                params={"org_id": "test-org-123"},
                headers=headers
            )
            # Delete document
            delete_response = test_client.delete(
                f"/api/documents/{document_id}",
                params={"org_id": "test-org-123"},
                headers=headers
            )
            # Verify all operations succeeded
            assert upload_response.status_code == 201
            assert metadata_response.status_code in [200, 404]  # May be 404 if S3 not available
            assert fields_response.status_code in [200, 404]
            assert download_response.status_code in [200, 404]
            assert delete_response.status_code in [200, 404]
--- a/uv.lock
+++ b/uv.lock