Initial commit of document-service
This commit is contained in:
68
.gitea/workflows/build-and-publish.yaml
Normal file
68
.gitea/workflows/build-and-publish.yaml
Normal file
@@ -0,0 +1,68 @@
|
||||
name: Build and Publish
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
env:
|
||||
CHART_NAME: ${{ github.event.repository.name }}
|
||||
IMAGE_NAME: ${{ github.event.repository.name }}
|
||||
jobs:
|
||||
build-release:
|
||||
runs-on: nix
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build Docker Image via Nix Flake
|
||||
run: |
|
||||
nix build .#dockerImage --print-build-logs
|
||||
docker load < result
|
||||
|
||||
- name: Log in to Gitea Container Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ github.server_url }}
|
||||
username: ${{ secrets.CI_USER }}
|
||||
password: ${{ secrets.CI_PASSWORD }}
|
||||
|
||||
- name: Tag and Push Docker Image
|
||||
run: |
|
||||
VERSION=${{ github.run_number }}
|
||||
|
||||
# Strip https from server URL
|
||||
REGISTRY=${GITHUB_SERVER_URL#https://}
|
||||
|
||||
TARGET_IMAGE=$REGISTRY/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}
|
||||
|
||||
# Auto-detect the built image name (better version)
|
||||
SOURCE_IMAGE=$(docker load < result | awk '{print $3}')
|
||||
|
||||
docker tag $SOURCE_IMAGE $TARGET_IMAGE:$VERSION
|
||||
docker tag $SOURCE_IMAGE $TARGET_IMAGE:latest
|
||||
docker push $TARGET_IMAGE:$VERSION
|
||||
docker push $TARGET_IMAGE:latest
|
||||
|
||||
- name: Setup Helm
|
||||
uses: azure/setup-helm@v4
|
||||
with:
|
||||
version: v3.14.0
|
||||
|
||||
- name: Package Helm Chart
|
||||
run: |
|
||||
VERSION=${{ github.run_number }}
|
||||
helm repo add bjw-s https://bjw-s-labs.github.io/helm-charts
|
||||
helm dependency build ops/chart
|
||||
helm package ops/chart --version $VERSION --app-version $VERSION
|
||||
|
||||
- name: Push Helm Chart to Gitea Registry
|
||||
run: |
|
||||
VERSION=${{ github.run_number }}
|
||||
CHART_FILE=${{ env.CHART_NAME }}-${VERSION}.tgz
|
||||
|
||||
curl -f --user "${{ secrets.CI_USER }}:${{ secrets.CI_PASSWORD }}" \
|
||||
-X POST \
|
||||
--upload-file ./$CHART_FILE \
|
||||
"${{ github.server_url }}/api/packages/${{ github.repository_owner }}/helm/api/charts"
|
||||
54
.gitignore
vendored
Normal file
54
.gitignore
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# Virtual environments
|
||||
.venv/
|
||||
venv/
|
||||
ENV/
|
||||
env/
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Environment variables
|
||||
.env.local
|
||||
.env.*.local
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Testing
|
||||
.pytest_cache/
|
||||
.coverage
|
||||
htmlcov/
|
||||
|
||||
# Nix
|
||||
.direnv/
|
||||
result
|
||||
286
README.md
Normal file
286
README.md
Normal file
@@ -0,0 +1,286 @@
|
||||
# Document Service
|
||||
|
||||
Generic document management service with S3 storage and PDF field discovery.
|
||||
|
||||
## Features
|
||||
|
||||
- **Multi-format support**: PDF, DOCX, XLSX, JPG, JPEG, PNG, GIF
|
||||
- **S3 storage**: Configurable S3-compatible storage (MinIO, AWS S3, etc.)
|
||||
- **PDF field discovery**: Extract form fields from PDF documents
|
||||
- **Organization-based access control**: Documents scoped to organizations
|
||||
- **File size limits**: Configurable per document type
|
||||
- **Content type detection**: Automatic detection using python-magic
|
||||
- **Comprehensive logging**: All operations logged for audit trail
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Upload Document
|
||||
```
|
||||
POST /api/documents/upload
|
||||
Content-Type: multipart/form-data
|
||||
Authorization: Bearer <token>
|
||||
|
||||
Form data:
|
||||
- file: (required) Document file
|
||||
- uploaded_by: (optional) User who uploaded the document
|
||||
|
||||
Response:
|
||||
{
|
||||
"document_id": "uuid",
|
||||
"metadata": {...},
|
||||
"download_url": "presigned-url"
|
||||
}
|
||||
```
|
||||
|
||||
### Rewrite Document
|
||||
```
|
||||
PUT /api/documents/{document_id}
|
||||
Content-Type: multipart/form-data
|
||||
Authorization: Bearer <token>
|
||||
|
||||
Form data:
|
||||
- file: (required) New document file
|
||||
- uploaded_by: (optional) User who uploaded the document
|
||||
|
||||
Response:
|
||||
{
|
||||
"document_id": "uuid",
|
||||
"metadata": {...},
|
||||
"download_url": "presigned-url"
|
||||
}
|
||||
```
|
||||
|
||||
### Get Document Metadata
|
||||
```
|
||||
GET /api/documents/{document_id}
|
||||
Authorization: Bearer <token>
|
||||
|
||||
Response:
|
||||
{
|
||||
"document_id": "uuid",
|
||||
"org_id": "org-id",
|
||||
"uploaded_by": "user",
|
||||
"document_type": "pdf",
|
||||
"filename": "document.pdf",
|
||||
"content_type": "application/pdf",
|
||||
"file_size": 12345,
|
||||
"s3_key": "documents/org-id/uuid/document.pdf",
|
||||
"created_at": "2024-01-01T00:00:00",
|
||||
"updated_at": "2024-01-01T00:00:00"
|
||||
}
|
||||
```
|
||||
|
||||
### Get Download URL
|
||||
```
|
||||
GET /api/documents/{document_id}/download-url?expires_in=3600
|
||||
Authorization: Bearer <token>
|
||||
|
||||
Response:
|
||||
{
|
||||
"download_url": "presigned-url",
|
||||
"s3_key": "documents/org-id/uuid/document.pdf",
|
||||
"expires_in": 3600
|
||||
}
|
||||
```
|
||||
|
||||
### Get PDF Fields
|
||||
```
|
||||
GET /api/documents/{document_id}/fields
|
||||
Authorization: Bearer <token>
|
||||
|
||||
Response:
|
||||
{
|
||||
"document_id": "uuid",
|
||||
"document_type": "pdf",
|
||||
"fields": [
|
||||
{
|
||||
"field": "field_name",
|
||||
"label": "Field Name",
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"options": null
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Delete Document
|
||||
```
|
||||
DELETE /api/documents/{document_id}
|
||||
Authorization: Bearer <token>
|
||||
|
||||
Response:
|
||||
{
|
||||
"message": "Document deleted successfully"
|
||||
}
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| `S3_ENDPOINT` | S3 endpoint URL | `http://localhost:9000` |
|
||||
| `S3_ACCESS_KEY` | S3 access key | `minioadmin` |
|
||||
| `S3_SECRET_KEY` | S3 secret key | `minioadmin` |
|
||||
| `S3_BUCKET` | S3 bucket name | `document-bucket` |
|
||||
| `S3_REGION` | S3 region | `us-east-1` |
|
||||
| `HOST` | Service host | `0.0.0.0` |
|
||||
| `PORT` | Service port | `8082` |
|
||||
| `TEST_UPLOADER` | Default uploader for testing | `test-user` |
|
||||
| `LOG_LEVEL` | Logging level | `INFO` |
|
||||
|
||||
### File Size Limits
|
||||
|
||||
| Document Type | Default Limit |
|
||||
|---------------|---------------|
|
||||
| PDF | 50MB |
|
||||
| DOCX | 25MB |
|
||||
| XLSX | 25MB |
|
||||
| JPG/JPEG | 10MB |
|
||||
| PNG | 10MB |
|
||||
| GIF | 10MB |
|
||||
| Other | 10MB |
|
||||
|
||||
## Authentication
|
||||
|
||||
The service uses JWT tokens for authentication. The `org_id` is extracted from the token claims and used for organization-based access control.
|
||||
|
||||
**Note**: Currently, the auth middleware includes a mock implementation for testing. In production, this should be replaced with proper Zitadel integration.
|
||||
|
||||
## Development
|
||||
|
||||
### Setup
|
||||
|
||||
This project uses [uv2nix](https://pyproject-nix.github.io/uv2nix/) for reproducible Python dependency management with Nix.
|
||||
|
||||
```bash
|
||||
# Enter the development shell (uses uv2nix)
|
||||
nix develop
|
||||
|
||||
# The development shell includes:
|
||||
# - Python with all dependencies from uv.lock
|
||||
# - uv tool for package management
|
||||
# - pyright for type checking
|
||||
# - file package (provides libmagic for content type detection)
|
||||
```
|
||||
|
||||
### Running the Service
|
||||
|
||||
```bash
|
||||
# Start the development server
|
||||
uvicorn app.main:app --reload --host 0.0.0.0 --port 8082
|
||||
|
||||
# Access API documentation
|
||||
open http://localhost:8082/docs
|
||||
```
|
||||
|
||||
### Adding Dependencies
|
||||
|
||||
```bash
|
||||
# Add a new dependency
|
||||
uv add <package-name>
|
||||
|
||||
# Add a development dependency
|
||||
uv add --dev <package-name>
|
||||
|
||||
# Update the lock file
|
||||
uv lock
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
```bash
|
||||
# Run tests
|
||||
pytest
|
||||
|
||||
# Run with coverage
|
||||
pytest --cov=app
|
||||
```
|
||||
|
||||
### Linting
|
||||
|
||||
```bash
|
||||
# Run ruff
|
||||
ruff check app/
|
||||
|
||||
# Format code
|
||||
ruff format app/
|
||||
```
|
||||
|
||||
### Building Production Package
|
||||
|
||||
```bash
|
||||
# Build the production package
|
||||
nix build
|
||||
|
||||
# The package will be available at ./result
|
||||
```
|
||||
|
||||
## Deployment
|
||||
|
||||
### Using Helm
|
||||
|
||||
```bash
|
||||
# Install chart
|
||||
helm install document-service ./ops/chart
|
||||
|
||||
# Upgrade chart
|
||||
helm upgrade document-service ./ops/chart
|
||||
|
||||
# Uninstall
|
||||
helm uninstall document-service
|
||||
```
|
||||
|
||||
### Configuration
|
||||
|
||||
Edit `ops/chart/values.yaml` to customize deployment settings.
|
||||
|
||||
## S3 Path Structure
|
||||
|
||||
Documents are stored in S3 using the following path structure:
|
||||
|
||||
```
|
||||
documents/{org_id}/{document_id}/{filename}
|
||||
```
|
||||
|
||||
Example:
|
||||
```
|
||||
documents/org-123/abc-456-def-789/policy_document.pdf
|
||||
```
|
||||
|
||||
## Logging
|
||||
|
||||
All operations are logged with the following information:
|
||||
- Operation type (upload, download, delete, etc.)
|
||||
- Document ID
|
||||
- Organization ID
|
||||
- User ID
|
||||
- Timestamp
|
||||
- Success/failure status
|
||||
|
||||
## Error Handling
|
||||
|
||||
The service returns appropriate HTTP status codes:
|
||||
|
||||
- `200` - Success
|
||||
- `201` - Created
|
||||
- `400` - Bad Request
|
||||
- `401` - Unauthorized
|
||||
- `403` - Forbidden
|
||||
- `404` - Not Found
|
||||
- `413` - Payload Too Large (file size exceeded)
|
||||
- `415` - Unsupported Media Type
|
||||
- `500` - Internal Server Error
|
||||
|
||||
## TODO
|
||||
|
||||
- [ ] Implement proper Zitadel authentication
|
||||
- [ ] Add document listing endpoint
|
||||
- [ ] Add document search functionality
|
||||
- [ ] Add document versioning support
|
||||
- [ ] Add document conversion capabilities
|
||||
- [ ] Add comprehensive test coverage
|
||||
- [ ] Add API rate limiting
|
||||
- [ ] Add metrics and monitoring
|
||||
0
app/__init__.py
Normal file
0
app/__init__.py
Normal file
31
app/config.py
Normal file
31
app/config.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration loaded from the environment (and .env).

    Field names map to environment variables (e.g. S3_ENDPOINT ->
    s3_endpoint), matching the variables documented in the README.
    """

    # S3 settings (MinIO-compatible defaults for local development)
    s3_endpoint: str = "http://localhost:9000"
    s3_access_key: str = "minioadmin"
    s3_secret_key: str = "minioadmin"
    s3_bucket: str = "document-bucket"
    s3_region: str = "us-east-1"

    # Service settings
    host: str = "0.0.0.0"
    port: int = 8082

    # File size limits (bytes), one per supported DocumentType
    max_file_size_pdf: int = 50 * 1024 * 1024  # 50MB
    max_file_size_docx: int = 25 * 1024 * 1024  # 25MB
    max_file_size_xlsx: int = 25 * 1024 * 1024  # 25MB
    max_file_size_jpg: int = 10 * 1024 * 1024  # 10MB
    max_file_size_jpeg: int = 10 * 1024 * 1024  # 10MB
    max_file_size_png: int = 10 * 1024 * 1024  # 10MB
    max_file_size_gif: int = 10 * 1024 * 1024  # 10MB
    max_file_size_default: int = 10 * 1024 * 1024  # 10MB

    # Logging level name passed to the stdlib logging module
    log_level: str = "INFO"

    class Config:
        # Also read values from a local .env file when present.
        env_file = ".env"


# Module-level singleton imported throughout the application.
settings = Settings()
|
||||
38
app/enums.py
Normal file
38
app/enums.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from enum import Enum
|
||||
|
||||
class DocumentType(str, Enum):
    """Supported document types, valued by canonical file extension."""

    PDF = "pdf"
    DOCX = "docx"
    XLSX = "xlsx"
    JPG = "jpg"
    JPEG = "jpeg"
    PNG = "png"
    GIF = "gif"

    @classmethod
    def from_mime_type(cls, mime_type: str) -> "DocumentType | None":
        """Map a MIME type to a DocumentType.

        Matching is case-insensitive. Returns None for unsupported MIME
        types (the previous annotation claimed a plain DocumentType, but
        the dict lookup can miss).

        Note: "image/jpeg" maps to JPG; the JPEG member is only ever
        produced by from_extension.
        """
        mapping = {
            "application/pdf": cls.PDF,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": cls.DOCX,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": cls.XLSX,
            "image/jpeg": cls.JPG,
            "image/png": cls.PNG,
            "image/gif": cls.GIF,
        }
        return mapping.get(mime_type.lower())

    @classmethod
    def from_extension(cls, filename: str) -> "DocumentType | None":
        """Map a filename's extension to a DocumentType.

        Uses the text after the last dot, case-insensitively. Returns
        None for unsupported extensions (including filenames without a
        dot, where the whole name is treated as the extension).
        """
        ext = filename.split(".")[-1].lower()
        mapping = {
            "pdf": cls.PDF,
            "docx": cls.DOCX,
            "xlsx": cls.XLSX,
            "jpg": cls.JPG,
            "jpeg": cls.JPEG,
            "png": cls.PNG,
            "gif": cls.GIF,
        }
        return mapping.get(ext)
|
||||
13
app/logger.py
Normal file
13
app/logger.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import logging
|
||||
from app.config import settings
|
||||
|
||||
def setup_logging():
    """Configure the root logger from application settings.

    The level name comes from settings.log_level, upper-cased so it
    matches the stdlib logging level constants.
    """
    level = getattr(logging, settings.log_level.upper())
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """Return the stdlib logger registered under *name*.

    Thin wrapper around logging.getLogger so all modules obtain loggers
    through one place.
    """
    return logging.getLogger(name)
|
||||
82
app/main.py
Normal file
82
app/main.py
Normal file
@@ -0,0 +1,82 @@
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.openapi.utils import get_openapi
|
||||
from app.routers import documents
|
||||
from app.config import settings
|
||||
from app.logger import setup_logging
|
||||
from app.middleware.auth import AuthMiddleware
|
||||
|
||||
# Setup logging before anything else emits log records.
setup_logging()

# FastAPI application instance. The OpenAPI document is served from a
# non-default path (/openapi3.json); interactive docs remain at /docs.
app = FastAPI(
    title="Document Service",
    version="1.0.0",
    description="Generic document management service with S3 storage and PDF field discovery",
    openapi_url="/openapi3.json",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Add auth middleware (sets request.state.org_id for the routers)
app.add_middleware(AuthMiddleware)

# NOTE(review): Starlette applies middleware in reverse registration
# order, so CORS (registered last) wraps the auth middleware — confirm
# this ordering is intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],
    allow_methods=["*"],
    allow_headers=["*"]
)

# Mount the document endpoints (prefixed /api/documents in the router).
app.include_router(documents.router)
|
||||
|
||||
# NOTE(review): @app.on_event is deprecated in newer FastAPI releases in
# favour of lifespan handlers — confirm the targeted FastAPI version.
@app.on_event("startup")
async def startup_event():
    """Run startup tasks.

    Ensures the configured S3 bucket exists before the service accepts
    traffic.

    Raises:
        Exception: If S3 bucket initialization fails (service will fail to start)
    """
    # Imported inside the handler rather than at module level —
    # presumably to defer S3 client construction until startup; confirm.
    from app import s3
    from app.logger import get_logger

    logger = get_logger(__name__)
    logger.info("Starting up document service...")

    try:
        s3.ensure_bucket_exists()
        logger.info("S3 bucket initialization complete")
    except Exception as e:
        logger.error(f"Failed to initialize S3 bucket: {e}")
        # Re-raise to fail startup
        raise
|
||||
|
||||
@app.get("/health", tags=["health"])
def health():
    """Liveness probe: report that the process is up."""
    payload = {"status": "ok"}
    return payload
|
||||
|
||||
@app.get("/health/ready", tags=["health"])
def health_ready():
    """Health check for Kubernetes readiness probes."""
    readiness = {"status": "ready"}
    return readiness
|
||||
|
||||
def custom_openapi():
    """Build the OpenAPI schema once, pinning the local-dev server URL.

    The generated schema is cached on the app; later calls return the
    cached copy.
    """
    cached = app.openapi_schema
    if cached:
        return cached

    generated = get_openapi(
        title="Document Service",
        version="1.0.0",
        openapi_version="3.1.0",
        description="Generic document management service with S3 storage and PDF field discovery",
        routes=app.routes,
    )
    generated["servers"] = [
        {"url": "http://localhost:8082", "description": "Local dev"},
    ]

    # Cache on the app so subsequent requests are free.
    app.openapi_schema = generated
    return app.openapi_schema


# Replace FastAPI's default schema generator with the cached variant.
app.openapi = custom_openapi
|
||||
0
app/middleware/__init__.py
Normal file
0
app/middleware/__init__.py
Normal file
16
app/middleware/auth.py
Normal file
16
app/middleware/auth.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from fastapi import Request
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
from starlette.responses import JSONResponse
|
||||
from app.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class AuthMiddleware(BaseHTTPMiddleware):
    """Request authentication middleware.

    Currently a mock: every non-health request is attributed to the
    hard-coded organization "test". Per the README, this is to be
    replaced with real Zitadel/JWT validation.
    """

    async def dispatch(self, request: Request, call_next):
        # The liveness endpoint is reachable without authentication.
        if request.url.path == "/health":
            return await call_next(request)

        # Mock auth: attach a fixed org id for downstream handlers.
        request.state.org_id = "test"
        return await call_next(request)
|
||||
|
||||
30
app/models.py
Normal file
30
app/models.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from pydantic import BaseModel, Field
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from app.enums import DocumentType
|
||||
|
||||
class DocumentMetadata(BaseModel):
    """Metadata describing a stored document, as returned by the API."""

    document_id: str = Field(..., description="UUID of the document")
    org_id: str = Field(..., description="Organization ID")
    document_type: DocumentType = Field(..., description="Type of document")
    filename: str = Field(..., description="Original filename")
    content_type: str = Field(..., description="MIME type")
    file_size: int = Field(..., description="File size in bytes")
    s3_key: str = Field(..., description="S3 key for the document")
    # NOTE(review): datetime.utcnow yields naive timestamps (and is
    # deprecated since Python 3.12) — consider timezone-aware defaults.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
|
||||
|
||||
class UploadResponse(BaseModel):
    """Response body for the upload (POST) and rewrite (PUT) endpoints."""

    document_id: str  # UUID of the stored document
    metadata: DocumentMetadata  # full metadata record
    download_url: str  # presigned S3 URL for immediate download
|
||||
|
||||
class DownloadUrlResponse(BaseModel):
    """Response body for GET /{document_id}/download-url."""

    download_url: str  # presigned S3 URL
    s3_key: str  # object key the URL points at
    expires_in: int  # URL lifetime in seconds
|
||||
|
||||
class FieldsResponse(BaseModel):
    """Response body for GET /{document_id}/fields (PDF form discovery)."""

    document_id: str
    document_type: DocumentType  # always PDF; the endpoint rejects other types
    fields: list[dict]  # entries produced by app.pdf.discover_fields
|
||||
105
app/pdf.py
Normal file
105
app/pdf.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import os
|
||||
from pypdf import PdfReader
|
||||
from typing import Any
|
||||
|
||||
def discover_fields(pdf_path: str) -> list[dict]:
    """
    Introspect a PDF and return all fillable AcroForm fields.
    Handles any form of AcroForm structure.

    Three strategies are tried in order, falling through on failure or
    an empty result: pypdf's get_fields(), then the document catalog's
    /AcroForm /Fields array, then widget annotations on each page.
    Returns a list of dicts with keys: field, label, type, required,
    options.

    NOTE(review): failures are reported via print() rather than the
    application logger — consider routing through app.logger.
    """
    reader = PdfReader(pdf_path)

    # Try multiple methods to get fields; `fields` ends up a dict of
    # field name -> pypdf field object (or None/empty on total failure).
    fields = None

    # Method 1: Try get_fields() first (pypdf's high-level API)
    try:
        fields = reader.get_fields()
    except Exception as e:
        print(f"get_fields() failed: {e}")
        fields = None

    # Method 2: Try to get fields from AcroForm directly
    # (walk /Root -> /AcroForm -> /Fields, resolving indirect refs)
    if not fields:
        try:
            if "/AcroForm" in reader.trailer["/Root"]:
                acroform = reader.trailer["/Root"]["/AcroForm"]
                if "/Fields" in acroform:
                    fields = {}
                    field_array = acroform["/Fields"]
                    for field_ref in field_array:
                        try:
                            field_obj = field_ref.get_object()
                            field_name = field_obj.get("/T", "")
                            # Unnamed fields are skipped
                            if field_name:
                                fields[field_name] = field_obj
                        except Exception as e:
                            print(f"Error processing field: {e}")
                            continue
        except Exception as e:
            print(f"Direct AcroForm access failed: {e}")
            fields = None

    # Method 3: Try to get fields from page annotations
    # (collect /Widget annotations; first occurrence of a name wins)
    if not fields:
        try:
            fields = {}
            for page in reader.pages:
                if "/Annots" in page:
                    for annot in page["/Annots"]:
                        try:
                            annot_obj = annot.get_object()
                            if "/Subtype" in annot_obj and annot_obj["/Subtype"] == "/Widget":
                                field_name = annot_obj.get("/T", "")
                                if field_name and field_name not in fields:
                                    fields[field_name] = annot_obj
                        except Exception as e:
                            print(f"Error processing annotation: {e}")
                            continue
        except Exception as e:
            print(f"Page annotation access failed: {e}")
            fields = None

    # No strategy produced anything usable: report "no fields".
    if not fields:
        return []

    # Normalize each discovered field into the API's field schema.
    result = []
    for field_name, field_obj in fields.items():
        try:
            field_type = field_obj.get("/FT", "")
            options = []

            # /Ch = choice field (select/dropdown)
            if field_type == "/Ch":
                opt = field_obj.get("/Opt", [])
                if opt:
                    # /Opt entries are either a plain string or an
                    # [export_value, display_value] pair; keep the
                    # display value in the latter case.
                    options = [o if isinstance(o, str) else o[1] for o in opt]

            result.append({
                "field": field_name,
                # Human-readable label derived from the field name
                "label": field_name.replace("_", " ").title(),
                "type": _map_field_type(field_type, field_obj),
                # NOTE(review): /Ff required-bit is never inspected, so
                # required is always False — confirm intended.
                "required": False,
                "options": options if options else None
            })
        except Exception as e:
            print(f"Error processing field {field_name}: {e}")
            continue

    return result
|
||||
|
||||
def _map_field_type(ft: str, field_obj: dict) -> str:
    """Translate a PDF /FT field type into a simple schema type name.

    A name-based heuristic takes precedence: when the field's /T name
    contains a date-like hint (English or Spanish), "date" is returned
    regardless of /FT. Unknown /FT values default to "string".
    """
    # Heuristic first: a date-like field name wins over the base type.
    name = field_obj.get("/T", "").lower()
    for hint in ("date", "fecha", "birth", "nacimiento"):
        if hint in name:
            return "date"

    type_by_ft = {
        "/Tx": "string",   # text field
        "/Btn": "boolean",  # button / checkbox
        "/Ch": "select",    # choice (dropdown / list)
        "/Sig": "string",   # signature, surfaced as plain string
    }
    return type_by_ft.get(ft, "string")
|
||||
1
app/routers/__init__.py
Normal file
1
app/routers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from app.routers import documents
|
||||
355
app/routers/documents.py
Normal file
355
app/routers/documents.py
Normal file
@@ -0,0 +1,355 @@
|
||||
import os
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Request
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from app import s3, pdf, utils
|
||||
from app.config import settings
|
||||
from app.enums import DocumentType
|
||||
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
|
||||
from app.logger import get_logger
|
||||
|
||||
router = APIRouter(prefix="/api/documents", tags=["documents"])
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@router.post("/upload", response_model=UploadResponse)
async def upload_document(
    request: Request,
    file: UploadFile = File(...)
):
    """Upload a new document.

    The caller's organization comes from the auth middleware
    (request.state.org_id). Document type is detected from content and
    filename; unsupported types yield 415, upload failures 500.

    NOTE(review): the README documents an optional `uploaded_by` form
    field that this handler does not accept — reconcile.
    """
    org_id = request.state.org_id
    # The middleware may not set user_id; fall back to "system".
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Upload request - org_id: {org_id}, user_id: {user_id}, filename: {file.filename}")

    # Detect content type from the file contents (see app.utils)
    detected_content_type = utils.detect_content_type(file)
    logger.info(f"Detected content type: {detected_content_type}")

    # Detect document type from filename/content type
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        logger.error(f"Unsupported document type: {file.filename}")
        raise HTTPException(status_code=415, detail="Unsupported document type")

    # Get file size by seeking the spooled upload to its end
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    # Rewind so the subsequent S3 upload reads from the start.
    file.file.seek(0)

    # Validate file size against per-type limits (raises on violation)
    utils.validate_file_size(file_size, document_type)

    # Generate document ID and S3 key
    document_id = utils.generate_document_id()
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)

    # Prepare metadata; file_size is stringified because S3 user
    # metadata values are strings.
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": file.filename,
        "file_size": str(file_size),
        "created_at": datetime.utcnow().isoformat()
    }

    # Upload to S3
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File uploaded successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to upload file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to upload file: {e}")

    # Generate download URL (default expiry from app.s3)
    download_url = s3.presigned_download_url(s3_key)

    # Create metadata response mirroring what was stored
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=file.filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.utcnow(),
        updated_at=datetime.utcnow()
    )

    logger.info(f"Upload completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
||||
|
||||
@router.put("/{document_id}", response_model=UploadResponse)
async def rewrite_document(
    request: Request,
    document_id: str,
    file: UploadFile = File(...)
):
    """Rewrite/replace an existing document.

    Returns 404 if the document does not exist, 403 on org mismatch,
    415 for unsupported types, 500 on S3 failure.

    NOTE(review): the existence check uses the key derived from the NEW
    upload's filename; replacing a document with a file of a different
    name will 404 even though the document exists — confirm intended.
    """
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Rewrite request - document_id: {document_id}, org_id: {org_id}, user_id: {user_id}")

    # Detect content type from the file contents
    detected_content_type = utils.detect_content_type(file)

    # Detect document type from filename/content type
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        raise HTTPException(status_code=415, detail="Unsupported document type")

    # Get file size (seek to end, then rewind for the S3 upload)
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)

    # Validate file size against per-type limits (raises on violation)
    utils.validate_file_size(file_size, document_type)

    # Generate S3 key from the new upload's sanitized filename
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)

    # Check if document exists
    if not s3.file_exists(s3_key):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    # Verify org_id matches (existing metadata is also used below for
    # the original created_at)
    existing_metadata = s3.get_file_metadata(s3_key)
    if existing_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Prepare metadata.
    # NOTE(review): this dict omits created_at, so the overwrite may
    # drop the original creation timestamp from S3 metadata — confirm.
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": file.filename,
        "file_size": str(file_size),
        "updated_at": datetime.utcnow().isoformat()
    }

    # Upload to S3 (overwrites existing)
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File rewritten successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to rewrite file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to rewrite file: {e}")

    # Generate download URL
    download_url = s3.presigned_download_url(s3_key)

    # Create metadata response, preserving the original created_at when
    # present (falls back to "now" if it was never stored)
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=file.filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.fromisoformat(existing_metadata.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.utcnow()
    )

    logger.info(f"Rewrite completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
||||
|
||||
@router.get("/{document_id}", response_model=DocumentMetadata)
async def get_document(request: Request, document_id: str):
    """Get document metadata.

    Locates the object by listing under the org/document prefix (the
    stored filename is not known from the ID alone), then reads S3
    user metadata plus head_object info to build the response.
    """
    org_id = request.state.org_id
    logger.info(f"Get document request - document_id: {document_id}, org_id: {org_id}")

    # List objects to find the document (prefix already scopes the
    # lookup to this org — see utils.s3_path_prefix)
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)

    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    # Empty listing means no object under the prefix -> 404
    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    s3_key = response["Contents"][0]["Key"]

    # Get metadata from S3
    s3_metadata = s3.get_file_metadata(s3_key)

    # Verify org_id matches (defense in depth on top of the prefix scope)
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Get object info (ContentType / ContentLength)
    try:
        object_info = client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except Exception as e:
        logger.error(f"Failed to get object info: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    # Create metadata response; user metadata is preferred, with
    # head_object values (and "now" timestamps) as fallbacks
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=s3_metadata.get("org_id"),
        document_type=DocumentType(s3_metadata.get("document_type")),
        filename=s3_metadata.get("filename"),
        content_type=object_info.get("ContentType"),
        file_size=int(s3_metadata.get("file_size", object_info.get("ContentLength", 0))),
        s3_key=s3_key,
        created_at=datetime.fromisoformat(s3_metadata.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.fromisoformat(s3_metadata.get("updated_at", datetime.utcnow().isoformat()))
    )

    logger.info(f"Get document completed - document_id: {document_id}")
    return metadata
|
||||
|
||||
@router.get("/{document_id}/download-url", response_model=DownloadUrlResponse)
async def get_download_url(request: Request, document_id: str, expires_in: int = 3600):
    """Get presigned download URL.

    Args:
        document_id: UUID of the document to link to.
        expires_in: URL lifetime in seconds (default 1 hour).
            NOTE(review): not range-validated — a caller can request an
            arbitrarily long or non-positive expiry; confirm acceptable.
    """
    org_id = request.state.org_id
    logger.info(f"Download URL request - document_id: {document_id}, org_id: {org_id}")

    # List objects to find the document under the org/document prefix
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)

    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    # Empty listing -> 404
    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    s3_key = response["Contents"][0]["Key"]

    # Verify org_id matches before handing out a URL
    s3_metadata = s3.get_file_metadata(s3_key)
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Generate download URL with the caller-supplied expiry
    download_url = s3.presigned_download_url(s3_key, expires_in)

    logger.info(f"Download URL generated - document_id: {document_id}")
    return DownloadUrlResponse(download_url=download_url, s3_key=s3_key, expires_in=expires_in)
|
||||
|
||||
@router.get("/{document_id}/fields", response_model=FieldsResponse)
async def get_document_fields(request: Request, document_id: str):
    """Get PDF form fields (PDF only).

    Resolves the document under the caller's org, verifies ownership and that
    it is a PDF, then downloads it to a temp file and extracts its form fields.

    Raises:
        HTTPException: 404 if not found, 403 on org mismatch, 400 for non-PDF
            documents, 500 on S3 or field-discovery failures.
    """
    org_id = request.state.org_id
    logger.info(f"Fields request - document_id: {document_id}, org_id: {org_id}")

    # List objects to find the document
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    s3_key = response["Contents"][0]["Key"]

    # Verify org_id matches
    s3_metadata = s3.get_file_metadata(s3_key)
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Check if PDF
    document_type = s3_metadata.get("document_type")
    if document_type != DocumentType.PDF.value:
        logger.error(f"Document is not PDF: {document_type}")
        raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")

    # Download and discover fields.
    # BUG FIX: pdf_path must be pre-initialized — if download_to_temp() raises,
    # the finally block previously hit a NameError on the unbound name,
    # masking the real error.
    pdf_path = None
    try:
        pdf_path = s3.download_to_temp(s3_key)
        fields = pdf.discover_fields(pdf_path)
        logger.info(f"Fields discovered - document_id: {document_id}, count: {len(fields)}")
    except Exception as e:
        logger.error(f"Failed to discover fields: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to discover fields: {e}")
    finally:
        # Always remove the temp file, whether discovery succeeded or not.
        if pdf_path and os.path.exists(pdf_path):
            os.unlink(pdf_path)

    return FieldsResponse(
        document_id=document_id,
        document_type=DocumentType.PDF,
        fields=fields
    )
|
||||
|
||||
@router.delete("/{document_id}")
async def delete_document(request: Request, document_id: str):
    """Delete document"""
    org_id = request.state.org_id
    logger.info(f"Delete request - document_id: {document_id}, org_id: {org_id}")

    # Resolve the stored object for this org/document pair.
    prefix = utils.s3_path_prefix(org_id, document_id)
    client = s3.get_client()
    try:
        listing = client.list_objects_v2(
            Bucket=settings.s3_bucket, Prefix=prefix, MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # The object metadata must confirm the caller's org owns this document.
    if s3.get_file_metadata(s3_key).get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Remove the object itself.
    try:
        s3.delete_file(s3_key)
        logger.info(f"Document deleted - document_id: {document_id}")
    except Exception as e:
        logger.error(f"Failed to delete document: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}")

    return {"message": "Document deleted successfully"}
|
||||
101
app/s3.py
Normal file
101
app/s3.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import boto3
|
||||
import tempfile
|
||||
import os
|
||||
from botocore.client import Config
|
||||
from fastapi import UploadFile
|
||||
from app.config import settings
|
||||
from app.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
def get_client():
    """Build a boto3 S3 client from application settings (SigV4 signing)."""
    return boto3.client(
        "s3",
        endpoint_url=settings.s3_endpoint,
        aws_access_key_id=settings.s3_access_key,
        aws_secret_access_key=settings.s3_secret_key,
        region_name=settings.s3_region,
        config=Config(signature_version="s3v4"),
    )
|
||||
|
||||
def ensure_bucket_exists() -> None:
    """Ensure the S3 bucket exists, create it if it doesn't exist.

    Raises:
        Exception: If bucket creation fails (service will fail to start)
    """
    client = get_client()
    try:
        client.head_bucket(Bucket=settings.s3_bucket)
        logger.info(f"Bucket '{settings.s3_bucket}' already exists")
    except client.exceptions.ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            try:
                # BUG FIX: AWS S3 rejects an explicit LocationConstraint of
                # "us-east-1" — that region must be requested with no
                # CreateBucketConfiguration at all. MinIO and every other
                # region accept (or require) the explicit constraint.
                create_kwargs = {"Bucket": settings.s3_bucket}
                if settings.s3_region and settings.s3_region != "us-east-1":
                    create_kwargs["CreateBucketConfiguration"] = {
                        "LocationConstraint": settings.s3_region
                    }
                client.create_bucket(**create_kwargs)
                logger.info(f"Created bucket '{settings.s3_bucket}'")
            except Exception as create_error:
                logger.error(f"Failed to create bucket '{settings.s3_bucket}': {create_error}")
                raise
        else:
            logger.error(f"Error checking bucket: {e}")
            raise
|
||||
|
||||
def upload_file(file: UploadFile, s3_key: str, content_type: str, metadata: dict | None = None) -> str:
    """Upload file to S3 with metadata.

    Args:
        file: Incoming FastAPI upload; its underlying stream is sent to S3.
        s3_key: Destination object key.
        content_type: MIME type stored as the object's ContentType.
        metadata: Optional user metadata attached to the object.
            (Annotation fixed: the default is None, so the type is ``dict | None``.)

    Returns:
        The s3_key the object was written to.
    """
    client = get_client()

    extra_args = {"ContentType": content_type}
    if metadata:
        extra_args["Metadata"] = metadata

    client.upload_fileobj(
        file.file,
        settings.s3_bucket,
        s3_key,
        ExtraArgs=extra_args
    )
    return s3_key
|
||||
|
||||
def delete_file(s3_key: str) -> None:
    """Delete file from S3"""
    get_client().delete_object(Bucket=settings.s3_bucket, Key=s3_key)
|
||||
|
||||
def file_exists(s3_key: str) -> bool:
    """Check if file exists in S3"""
    client = get_client()
    try:
        # HEAD raises ClientError (404/403) when the object is absent.
        client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except client.exceptions.ClientError:
        return False
    return True
|
||||
|
||||
def get_file_metadata(s3_key: str) -> dict:
    """Get file metadata from S3"""
    head = get_client().head_object(Bucket=settings.s3_bucket, Key=s3_key)
    # User metadata only; empty dict when the object carries none.
    return head.get("Metadata", {})
|
||||
|
||||
def download_to_temp(s3_key: str) -> str:
    """Download file from S3 to a temp file and return its path.

    The caller owns the returned file and must unlink it when done.

    Raises:
        Exception: Propagates any S3 download error; the temp file is
            removed first so a failed download does not leak it.
    """
    client = get_client()
    suffix = os.path.splitext(s3_key)[-1] or ".tmp"
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with tmp:
            client.download_fileobj(settings.s3_bucket, s3_key, tmp)
    except Exception:
        # BUG FIX: previously a failed download left the empty temp file
        # behind on disk.
        os.unlink(tmp.name)
        raise
    return tmp.name
|
||||
|
||||
def presigned_download_url(s3_key: str, expires_in: int = 3600) -> str:
    """Generate presigned download URL"""
    params = {"Bucket": settings.s3_bucket, "Key": s3_key}
    return get_client().generate_presigned_url(
        "get_object", Params=params, ExpiresIn=expires_in
    )
|
||||
66
app/utils.py
Normal file
66
app/utils.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import uuid
|
||||
import magic
|
||||
from fastapi import HTTPException, UploadFile
|
||||
from app.config import settings
|
||||
from app.enums import DocumentType
|
||||
|
||||
def generate_document_id() -> str:
    """Return a fresh random UUID4 string identifying a document."""
    return f"{uuid.uuid4()}"
|
||||
|
||||
def s3_path_prefix(org_id: str, document_id: str) -> str:
    """Return the S3 key prefix under which a document's objects live."""
    return "documents/" + org_id + "/" + document_id + "/"
|
||||
|
||||
def detect_content_type(file: UploadFile) -> str:
    """Sniff the MIME type from the first 2 KiB of the upload via libmagic."""
    # Rewind before and after sniffing so later readers see the full stream.
    file.file.seek(0)
    head = file.file.read(2048)
    file.file.seek(0)

    sniffer = magic.Magic(mime=True)
    return sniffer.from_buffer(head)
|
||||
|
||||
def detect_document_type(filename: str, content_type: str) -> DocumentType:
    """Resolve a DocumentType, preferring the MIME type over the extension."""
    by_mime = DocumentType.from_mime_type(content_type)
    if by_mime:
        return by_mime
    return DocumentType.from_extension(filename)
|
||||
|
||||
def get_file_size_limit(document_type: DocumentType) -> int:
    """Return the configured max upload size (bytes) for the given type."""
    per_type = {
        DocumentType.PDF: settings.max_file_size_pdf,
        DocumentType.DOCX: settings.max_file_size_docx,
        DocumentType.XLSX: settings.max_file_size_xlsx,
        DocumentType.JPG: settings.max_file_size_jpg,
        DocumentType.JPEG: settings.max_file_size_jpeg,
        DocumentType.PNG: settings.max_file_size_png,
        DocumentType.GIF: settings.max_file_size_gif,
    }
    # Unknown types fall back to the service-wide default limit.
    return per_type.get(document_type, settings.max_file_size_default)
|
||||
|
||||
def validate_file_size(file_size: int, document_type: DocumentType) -> None:
    """Raise HTTP 413 when file_size exceeds the limit for the given type."""
    max_size = get_file_size_limit(document_type)
    if file_size <= max_size:
        return
    raise HTTPException(
        status_code=413,
        detail=f"File size {file_size} exceeds maximum {max_size} for {document_type.value}"
    )
|
||||
|
||||
def document_s3_key(org_id: str, document_id: str, filename: str) -> str:
    """Generate the full S3 object key for a document.

    BUG FIX: the previous version ignored ``filename`` and appended the
    literal text "(unknown)" (a template-substitution artifact), so every
    document's key ended identically and the original filename was lost.
    """
    return f"{s3_path_prefix(org_id, document_id)}{filename}"
|
||||
|
||||
def sanitize_filename(filename: str) -> str:
    """Sanitize filename for S3: no path separators, only safe characters."""
    # Path separators become underscores so the name stays a single segment.
    normalized = filename.replace("/", "_").replace("\\", "_")
    # Whitelist of characters allowed to survive into the key.
    allowed = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_")
    return "".join(ch for ch in normalized if ch in allowed)
|
||||
99
flake.lock
generated
Normal file
99
flake.lock
generated
Normal file
@@ -0,0 +1,99 @@
|
||||
{
|
||||
"nodes": {
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1776548001,
|
||||
"narHash": "sha256-ZSK0NL4a1BwVbbTBoSnWgbJy9HeZFXLYQizjb2DPF24=",
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "b12141ef619e0a9c1c84dc8c684040326f27cdcc",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nixos",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"pyproject-build-systems": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
],
|
||||
"pyproject-nix": [
|
||||
"pyproject-nix"
|
||||
],
|
||||
"uv2nix": [
|
||||
"uv2nix"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1776659114,
|
||||
"narHash": "sha256-qapCOQmR++yZSY43dzrp3wCrkOTLpod+ONtJWBk6iKU=",
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "build-system-pkgs",
|
||||
"rev": "ffaa2161dd5d63e0e94591f86b54fc239660fb2e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "build-system-pkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"pyproject-nix": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1776715674,
|
||||
"narHash": "sha256-Gs1VnEkCkkRZxJQAC/Dhz0Jbfi22mFXChbtNg9w/Ybg=",
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "pyproject.nix",
|
||||
"rev": "69f57f27e52a87c54e28138a75ec741cd46663c9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "pyproject.nix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs",
|
||||
"pyproject-build-systems": "pyproject-build-systems",
|
||||
"pyproject-nix": "pyproject-nix",
|
||||
"uv2nix": "uv2nix"
|
||||
}
|
||||
},
|
||||
"uv2nix": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
],
|
||||
"pyproject-nix": [
|
||||
"pyproject-nix"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1776718528,
|
||||
"narHash": "sha256-XeGmo/BhkFXd8vVyendr3X4mQmw7CEkeQcpy7AHbVcg=",
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "uv2nix",
|
||||
"rev": "60982c30e16db3e0cba6c0ed13f0894b06ab2bf1",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "pyproject-nix",
|
||||
"repo": "uv2nix",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
143
flake.nix
Normal file
143
flake.nix
Normal file
@@ -0,0 +1,143 @@
|
||||
{
|
||||
description = "document-service using uv2nix";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
||||
|
||||
pyproject-nix = {
|
||||
url = "github:pyproject-nix/pyproject.nix";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
uv2nix = {
|
||||
url = "github:pyproject-nix/uv2nix";
|
||||
inputs.pyproject-nix.follows = "pyproject-nix";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
pyproject-build-systems = {
|
||||
url = "github:pyproject-nix/build-system-pkgs";
|
||||
inputs.pyproject-nix.follows = "pyproject-nix";
|
||||
inputs.uv2nix.follows = "uv2nix";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
};
|
||||
|
||||
outputs =
|
||||
{
|
||||
nixpkgs,
|
||||
pyproject-nix,
|
||||
uv2nix,
|
||||
pyproject-build-systems,
|
||||
...
|
||||
}:
|
||||
let
|
||||
inherit (nixpkgs) lib;
|
||||
forAllSystems = lib.genAttrs lib.systems.flakeExposed;
|
||||
|
||||
workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; };
|
||||
|
||||
overlay = workspace.mkPyprojectOverlay {
|
||||
sourcePreference = "wheel";
|
||||
};
|
||||
|
||||
editableOverlay = workspace.mkEditablePyprojectOverlay {
|
||||
root = "$REPO_ROOT";
|
||||
};
|
||||
|
||||
pythonSets = forAllSystems (
|
||||
system:
|
||||
let
|
||||
pkgs = nixpkgs.legacyPackages.${system};
|
||||
python = pkgs.python3;
|
||||
in
|
||||
(pkgs.callPackage pyproject-nix.build.packages {
|
||||
inherit python;
|
||||
}).overrideScope
|
||||
(
|
||||
lib.composeManyExtensions [
|
||||
pyproject-build-systems.overlays.wheel
|
||||
overlay
|
||||
]
|
||||
)
|
||||
);
|
||||
|
||||
in
|
||||
{
|
||||
devShells = forAllSystems (
|
||||
system:
|
||||
let
|
||||
pkgs = nixpkgs.legacyPackages.${system};
|
||||
pythonSet = pythonSets.${system}.overrideScope editableOverlay;
|
||||
virtualenv = pythonSet.mkVirtualEnv "document-service-dev-env" workspace.deps.all;
|
||||
in
|
||||
{
|
||||
default = pkgs.mkShell {
|
||||
packages = [
|
||||
virtualenv
|
||||
pkgs.uv
|
||||
pkgs.pyright
|
||||
pkgs.file
|
||||
];
|
||||
env = {
|
||||
UV_NO_SYNC = "1";
|
||||
UV_PYTHON = pythonSet.python.interpreter;
|
||||
UV_PYTHON_DOWNLOADS = "never";
|
||||
LD_LIBRARY_PATH = "${pkgs.file.out}/lib:$LD_LIBRARY_PATH";
|
||||
};
|
||||
shellHook = ''
|
||||
unset PYTHONPATH
|
||||
export REPO_ROOT=$(git rev-parse --show-toplevel)
|
||||
'';
|
||||
};
|
||||
}
|
||||
);
|
||||
|
||||
packages = forAllSystems (system: let
|
||||
pkgs = nixpkgs.legacyPackages.${system};
|
||||
pythonSet = pythonSets.${system}.overrideScope editableOverlay;
|
||||
virtualenv = pythonSet.mkVirtualEnv "document-service-env" workspace.deps.default;
|
||||
|
||||
# Create a derivation that includes the application code
|
||||
appCode = pkgs.stdenv.mkDerivation {
|
||||
name = "document-service-code";
|
||||
src = ./.;
|
||||
installPhase = ''
|
||||
mkdir -p $out/app
|
||||
cp -r app/* $out/app/
|
||||
cp pyproject.toml $out/
|
||||
'';
|
||||
};
|
||||
in {
|
||||
default = virtualenv;
|
||||
dockerImage = pkgs.dockerTools.buildLayeredImage {
|
||||
name = "document-service";
|
||||
contents = [
|
||||
virtualenv
|
||||
pkgs.bashInteractive
|
||||
pkgs.busybox
|
||||
pkgs.shadow
|
||||
pkgs.file
|
||||
pkgs.git # Include git for version info
|
||||
appCode # Include application code
|
||||
];
|
||||
config = {
|
||||
Cmd = ["/bin/python" "-m" "uvicorn" "app.main:app" "--host" "0.0.0.0" "--port" "8082"];
|
||||
Env = [
|
||||
"PYTHONUNBUFFERED=1"
|
||||
"PYTHONPATH=/app"
|
||||
"S3_ENDPOINT"
|
||||
"S3_ACCESS_KEY"
|
||||
"S3_SECRET_KEY"
|
||||
"S3_BUCKET"
|
||||
"S3_REGION"
|
||||
"HOST"
|
||||
"PORT"
|
||||
"LOG_LEVEL"
|
||||
];
|
||||
WorkingDir = "/app";
|
||||
};
|
||||
};
|
||||
});
|
||||
};
|
||||
}
|
||||
14
ops/chart/Chart.yaml
Normal file
14
ops/chart/Chart.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
apiVersion: v2
|
||||
name: document-service
|
||||
description: Generic document management service
|
||||
type: application
|
||||
version: 1.0.0
|
||||
appVersion: "1.0.0"
|
||||
keywords:
|
||||
- python
|
||||
- fastapi
|
||||
- document-management
|
||||
dependencies:
|
||||
- name: common
|
||||
version: "4.6.2"
|
||||
repository: https://bjw-s-labs.github.io/helm-charts/
|
||||
78
ops/chart/values.yaml
Normal file
78
ops/chart/values.yaml
Normal file
@@ -0,0 +1,78 @@
|
||||
controllers:
|
||||
main:
|
||||
enabled: true
|
||||
type: deployment
|
||||
replicas: 1
|
||||
containers:
|
||||
main:
|
||||
image:
|
||||
repository: gitea.corredorconect.com/software-engineering/document-service
|
||||
tag: '{{ $.Chart.AppVersion }}'
|
||||
env:
|
||||
LOG_LEVEL: info
|
||||
PORT: "8082"
|
||||
S3_ENDPOINT:
|
||||
value: "http://minio:9000"
|
||||
S3_ACCESS_KEY:
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-secrets'
|
||||
key: s3AccessKey
|
||||
S3_SECRET_KEY:
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-secrets'
|
||||
key: s3SecretKey
|
||||
S3_BUCKET:
|
||||
value: "document-bucket"
|
||||
S3_REGION:
|
||||
value: "us-east-1"
|
||||
probes:
|
||||
liveness:
|
||||
enabled: true
|
||||
custom: true
|
||||
spec:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8082
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
readiness:
|
||||
enabled: true
|
||||
custom: true
|
||||
spec:
|
||||
httpGet:
|
||||
path: /health/ready
|
||||
port: 8082
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
|
||||
service:
|
||||
main:
|
||||
controller: main
|
||||
type: ClusterIP
|
||||
ports:
|
||||
http:
|
||||
port: 8082
|
||||
protocol: HTTP
|
||||
|
||||
external-secret:
|
||||
enabled: true
|
||||
apiVersion: external-secrets.io/v1
|
||||
kind: ExternalSecret
|
||||
suffix: secrets
|
||||
spec:
|
||||
spec:
|
||||
refreshInterval: 0s
|
||||
secretStoreRef:
|
||||
name: cluster-secrets-store
|
||||
kind: ClusterSecretStore
|
||||
target:
|
||||
name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-secrets'
|
||||
creationPolicy: Owner
|
||||
dataFrom:
|
||||
- sourceRef:
|
||||
generatorRef:
|
||||
apiVersion: generators.external-secrets.io/v1alpha1
|
||||
kind: Password
|
||||
name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-password-generator'
|
||||
32
pyproject.toml
Normal file
32
pyproject.toml
Normal file
@@ -0,0 +1,32 @@
|
||||
[project]
|
||||
name = "document-service"
|
||||
version = "1.0.0"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"fastapi>=0.115.0",
|
||||
"uvicorn[standard]>=0.30.0",
|
||||
"pypdf>=4.3.1",
|
||||
"boto3>=1.35.0",
|
||||
"python-multipart>=0.0.9",
|
||||
"pydantic>=2.8.0",
|
||||
"pydantic-settings>=2.4.0",
|
||||
"python-magic>=0.4.27",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling", "editables"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["app"]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"ruff>=0.6.0",
|
||||
"pytest>=8.0.0",
|
||||
"pytest-asyncio>=0.23.0",
|
||||
"httpx>=0.27.0",
|
||||
"reportlab>=4.0.0",
|
||||
"pypdf>=4.3.1",
|
||||
"moto>=5.0.0",
|
||||
]
|
||||
68
tests/conftest.py
Normal file
68
tests/conftest.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
Test configuration and fixtures for document-service tests.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from fastapi.testclient import TestClient
|
||||
from unittest.mock import Mock, patch
|
||||
from moto import mock_aws
|
||||
import boto3
|
||||
|
||||
from app.main import app
|
||||
|
||||
# Test data paths
|
||||
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
|
||||
|
||||
@pytest.fixture
def test_client():
    """Create a test client with auth bypass."""
    client = TestClient(app)
    return client
|
||||
|
||||
@pytest.fixture
def sample_org_id():
    """Sample organization ID for testing."""
    org_id = "test-org-123"
    return org_id
|
||||
|
||||
@pytest.fixture
def sample_document_id():
    """Sample document ID for testing."""
    document_id = "test-doc-456"
    return document_id
|
||||
|
||||
@pytest.fixture
def test_pdf_files():
    """Paths to test PDF files."""
    names = ("simple_form", "complex_form", "no_form", "large_form")
    return {name: os.path.join(FIXTURES_DIR, f"{name}.pdf") for name in names}
|
||||
|
||||
@pytest.fixture
def mock_s3_client():
    """Create a mock S3 client for testing."""
    with mock_aws():
        s3_client = boto3.client(
            "s3",
            region_name="us-east-1",
            aws_access_key_id="minioadmin",
            aws_secret_access_key="minioadmin",
        )
        # Create test bucket
        s3_client.create_bucket(Bucket="document-bucket")
        yield s3_client
|
||||
|
||||
@pytest.fixture
def auth_bypass_middleware():
    """Fixture to bypass auth middleware in tests."""

    def _set_test_org(request):
        # Simulate what the real auth middleware would attach.
        request.state.org_id = "test-org-123"
        return request

    return _set_test_org
|
||||
|
||||
@pytest.fixture
def sample_auth_token():
    """Sample auth token for testing."""
    token = "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJvcmdfaWQiOiJ0ZXN0LW9yZy0xMjMifQ.test"
    return token
|
||||
304
tests/fixtures/complex_form.pdf
vendored
Normal file
304
tests/fixtures/complex_form.pdf
vendored
Normal file
@@ -0,0 +1,304 @@
|
||||
%PDF-1.3
|
||||
%âãÏÓ
|
||||
1 0 obj
|
||||
<<
|
||||
/Producer (pypdf)
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/Type /Pages
|
||||
/Count 1
|
||||
/Kids [ 4 0 R ]
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
/AcroForm <<
|
||||
/Fields [ <<
|
||||
/FT /Tx
|
||||
/T (first\137name)
|
||||
/V ()
|
||||
/Rect [ 200 690 400 710 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (last\137name)
|
||||
/V ()
|
||||
/Rect [ 200 640 400 660 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (email)
|
||||
/V ()
|
||||
/Rect [ 200 590 400 610 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (phone)
|
||||
/V ()
|
||||
/Rect [ 200 540 400 560 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (address)
|
||||
/V ()
|
||||
/Rect [ 200 490 400 510 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (city)
|
||||
/V ()
|
||||
/Rect [ 200 440 400 460 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (state)
|
||||
/V ()
|
||||
/Rect [ 200 390 400 410 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (zip\137code)
|
||||
/V ()
|
||||
/Rect [ 200 340 400 360 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Ch
|
||||
/T (country)
|
||||
/V ()
|
||||
/Opt [ (USA) (Canada) (UK) (Germany) (France) ]
|
||||
/Rect [ 200 290 400 310 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Btn
|
||||
/T (gender)
|
||||
/V (male)
|
||||
/Rect [ 200 240 220 260 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Btn
|
||||
/T (gender)
|
||||
/V (female)
|
||||
/Rect [ 300 240 320 260 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Btn
|
||||
/T (reading)
|
||||
/V /Off
|
||||
/Rect [ 200 190 220 210 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Btn
|
||||
/T (sports)
|
||||
/V /Off
|
||||
/Rect [ 200 160 220 180 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Btn
|
||||
/T (music)
|
||||
/V /Off
|
||||
/Rect [ 200 130 220 150 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Btn
|
||||
/T (travel)
|
||||
/V /Off
|
||||
/Rect [ 200 100 220 120 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Btn
|
||||
/T (agree\137terms)
|
||||
/V /Off
|
||||
/Rect [ 200 140 220 160 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (signature)
|
||||
/V ()
|
||||
/Rect [ 200 90 400 110 ]
|
||||
/Ff 0
|
||||
>> ]
|
||||
>>
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 5 0 R
|
||||
/MediaBox [ 0 0 612 792 ]
|
||||
/Resources <<
|
||||
/Font 6 0 R
|
||||
/ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>>
|
||||
/Rotate 0
|
||||
/Trans <<
|
||||
>>
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Annots [ <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (first\137name)
|
||||
/V ()
|
||||
/Rect [ 200 690 400 710 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (last\137name)
|
||||
/V ()
|
||||
/Rect [ 200 640 400 660 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (email)
|
||||
/V ()
|
||||
/Rect [ 200 590 400 610 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (phone)
|
||||
/V ()
|
||||
/Rect [ 200 540 400 560 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (address)
|
||||
/V ()
|
||||
/Rect [ 200 490 400 510 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (city)
|
||||
/V ()
|
||||
/Rect [ 200 440 400 460 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (state)
|
||||
/V ()
|
||||
/Rect [ 200 390 400 410 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (zip\137code)
|
||||
/V ()
|
||||
/Rect [ 200 340 400 360 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Ch
|
||||
/T (country)
|
||||
/V ()
|
||||
/Rect [ 200 290 400 310 ]
|
||||
/Ff 0
|
||||
/Opt [ (USA) (Canada) (UK) (Germany) (France) ]
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Btn
|
||||
/T (gender)
|
||||
/V (male)
|
||||
/Rect [ 200 240 220 260 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Btn
|
||||
/T (gender)
|
||||
/V (female)
|
||||
/Rect [ 300 240 320 260 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Btn
|
||||
/T (reading)
|
||||
/V /Off
|
||||
/Rect [ 200 190 220 210 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Btn
|
||||
/T (sports)
|
||||
/V /Off
|
||||
/Rect [ 200 160 220 180 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Btn
|
||||
/T (music)
|
||||
/V /Off
|
||||
/Rect [ 200 130 220 150 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Btn
|
||||
/T (travel)
|
||||
/V /Off
|
||||
/Rect [ 200 100 220 120 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Btn
|
||||
/T (agree\137terms)
|
||||
/V /Off
|
||||
/Rect [ 200 140 220 160 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (signature)
|
||||
/V ()
|
||||
/Rect [ 200 90 400 110 ]
|
||||
/Ff 0
|
||||
>> ]
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ]
|
||||
/Length 291
|
||||
>>
|
||||
stream
|
||||
GasbV_+Fea&;KY%MZ9UrC9m8.oN"UdKHc".Gmj%B,>D(A;p`!tWO(4\)'k<]nE'P8R95j8f]2oKJNJY1f"tI,Dm8oIL>-,'An-7/XP_7&hmsPV2$VZlJVuKljga3q-e_fL*;+[hpAoJXWqmrLU,"s52O'g'kTenY-)^6!E]<t>XGGKULRl:>id?'u8b4h!>BX;G^/rC%S5.uq%27\VHe*eP7/%>f=QN:Hc+'*-ihD-.,/'o(;:.X+4s[#!Dq5i9,$f'o&NC;.U."[j3.eA/Se#D\)eRtd.%ou~>
|
||||
endstream
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/F1 7 0 R
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica
|
||||
/Encoding /WinAnsiEncoding
|
||||
/Name /F1
|
||||
/Subtype /Type1
|
||||
/Type /Font
|
||||
>>
|
||||
endobj
|
||||
xref
|
||||
0 8
|
||||
0000000000 65535 f
|
||||
0000000015 00000 n
|
||||
0000000054 00000 n
|
||||
0000000113 00000 n
|
||||
0000001378 00000 n
|
||||
0000003056 00000 n
|
||||
0000003438 00000 n
|
||||
0000003469 00000 n
|
||||
trailer
|
||||
<<
|
||||
/Size 8
|
||||
/Root 3 0 R
|
||||
/Info 1 0 R
|
||||
>>
|
||||
startxref
|
||||
3576
|
||||
%%EOF
|
||||
371
tests/fixtures/generate_test_pdfs.py
vendored
Normal file
371
tests/fixtures/generate_test_pdfs.py
vendored
Normal file
@@ -0,0 +1,371 @@
|
||||
"""
|
||||
Generate test PDF files for document-service testing.
|
||||
|
||||
This script creates various test PDFs with actual AcroForm fields:
|
||||
- Simple form PDF with basic form fields
|
||||
- Complex form PDF with multiple field types
|
||||
- No form PDF without form fields
|
||||
- Large form PDF for size validation testing
|
||||
"""
|
||||
|
||||
import os
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.lib import colors
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
from pypdf.generic import (
|
||||
NameObject,
|
||||
create_string_object,
|
||||
NumberObject,
|
||||
ArrayObject,
|
||||
DictionaryObject,
|
||||
BooleanObject,
|
||||
)
|
||||
|
||||
# Output directory
|
||||
OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
def create_simple_form_pdf():
    """Create a simple PDF with basic form fields."""
    output_path = os.path.join(OUTPUT_DIR, "simple_form.pdf")

    # Draw the static page (title + labels) with reportlab.
    c = canvas.Canvas(output_path, pagesize=letter)
    c.setFont("Helvetica", 16)
    c.drawString(100, 750, "Simple Form Test")
    c.setFont("Helvetica", 12)
    for y, label in (
        (700, "Name:"),
        (650, "Email:"),
        (600, "Phone:"),
        (550, "Country:"),
        (500, "Birth Date:"),
        (450, "Agree to Terms:"),
    ):
        c.drawString(100, y, label)
    c.save()

    # Re-open with pypdf to attach real AcroForm fields.
    reader = PdfReader(output_path)
    writer = PdfWriter()
    writer.add_page(reader.pages[0])

    def _rect(x1, y1, x2, y2):
        """Build a pypdf /Rect array from plain numbers."""
        return ArrayObject(
            [NumberObject(x1), NumberObject(y1), NumberObject(x2), NumberObject(y2)]
        )

    def _text_field(name, y):
        """Build an empty text (/Tx) field dictionary at row y."""
        return DictionaryObject({
            NameObject("/FT"): NameObject("/Tx"),
            NameObject("/T"): create_string_object(name),
            NameObject("/V"): create_string_object(""),
            NameObject("/Rect"): _rect(200, y, 400, y + 20),
            NameObject("/Ff"): NumberObject(0),
        })

    fields = [
        _text_field("name", 690),
        _text_field("email", 640),
        _text_field("phone", 590),
    ]

    # Country dropdown (/Ch) with a fixed option list.
    fields.append(DictionaryObject({
        NameObject("/FT"): NameObject("/Ch"),
        NameObject("/T"): create_string_object("country"),
        NameObject("/V"): create_string_object(""),
        NameObject("/Opt"): ArrayObject([
            create_string_object(opt)
            for opt in ("USA", "Canada", "UK", "Germany", "France")
        ]),
        NameObject("/Rect"): _rect(200, 540, 400, 560),
        NameObject("/Ff"): NumberObject(0),
    }))

    fields.append(_text_field("birth_date", 490))

    # Agree-to-terms checkbox (/Btn), initially off.
    fields.append(DictionaryObject({
        NameObject("/FT"): NameObject("/Btn"),
        NameObject("/T"): create_string_object("agree_terms"),
        NameObject("/V"): NameObject("/Off"),
        NameObject("/Rect"): _rect(200, 440, 220, 460),
        NameObject("/Ff"): NumberObject(0),
    }))

    # Attach each field to the page as a widget annotation.
    page = writer.pages[0]
    if "/Annots" not in page:
        page[NameObject("/Annots")] = ArrayObject()

    for field in fields:
        widget = DictionaryObject({
            NameObject("/Subtype"): NameObject("/Widget"),
            NameObject("/FT"): field[NameObject("/FT")],
            NameObject("/T"): field[NameObject("/T")],
            NameObject("/V"): field.get(NameObject("/V"), NameObject("")),
            NameObject("/Rect"): field[NameObject("/Rect")],
            NameObject("/Ff"): field.get(NameObject("/Ff"), NumberObject(0)),
        })
        if NameObject("/Opt") in field:
            widget[NameObject("/Opt")] = field[NameObject("/Opt")]
        page[NameObject("/Annots")].append(widget)

    # Register the fields on the document catalog's AcroForm.
    writer._root_object[NameObject("/AcroForm")] = DictionaryObject({
        NameObject("/Fields"): ArrayObject(fields),
    })

    # Save the PDF
    with open(output_path, "wb") as f:
        writer.write(f)

    print(f"Created: {output_path}")
|
||||
|
||||
def create_complex_form_pdf():
    """Create a complex test PDF exercising text, choice, radio, and checkbox fields.

    Writes ``complex_form.pdf`` into OUTPUT_DIR: reportlab draws the static
    labels, then pypdf attaches 16 AcroForm fields plus matching widget
    annotations.
    """
    output_path = os.path.join(OUTPUT_DIR, "complex_form.pdf")

    # Draw the static page (title + one label per field row) with reportlab.
    c = canvas.Canvas(output_path, pagesize=letter)
    c.setFont("Helvetica", 16)
    c.drawString(100, 750, "Complex Form Test")

    c.setFont("Helvetica", 12)
    labels = [
        "First Name:", "Last Name:", "Email:", "Phone:", "Address:",
        "City:", "State:", "Zip Code:", "Country:", "Gender:",
        "Interests:", "Agree to Terms:", "Signature:",
    ]
    for row, text in enumerate(labels):
        # Rows are stacked 50 points apart, starting at y=700.
        c.drawString(100, 700 - row * 50, text)

    c.save()

    # Re-open the rendered page with pypdf so form fields can be attached.
    reader = PdfReader(output_path)
    writer = PdfWriter()
    writer.add_page(reader.pages[0])

    def _rect(x0, y0, x1, y1):
        """Build a /Rect array from four coordinates."""
        return ArrayObject(
            [NumberObject(x0), NumberObject(y0), NumberObject(x1), NumberObject(y1)]
        )

    def _field(ftype, name, value, rect, opts=None):
        """Build one AcroForm field dictionary (key order matters for output)."""
        entries = {
            NameObject("/FT"): NameObject(ftype),
            NameObject("/T"): create_string_object(name),
            NameObject("/V"): value,
        }
        if opts is not None:
            # /Opt sits between /V and /Rect, matching the serialized layout.
            entries[NameObject("/Opt")] = opts
        entries[NameObject("/Rect")] = rect
        entries[NameObject("/Ff")] = NumberObject(0)
        return DictionaryObject(entries)

    fields = []

    # Plain text inputs, one per label row.
    for name, x, y in [
        ("first_name", 200, 690),
        ("last_name", 200, 640),
        ("email", 200, 590),
        ("phone", 200, 540),
        ("address", 200, 490),
        ("city", 200, 440),
        ("state", 200, 390),
        ("zip_code", 200, 340),
    ]:
        fields.append(
            _field("/Tx", name, create_string_object(""), _rect(x, y, x + 200, y + 20))
        )

    # Country dropdown with a fixed option list.
    country_options = ArrayObject(
        [create_string_object(v) for v in ("USA", "Canada", "UK", "Germany", "France")]
    )
    fields.append(
        _field(
            "/Ch",
            "country",
            create_string_object(""),
            _rect(200, 290, 400, 310),
            opts=country_options,
        )
    )

    # Two radio-style buttons sharing the "gender" field name.
    fields.append(
        _field("/Btn", "gender", create_string_object("male"), _rect(200, 240, 220, 260))
    )
    fields.append(
        _field("/Btn", "gender", create_string_object("female"), _rect(300, 240, 320, 260))
    )

    # One checkbox per interest, stacked 30 points apart.
    for i, interest in enumerate(['reading', 'sports', 'music', 'travel']):
        fields.append(
            _field(
                "/Btn",
                interest,
                NameObject("/Off"),
                _rect(200, 190 - (i * 30), 220, 210 - (i * 30)),
            )
        )

    # Terms-of-service checkbox.
    fields.append(
        _field("/Btn", "agree_terms", NameObject("/Off"), _rect(200, 140, 220, 160))
    )

    # Signature captured as a plain text field.
    fields.append(
        _field("/Tx", "signature", create_string_object(""), _rect(200, 90, 400, 110))
    )

    # Mirror each field as a /Widget annotation on the page.
    page = writer.pages[0]
    if "/Annots" not in page:
        page[NameObject("/Annots")] = ArrayObject()

    for field in fields:
        widget = DictionaryObject({
            NameObject("/Subtype"): NameObject("/Widget"),
            NameObject("/FT"): field[NameObject("/FT")],
            NameObject("/T"): field[NameObject("/T")],
            NameObject("/V"): field.get(NameObject("/V"), NameObject("")),
            NameObject("/Rect"): field[NameObject("/Rect")],
            NameObject("/Ff"): field.get(NameObject("/Ff"), NumberObject(0)),
        })
        if NameObject("/Opt") in field:
            widget[NameObject("/Opt")] = field[NameObject("/Opt")]
        page[NameObject("/Annots")].append(widget)

    # Register the field list on the document catalog.
    # NOTE(review): writes via the private writer._root_object attribute —
    # works with the pinned pypdf, but may need updating on upgrade.
    acroform = DictionaryObject({
        NameObject("/Fields"): ArrayObject(fields),
    })
    writer._root_object[NameObject("/AcroForm")] = acroform

    # Save the PDF.
    with open(output_path, "wb") as f:
        writer.write(f)

    print(f"Created: {output_path}")
|
||||
|
||||
def create_no_form_pdf():
    """Create a plain PDF with no AcroForm fields (negative-case fixture).

    Writes ``no_form.pdf`` into OUTPUT_DIR; used to verify that field
    discovery returns an empty result for documents without forms.
    """
    output_path = os.path.join(OUTPUT_DIR, "no_form.pdf")

    pdf = canvas.Canvas(output_path, pagesize=letter)
    pdf.setFont("Helvetica", 16)
    pdf.drawString(100, 750, "No Form Test")

    pdf.setFont("Helvetica", 12)
    body_lines = (
        "This PDF has no form fields.",
        "It is used for testing field discovery",
        "on documents without AcroForm fields.",
    )
    for row, text in enumerate(body_lines):
        # Body lines are 50 points apart, starting at y=700.
        pdf.drawString(100, 700 - row * 50, text)

    pdf.save()

    print(f"Created: {output_path}")
|
||||
|
||||
def create_large_form_pdf():
    """Create a multi-page PDF with many labels for size-validation tests.

    Writes ``large_form.pdf`` into OUTPUT_DIR. The file contains only drawn
    text (no AcroForm fields); 50 label rows spill across several pages.
    """
    output_path = os.path.join(OUTPUT_DIR, "large_form.pdf")

    pdf = canvas.Canvas(output_path, pagesize=letter)
    pdf.setFont("Helvetica", 16)
    pdf.drawString(100, 750, "Large Form Test")

    pdf.setFont("Helvetica", 12)
    y = 700
    for index in range(1, 51):
        pdf.drawString(100, y, f"Field {index}:")
        y -= 50
        if y < 50:
            # Out of vertical space: start a fresh page.
            pdf.showPage()
            y = 700

    pdf.save()

    print(f"Created: {output_path}")
|
||||
|
||||
def main():
    """Generate every fixture PDF used by the test suite into OUTPUT_DIR."""
    print("Generating test PDF files...")
    print(f"Output directory: {OUTPUT_DIR}")
    print()

    # Run the generators in a fixed order so console output is stable.
    generators = (
        create_simple_form_pdf,
        create_complex_form_pdf,
        create_no_form_pdf,
        create_large_form_pdf,
    )
    for generate in generators:
        generate()

    print()
    print("All test PDF files generated successfully!")
|
||||
|
||||
# Allow running this module directly as a fixture-generation script.
if __name__ == "__main__":
    main()
|
||||
125
tests/fixtures/large_form.pdf
vendored
Normal file
125
tests/fixtures/large_form.pdf
vendored
Normal file
@@ -0,0 +1,125 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document (opensource)
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/Contents 10 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 11 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Contents 12 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Contents 13 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 9 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:19800101000000+00'00') /Creator (anonymous) /Keywords () /ModDate (D:19800101000000+00'00') /Producer (ReportLab PDF Library - \(opensource\))
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
9 0 obj
|
||||
<<
|
||||
/Count 4 /Kids [ 3 0 R 4 0 R 5 0 R 6 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
10 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 217
|
||||
>>
|
||||
stream
|
||||
Gas30YmS?5&;9"+:GJ\L`7rI@@Oq[]V;)ju4[h(2dJ$.fMDlYNi/6XZ9/-MBqIFpH"0bWR4+VY?&JE4dmBP4$H`s>o>Pd5_5(knN-9C@@=hbnO$/KG<T]uHC6SHeT%fQ2(61,2)kB&jPeh#ln*V7]`-(1#q7P]TrOr967OBGd6R>k'EA?N"sbgn1*RGt<48$Z/.<iqdC<HBN;BdXTjQboF?~>endstream
|
||||
endobj
|
||||
11 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 179
|
||||
>>
|
||||
stream
|
||||
Gas30^C%h3*5qB\:N<.Pcs3$Hl<(9Sj6mHT",_O,eK?ILEeIs/+25o1W?$HFlO(jerB`1_*amY9`!,>fg-:(O.:HsM<c")brI"e6WCOT4gHTe]6:XPR3Z2,/H>lia7mi26F)k6[R>)2Tc&QO]0JmRQ33#uf(:EGYU/pYb,%W<I+0;`+EW~>endstream
|
||||
endobj
|
||||
12 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 182
|
||||
>>
|
||||
stream
|
||||
Gas30YmS?%'SYMZ:N8jHd+m]ZXcA"(*:Fj!$As93eK>>CO@)QnnF80POP6tcHWu&Bi%Q$",OR8C45u,jFR@u"e5F01DQMJaO6&5D+&?+Z'=%F%qt`rY;O"3#"KbqRMK6*1l<JI#\QT.g>jW9fl6'd&lDQ+4eQPFB=)/[R?*6VZ`^9D([>Kog~>endstream
|
||||
endobj
|
||||
13 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 147
|
||||
>>
|
||||
stream
|
||||
Gas3+3spL'$q8S#<P23]FJ9Y&V4a)bG2NT>h1+`5('Z%;U^2`KE+.t@o*+c<HmDMhfg)&^AATHdpsVmX3RhL!69O]%\U_jUJK0dDLK7_Y[]$?TK6gh*/?5bY6!78.Ms>%mcr*lWqbfg@lpOeX~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 14
|
||||
0000000000 65535 f
|
||||
0000000061 00000 n
|
||||
0000000092 00000 n
|
||||
0000000199 00000 n
|
||||
0000000393 00000 n
|
||||
0000000587 00000 n
|
||||
0000000781 00000 n
|
||||
0000000975 00000 n
|
||||
0000001043 00000 n
|
||||
0000001304 00000 n
|
||||
0000001381 00000 n
|
||||
0000001689 00000 n
|
||||
0000001959 00000 n
|
||||
0000002232 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<30157dc3b9cf65b8d1eaf3493559908e><30157dc3b9cf65b8d1eaf3493559908e>]
|
||||
% ReportLab generated PDF document -- digest (opensource)
|
||||
|
||||
/Info 8 0 R
|
||||
/Root 7 0 R
|
||||
/Size 14
|
||||
>>
|
||||
startxref
|
||||
2470
|
||||
%%EOF
|
||||
68
tests/fixtures/no_form.pdf
vendored
Normal file
68
tests/fixtures/no_form.pdf
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document (opensource)
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 6 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:19800101000000+00'00') /Creator (anonymous) /Keywords () /ModDate (D:19800101000000+00'00') /Producer (ReportLab PDF Library - \(opensource\))
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 3 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 226
|
||||
>>
|
||||
stream
|
||||
Gas2Bb6l*?&4Q?lMRuh(2(>rm;UL(=iaR@%P12s;!_o]ip\#oA:h3rL(XCuYYkiVA702`\bERWLTF<pmA'bMe$GLl8m[Gp,mCZM>`irc(:k@<Q,.1t_;U3TSGL0f4RBV`'XKta+*A74'q:3;`A;r@nl60Fm[LVPtD`E'mGib0+5kmB/Rp3p#C+&@HQ1$r/^;:dZ/#koRn*nah\!>!7PW#)X61=m`OB9!~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 8
|
||||
0000000000 65535 f
|
||||
0000000061 00000 n
|
||||
0000000092 00000 n
|
||||
0000000199 00000 n
|
||||
0000000392 00000 n
|
||||
0000000460 00000 n
|
||||
0000000721 00000 n
|
||||
0000000780 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<30157dc3b9cf65b8d1eaf3493559908e><30157dc3b9cf65b8d1eaf3493559908e>]
|
||||
% ReportLab generated PDF document -- digest (opensource)
|
||||
|
||||
/Info 5 0 R
|
||||
/Root 4 0 R
|
||||
/Size 8
|
||||
>>
|
||||
startxref
|
||||
1096
|
||||
%%EOF
|
||||
161
tests/fixtures/simple_form.pdf
vendored
Normal file
161
tests/fixtures/simple_form.pdf
vendored
Normal file
@@ -0,0 +1,161 @@
|
||||
%PDF-1.3
|
||||
%âãÏÓ
|
||||
1 0 obj
|
||||
<<
|
||||
/Producer (pypdf)
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/Type /Pages
|
||||
/Count 1
|
||||
/Kids [ 4 0 R ]
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
/AcroForm <<
|
||||
/Fields [ <<
|
||||
/FT /Tx
|
||||
/T (name)
|
||||
/V ()
|
||||
/Rect [ 200 690 400 710 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (email)
|
||||
/V ()
|
||||
/Rect [ 200 640 400 660 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (phone)
|
||||
/V ()
|
||||
/Rect [ 200 590 400 610 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Ch
|
||||
/T (country)
|
||||
/V ()
|
||||
/Opt [ (USA) (Canada) (UK) (Germany) (France) ]
|
||||
/Rect [ 200 540 400 560 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Tx
|
||||
/T (birth\137date)
|
||||
/V ()
|
||||
/Rect [ 200 490 400 510 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/FT /Btn
|
||||
/T (agree\137terms)
|
||||
/V /Off
|
||||
/Rect [ 200 440 220 460 ]
|
||||
/Ff 0
|
||||
>> ]
|
||||
>>
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 5 0 R
|
||||
/MediaBox [ 0 0 612 792 ]
|
||||
/Resources <<
|
||||
/Font 6 0 R
|
||||
/ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>>
|
||||
/Rotate 0
|
||||
/Trans <<
|
||||
>>
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Annots [ <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (name)
|
||||
/V ()
|
||||
/Rect [ 200 690 400 710 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (email)
|
||||
/V ()
|
||||
/Rect [ 200 640 400 660 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (phone)
|
||||
/V ()
|
||||
/Rect [ 200 590 400 610 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Ch
|
||||
/T (country)
|
||||
/V ()
|
||||
/Rect [ 200 540 400 560 ]
|
||||
/Ff 0
|
||||
/Opt [ (USA) (Canada) (UK) (Germany) (France) ]
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Tx
|
||||
/T (birth\137date)
|
||||
/V ()
|
||||
/Rect [ 200 490 400 510 ]
|
||||
/Ff 0
|
||||
>> <<
|
||||
/Subtype /Widget
|
||||
/FT /Btn
|
||||
/T (agree\137terms)
|
||||
/V /Off
|
||||
/Rect [ 200 440 220 460 ]
|
||||
/Ff 0
|
||||
>> ]
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ]
|
||||
/Length 214
|
||||
>>
|
||||
stream
|
||||
Gas3/_$YcZ&-h():[oO-KC+O7Fj&337*rSs`0Q/<`k!1:qntBjLh1!*5Q?*5,9cn2L]>4V7T^E=1'1`)j"LZXOAkYndii(Rd4^iHO@!??#S:KhY5-Hn'\Y63F`n8+K,.t]c\@9%516]H[@*&9CT1O*F'1H9T&WS2DLGjN]UaM[f"?B)-YBck(&"KsZ*@fJ2kq(gmZ1he)\4'9")1e>M#~>
|
||||
endstream
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/F1 7 0 R
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica
|
||||
/Encoding /WinAnsiEncoding
|
||||
/Name /F1
|
||||
/Subtype /Type1
|
||||
/Type /Font
|
||||
>>
|
||||
endobj
|
||||
xref
|
||||
0 8
|
||||
0000000000 65535 f
|
||||
0000000015 00000 n
|
||||
0000000054 00000 n
|
||||
0000000113 00000 n
|
||||
0000000637 00000 n
|
||||
0000001387 00000 n
|
||||
0000001692 00000 n
|
||||
0000001723 00000 n
|
||||
trailer
|
||||
<<
|
||||
/Size 8
|
||||
/Root 3 0 R
|
||||
/Info 1 0 R
|
||||
>>
|
||||
startxref
|
||||
1830
|
||||
%%EOF
|
||||
464
tests/test_documents.py
Normal file
464
tests/test_documents.py
Normal file
@@ -0,0 +1,464 @@
|
||||
"""
|
||||
Comprehensive test suite for document-service.
|
||||
|
||||
Tests document upload, retrieval, field discovery, and complete workflows.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from fastapi.testclient import TestClient
|
||||
from unittest.mock import Mock, patch
|
||||
from moto import mock_aws
|
||||
import boto3
|
||||
|
||||
from app.main import app
|
||||
from app.pdf import discover_fields
|
||||
|
||||
# Test data paths
|
||||
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
|
||||
|
||||
|
||||
class TestHealthEndpoint:
    """Tests for the service liveness endpoint."""

    def test_health_endpoint(self, test_client):
        """GET /health answers 200 with the canonical ok payload."""
        resp = test_client.get("/health")

        assert resp.status_code == 200
        assert resp.json() == {"status": "ok"}
|
||||
|
||||
|
||||
class TestDocumentUpload:
    """Test document upload functionality.

    All upload requests go through the ``_post_upload`` helper so the
    multipart plumbing is written once instead of being copy-pasted into
    every test.
    """

    # Organization id used by every upload in this class.
    ORG_ID = "test-org-123"

    def _post_upload(self, test_client, pdf_path, filename, headers=None):
        """POST *pdf_path* to the upload endpoint as *filename*.

        ``headers`` defaults to no headers (i.e. an unauthenticated request).
        Returns the raw response so each test asserts its own expectations.
        """
        with open(pdf_path, "rb") as f:
            return test_client.post(
                "/api/documents/upload",
                files={"file": (filename, f, "application/pdf")},
                data={"org_id": self.ORG_ID},
                headers=headers if headers is not None else {},
            )

    def test_upload_simple_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a simple PDF with form fields."""
        response = self._post_upload(
            test_client,
            test_pdf_files["simple_form"],
            "simple_form.pdf",
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert "metadata" in result
        assert "download_url" in result
        assert result["metadata"]["document_type"] == "pdf"
        assert result["metadata"]["filename"] == "simple_form.pdf"

    def test_upload_complex_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a complex PDF with multiple field types."""
        response = self._post_upload(
            test_client,
            test_pdf_files["complex_form"],
            "complex_form.pdf",
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert result["metadata"]["document_type"] == "pdf"

    def test_upload_no_form_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a PDF without form fields."""
        response = self._post_upload(
            test_client,
            test_pdf_files["no_form"],
            "no_form.pdf",
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code == 201
        assert "document_id" in response.json()

    def test_upload_without_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload without auth returns 401."""
        response = self._post_upload(
            test_client, test_pdf_files["simple_form"], "simple_form.pdf"
        )

        assert response.status_code == 401
        assert "detail" in response.json()

    def test_upload_with_invalid_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload with invalid auth returns 401."""
        response = self._post_upload(
            test_client,
            test_pdf_files["simple_form"],
            "simple_form.pdf",
            headers={"Authorization": "Invalid token"},
        )

        assert response.status_code == 401

    def test_upload_missing_file_returns_400(self, test_client, sample_auth_token):
        """Test that upload without a file part is rejected.

        FastAPI reports a missing required UploadFile as a 422 validation
        error rather than a 400 — that is what we assert here (the original
        docstring said 400 while the assertion checked 422).
        """
        response = test_client.post(
            "/api/documents/upload",
            data={"org_id": self.ORG_ID},
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code == 422  # FastAPI validation error
|
||||
|
||||
|
||||
class TestDocumentMetadata:
    """Tests for the document metadata retrieval endpoint."""

    def test_get_document_metadata_success(self, test_client, sample_auth_token):
        """Test getting document metadata successfully."""
        # No document has been uploaded here; we accept 404 (or 403), which
        # still proves the endpoint is routable and passes authentication.
        response = test_client.get(
            "/api/documents/test-doc-456",
            params={"org_id": "test-org-123"},
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code in [404, 403]

    def test_get_document_without_auth_returns_401(self, test_client):
        """Test getting document without auth returns 401."""
        unauthenticated = test_client.get("/api/documents/test-doc-456")

        assert unauthenticated.status_code == 401
|
||||
|
||||
|
||||
class TestDownloadUrl:
    """Tests for presigned download-URL generation."""

    def test_get_download_url_success(self, test_client, sample_auth_token):
        """Test getting download URL successfully."""
        # The document does not exist, so 404 (or 403) is the expected
        # answer; the point is that the authed endpoint is reachable.
        response = test_client.get(
            "/api/documents/test-doc-456/download-url",
            params={"org_id": "test-org-123"},
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code in [404, 403]

    def test_get_download_url_without_auth_returns_401(self, test_client):
        """Test getting download URL without auth returns 401."""
        unauthenticated = test_client.get("/api/documents/test-doc-456/download-url")

        assert unauthenticated.status_code == 401
|
||||
|
||||
|
||||
class TestPDFFieldDiscovery:
    """Test PDF field discovery through the API.

    The original tests wrapped all assertions in
    ``if upload_response.status_code == 201:`` — if the upload broke, every
    test silently passed with zero assertions. The helper below asserts the
    upload succeeded, so failures surface instead of being swallowed.
    """

    def _upload_and_get_fields(self, test_client, pdf_path, filename, token):
        """Upload *pdf_path*, require success, and return the /fields response."""
        headers = {"Authorization": token}
        with open(pdf_path, "rb") as f:
            upload_response = test_client.post(
                "/api/documents/upload",
                files={"file": (filename, f, "application/pdf")},
                data={"org_id": "test-org-123"},
                headers=headers,
            )

        # Fail loudly if the upload itself broke (was a vacuous `if` before).
        assert upload_response.status_code == 201
        document_id = upload_response.json()["document_id"]

        return test_client.get(
            f"/api/documents/{document_id}/fields",
            params={"org_id": "test-org-123"},
            headers=headers,
        )

    def test_get_pdf_fields_simple_form(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from simple form."""
        response = self._upload_and_get_fields(
            test_client, test_pdf_files["simple_form"], "simple_form.pdf", sample_auth_token
        )

        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        assert len(result["fields"]) == 6  # name, email, phone, country, birth_date, agree_terms

        # Check that each field was mapped to the expected logical type.
        field_types = {f["field"]: f["type"] for f in result["fields"]}
        assert field_types["name"] == "string"
        assert field_types["email"] == "string"
        assert field_types["phone"] == "string"
        assert field_types["country"] == "select"
        assert field_types["birth_date"] == "date"
        assert field_types["agree_terms"] == "boolean"

    def test_get_pdf_fields_complex_form(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from complex form."""
        response = self._upload_and_get_fields(
            test_client, test_pdf_files["complex_form"], "complex_form.pdf", sample_auth_token
        )

        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        assert len(result["fields"]) == 16  # All fields from complex form

    def test_get_pdf_fields_no_form_returns_empty_list(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from PDF without form fields."""
        response = self._upload_and_get_fields(
            test_client, test_pdf_files["no_form"], "no_form.pdf", sample_auth_token
        )

        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        assert len(result["fields"]) == 0

    def test_get_pdf_fields_without_auth_returns_401(self, test_client):
        """Test getting PDF fields without auth returns 401."""
        response = test_client.get("/api/documents/test-doc-456/fields")

        assert response.status_code == 401
|
||||
|
||||
|
||||
class TestDocumentDeletion:
    """Tests for the document deletion endpoint."""

    def test_delete_document_success(self, test_client, sample_auth_token):
        """Test deleting document successfully."""
        # Nothing was uploaded, so 404 (or 403) is expected; this still
        # verifies routing and authentication for the DELETE endpoint.
        response = test_client.delete(
            "/api/documents/test-doc-456",
            params={"org_id": "test-org-123"},
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code in [404, 403]

    def test_delete_document_without_auth_returns_401(self, test_client):
        """Test deleting document without auth returns 401."""
        unauthenticated = test_client.delete("/api/documents/test-doc-456")

        assert unauthenticated.status_code == 401
|
||||
|
||||
|
||||
class TestPDFFieldDiscoveryDirect:
|
||||
"""Test PDF field discovery directly (without API)."""
|
||||
|
||||
def test_simple_form_pdf_fields(self, test_pdf_files):
|
||||
"""Test field discovery on simple form PDF."""
|
||||
fields = discover_fields(test_pdf_files["simple_form"])
|
||||
|
||||
assert len(fields) == 6
|
||||
field_names = [f["field"] for f in fields]
|
||||
assert "name" in field_names
|
||||
assert "email" in field_names
|
||||
assert "phone" in field_names
|
||||
assert "country" in field_names
|
||||
assert "birth_date" in field_names
|
||||
assert "agree_terms" in field_names
|
||||
|
||||
# Check field types
|
||||
field_types = {f["field"]: f["type"] for f in fields}
|
||||
assert field_types["name"] == "string"
|
||||
assert field_types["email"] == "string"
|
||||
assert field_types["phone"] == "string"
|
||||
assert field_types["country"] == "select"
|
||||
assert field_types["birth_date"] == "date"
|
||||
assert field_types["agree_terms"] == "boolean"
|
||||
|
||||
def test_complex_form_pdf_fields(self, test_pdf_files):
|
||||
"""Test field discovery on complex form PDF."""
|
||||
fields = discover_fields(test_pdf_files["complex_form"])
|
||||
|
||||
assert len(fields) == 16
|
||||
field_names = [f["field"] for f in fields]
|
||||
|
||||
# Check for expected fields
|
||||
assert "first_name" in field_names
|
||||
assert "last_name" in field_names
|
||||
assert "email" in field_names
|
||||
assert "country" in field_names
|
||||
assert "gender" in field_names
|
||||
assert "agree_terms" in field_names
|
||||
assert "signature" in field_names
|
||||
|
||||
# Check field types
|
||||
field_types = {f["field"]: f["type"] for f in fields}
|
||||
assert field_types["first_name"] == "string"
|
||||
assert field_types["country"] == "select"
|
||||
assert field_types["gender"] == "boolean"
|
||||
assert field_types["agree_terms"] == "boolean"
|
||||
assert field_types["signature"] == "string"
|
||||
|
||||
def test_no_form_pdf_fields(self, test_pdf_files):
|
||||
"""Test field discovery on PDF without form fields."""
|
||||
fields = discover_fields(test_pdf_files["no_form"])
|
||||
|
||||
assert len(fields) == 0
|
||||
|
||||
def test_large_form_pdf_fields(self, test_pdf_files):
|
||||
"""Test field discovery on large PDF without form fields."""
|
||||
fields = discover_fields(test_pdf_files["large_form"])
|
||||
|
||||
assert len(fields) == 0
|
||||
|
||||
def test_pdf_field_labels_generated_correctly(self, test_pdf_files):
|
||||
"""Test that field labels are generated correctly."""
|
||||
fields = discover_fields(test_pdf_files["simple_form"])
|
||||
|
||||
field_labels = {f["field"]: f["label"] for f in fields}
|
||||
assert field_labels["name"] == "Name"
|
||||
assert field_labels["email"] == "Email"
|
||||
assert field_labels["phone"] == "Phone"
|
||||
assert field_labels["country"] == "Country"
|
||||
assert field_labels["birth_date"] == "Birth Date"
|
||||
assert field_labels["agree_terms"] == "Agree Terms"
|
||||
|
||||
def test_pdf_field_options_extracted_correctly(self, test_pdf_files):
    """Verify that dropdown options are extracted for the country field."""
    fields = discover_fields(test_pdf_files["simple_form"])

    # Locate the single country dropdown among the discovered fields.
    country = next(entry for entry in fields if entry["field"] == "country")
    assert country["type"] == "select"

    options = country["options"]
    assert options is not None
    assert len(options) == 5
    for expected_option in ("USA", "Canada", "UK", "Germany", "France"):
        assert expected_option in options
class TestCompleteWorkflow:
    """Test complete document lifecycle workflows."""

    def test_complete_document_lifecycle(self, test_client, test_pdf_files, sample_auth_token):
        """Test complete document lifecycle: upload, get metadata, get fields, delete.

        The upload is asserted up front: previously the follow-up responses
        were bound only inside an ``if upload_response.status_code == 201``
        guard while the final assertions referenced them unconditionally, so
        a failed upload surfaced as a confusing NameError instead of a clear
        assertion failure.
        """
        headers = {"Authorization": sample_auth_token}
        params = {"org_id": "test-org-123"}

        # Upload document
        with open(test_pdf_files["simple_form"], "rb") as f:
            files = {"file": ("simple_form.pdf", f, "application/pdf")}
            data = {"org_id": "test-org-123"}
            upload_response = test_client.post(
                "/api/documents/upload",
                files=files,
                data=data,
                headers=headers,
            )

        # The rest of the workflow is meaningless without a successful
        # upload, so fail fast here with a clear assertion error.
        assert upload_response.status_code == 201
        document_id = upload_response.json()["document_id"]

        # Get metadata
        metadata_response = test_client.get(
            f"/api/documents/{document_id}",
            params=params,
            headers=headers,
        )

        # Get fields
        fields_response = test_client.get(
            f"/api/documents/{document_id}/fields",
            params=params,
            headers=headers,
        )

        # Get download URL
        download_response = test_client.get(
            f"/api/documents/{document_id}/download-url",
            params=params,
            headers=headers,
        )

        # Delete document
        delete_response = test_client.delete(
            f"/api/documents/{document_id}",
            params=params,
            headers=headers,
        )

        # Verify all operations succeeded. Post-upload operations may
        # legitimately return 404 when the backing S3 store is unavailable
        # in the test environment.
        assert metadata_response.status_code in [200, 404]
        assert fields_response.status_code in [200, 404]
        assert download_response.status_code in [200, 404]
        assert delete_response.status_code in [200, 404]
Reference in New Issue
Block a user