Initial commit of document-service
This commit is contained in:
68
.gitea/workflows/build-and-publish.yaml
Normal file
68
.gitea/workflows/build-and-publish.yaml
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
# CI pipeline: build the Docker image via the Nix flake, push it to the Gitea
# container registry, then package and publish the Helm chart.
name: Build and Publish

on:
  push:
    branches:
      - main

env:
  # Both the chart and the image are named after the repository.
  CHART_NAME: ${{ github.event.repository.name }}
  IMAGE_NAME: ${{ github.event.repository.name }}

jobs:
  build-release:
    runs-on: nix
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Build Docker Image via Nix Flake
        run: |
          nix build .#dockerImage --print-build-logs
          docker load < result

      - name: Log in to Gitea Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ github.server_url }}
          username: ${{ secrets.CI_USER }}
          password: ${{ secrets.CI_PASSWORD }}

      - name: Tag and Push Docker Image
        run: |
          # Image version tracks the CI run number.
          VERSION=${{ github.run_number }}

          # Strip https from server URL
          REGISTRY=${GITHUB_SERVER_URL#https://}

          TARGET_IMAGE=$REGISTRY/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}

          # Auto-detect the built image name (better version)
          # docker load prints "Loaded image: <name>"; the third word is the name.
          SOURCE_IMAGE=$(docker load < result | awk '{print $3}')

          docker tag $SOURCE_IMAGE $TARGET_IMAGE:$VERSION
          docker tag $SOURCE_IMAGE $TARGET_IMAGE:latest
          docker push $TARGET_IMAGE:$VERSION
          docker push $TARGET_IMAGE:latest

      - name: Setup Helm
        uses: azure/setup-helm@v4
        with:
          version: v3.14.0

      - name: Package Helm Chart
        run: |
          # Chart version and appVersion both track the CI run number.
          VERSION=${{ github.run_number }}
          helm repo add bjw-s https://bjw-s-labs.github.io/helm-charts
          helm dependency build ops/chart
          helm package ops/chart --version $VERSION --app-version $VERSION

      - name: Push Helm Chart to Gitea Registry
        run: |
          VERSION=${{ github.run_number }}
          CHART_FILE=${{ env.CHART_NAME }}-${VERSION}.tgz

          curl -f --user "${{ secrets.CI_USER }}:${{ secrets.CI_PASSWORD }}" \
            -X POST \
            --upload-file ./$CHART_FILE \
            "${{ github.server_url }}/api/packages/${{ github.repository_owner }}/helm/api/charts"
|
||||||
54
.gitignore
vendored
Normal file
54
.gitignore
vendored
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# Environment variables
|
||||||
|
.env.local
|
||||||
|
.env.*.local
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# OS
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
.pytest_cache/
|
||||||
|
.coverage
|
||||||
|
htmlcov/
|
||||||
|
|
||||||
|
# Nix
|
||||||
|
.direnv/
|
||||||
|
result
|
||||||
286
README.md
Normal file
286
README.md
Normal file
@@ -0,0 +1,286 @@
|
|||||||
|
# Document Service
|
||||||
|
|
||||||
|
Generic document management service with S3 storage and PDF field discovery.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Multi-format support**: PDF, DOCX, XLSX, JPG, JPEG, PNG, GIF
|
||||||
|
- **S3 storage**: Configurable S3-compatible storage (MinIO, AWS S3, etc.)
|
||||||
|
- **PDF field discovery**: Extract form fields from PDF documents
|
||||||
|
- **Organization-based access control**: Documents scoped to organizations
|
||||||
|
- **File size limits**: Configurable per document type
|
||||||
|
- **Content type detection**: Automatic detection using python-magic
|
||||||
|
- **Comprehensive logging**: All operations logged for audit trail
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### Upload Document
|
||||||
|
```
|
||||||
|
POST /api/documents/upload
|
||||||
|
Content-Type: multipart/form-data
|
||||||
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
|
Form data:
|
||||||
|
- file: (required) Document file
|
||||||
|
- uploaded_by: (optional) User who uploaded the document
|
||||||
|
|
||||||
|
Response:
|
||||||
|
{
|
||||||
|
"document_id": "uuid",
|
||||||
|
"metadata": {...},
|
||||||
|
"download_url": "presigned-url"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Rewrite Document
|
||||||
|
```
|
||||||
|
PUT /api/documents/{document_id}
|
||||||
|
Content-Type: multipart/form-data
|
||||||
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
|
Form data:
|
||||||
|
- file: (required) New document file
|
||||||
|
- uploaded_by: (optional) User who uploaded the document
|
||||||
|
|
||||||
|
Response:
|
||||||
|
{
|
||||||
|
"document_id": "uuid",
|
||||||
|
"metadata": {...},
|
||||||
|
"download_url": "presigned-url"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Get Document Metadata
|
||||||
|
```
|
||||||
|
GET /api/documents/{document_id}
|
||||||
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
|
Response:
|
||||||
|
{
|
||||||
|
"document_id": "uuid",
|
||||||
|
"org_id": "org-id",
|
||||||
|
"uploaded_by": "user",
|
||||||
|
"document_type": "pdf",
|
||||||
|
"filename": "document.pdf",
|
||||||
|
"content_type": "application/pdf",
|
||||||
|
"file_size": 12345,
|
||||||
|
"s3_key": "documents/org-id/uuid/document.pdf",
|
||||||
|
"created_at": "2024-01-01T00:00:00",
|
||||||
|
"updated_at": "2024-01-01T00:00:00"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Get Download URL
|
||||||
|
```
|
||||||
|
GET /api/documents/{document_id}/download-url?expires_in=3600
|
||||||
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
|
Response:
|
||||||
|
{
|
||||||
|
"download_url": "presigned-url",
|
||||||
|
"s3_key": "documents/org-id/uuid/document.pdf",
|
||||||
|
"expires_in": 3600
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Get PDF Fields
|
||||||
|
```
|
||||||
|
GET /api/documents/{document_id}/fields
|
||||||
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
|
Response:
|
||||||
|
{
|
||||||
|
"document_id": "uuid",
|
||||||
|
"document_type": "pdf",
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "field_name",
|
||||||
|
"label": "Field Name",
|
||||||
|
"type": "string",
|
||||||
|
"required": false,
|
||||||
|
"options": null
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Delete Document
|
||||||
|
```
|
||||||
|
DELETE /api/documents/{document_id}
|
||||||
|
Authorization: Bearer <token>
|
||||||
|
|
||||||
|
Response:
|
||||||
|
{
|
||||||
|
"message": "Document deleted successfully"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
|
||||||
|
| Variable | Description | Default |
|
||||||
|
|----------|-------------|---------|
|
||||||
|
| `S3_ENDPOINT` | S3 endpoint URL | `http://localhost:9000` |
|
||||||
|
| `S3_ACCESS_KEY` | S3 access key | `minioadmin` |
|
||||||
|
| `S3_SECRET_KEY` | S3 secret key | `minioadmin` |
|
||||||
|
| `S3_BUCKET` | S3 bucket name | `document-bucket` |
|
||||||
|
| `S3_REGION` | S3 region | `us-east-1` |
|
||||||
|
| `HOST` | Service host | `0.0.0.0` |
|
||||||
|
| `PORT` | Service port | `8082` |
|
||||||
|
| `TEST_UPLOADER` | Default uploader for testing | `test-user` |
|
||||||
|
| `LOG_LEVEL` | Logging level | `INFO` |
|
||||||
|
|
||||||
|
### File Size Limits
|
||||||
|
|
||||||
|
| Document Type | Default Limit |
|
||||||
|
|---------------|---------------|
|
||||||
|
| PDF | 50MB |
|
||||||
|
| DOCX | 25MB |
|
||||||
|
| XLSX | 25MB |
|
||||||
|
| JPG/JPEG | 10MB |
|
||||||
|
| PNG | 10MB |
|
||||||
|
| GIF | 10MB |
|
||||||
|
| Other | 10MB |
|
||||||
|
|
||||||
|
## Authentication
|
||||||
|
|
||||||
|
The service uses JWT tokens for authentication. The `org_id` is extracted from the token claims and used for organization-based access control.
|
||||||
|
|
||||||
|
**Note**: Currently, the auth middleware includes a mock implementation for testing. In production, this should be replaced with proper Zitadel integration.
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
### Setup
|
||||||
|
|
||||||
|
This project uses [uv2nix](https://pyproject-nix.github.io/uv2nix/) for reproducible Python dependency management with Nix.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Enter the development shell (uses uv2nix)
|
||||||
|
nix develop
|
||||||
|
|
||||||
|
# The development shell includes:
|
||||||
|
# - Python with all dependencies from uv.lock
|
||||||
|
# - uv tool for package management
|
||||||
|
# - pyright for type checking
|
||||||
|
# - file package (provides libmagic for content type detection)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running the Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Start the development server
|
||||||
|
uvicorn app.main:app --reload --host 0.0.0.0 --port 8082
|
||||||
|
|
||||||
|
# Access API documentation
|
||||||
|
open http://localhost:8082/docs
|
||||||
|
```
|
||||||
|
|
||||||
|
### Adding Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Add a new dependency
|
||||||
|
uv add <package-name>
|
||||||
|
|
||||||
|
# Add a development dependency
|
||||||
|
uv add --dev <package-name>
|
||||||
|
|
||||||
|
# Update the lock file
|
||||||
|
uv lock
|
||||||
|
```
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run tests
|
||||||
|
pytest
|
||||||
|
|
||||||
|
# Run with coverage
|
||||||
|
pytest --cov=app
|
||||||
|
```
|
||||||
|
|
||||||
|
### Linting
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run ruff
|
||||||
|
ruff check app/
|
||||||
|
|
||||||
|
# Format code
|
||||||
|
ruff format app/
|
||||||
|
```
|
||||||
|
|
||||||
|
### Building Production Package
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build the production package
|
||||||
|
nix build
|
||||||
|
|
||||||
|
# The package will be available at ./result
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deployment
|
||||||
|
|
||||||
|
### Using Helm
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install chart
|
||||||
|
helm install document-service ./ops/chart
|
||||||
|
|
||||||
|
# Upgrade chart
|
||||||
|
helm upgrade document-service ./ops/chart
|
||||||
|
|
||||||
|
# Uninstall
|
||||||
|
helm uninstall document-service
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
Edit `ops/chart/values.yaml` to customize deployment settings.
|
||||||
|
|
||||||
|
## S3 Path Structure
|
||||||
|
|
||||||
|
Documents are stored in S3 using the following path structure:
|
||||||
|
|
||||||
|
```
|
||||||
|
documents/{org_id}/{document_id}/{filename}
|
||||||
|
```
|
||||||
|
|
||||||
|
Example:
|
||||||
|
```
|
||||||
|
documents/org-123/abc-456-def-789/policy_document.pdf
|
||||||
|
```
|
||||||
|
|
||||||
|
## Logging
|
||||||
|
|
||||||
|
All operations are logged with the following information:
|
||||||
|
- Operation type (upload, download, delete, etc.)
|
||||||
|
- Document ID
|
||||||
|
- Organization ID
|
||||||
|
- User ID
|
||||||
|
- Timestamp
|
||||||
|
- Success/failure status
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
The service returns appropriate HTTP status codes:
|
||||||
|
|
||||||
|
- `200` - Success
|
||||||
|
- `201` - Created
|
||||||
|
- `400` - Bad Request
|
||||||
|
- `401` - Unauthorized
|
||||||
|
- `403` - Forbidden
|
||||||
|
- `404` - Not Found
|
||||||
|
- `413` - Payload Too Large (file size exceeded)
|
||||||
|
- `415` - Unsupported Media Type
|
||||||
|
- `500` - Internal Server Error
|
||||||
|
|
||||||
|
## TODO
|
||||||
|
|
||||||
|
- [ ] Implement proper Zitadel authentication
|
||||||
|
- [ ] Add document listing endpoint
|
||||||
|
- [ ] Add document search functionality
|
||||||
|
- [ ] Add document versioning support
|
||||||
|
- [ ] Add document conversion capabilities
|
||||||
|
- [ ] Add comprehensive test coverage
|
||||||
|
- [ ] Add API rate limiting
|
||||||
|
- [ ] Add metrics and monitoring
|
||||||
0
app/__init__.py
Normal file
0
app/__init__.py
Normal file
31
app/config.py
Normal file
31
app/config.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """Service configuration loaded from the environment (and a .env file).

    Field names map to environment variables (e.g. ``s3_endpoint`` is set via
    ``S3_ENDPOINT`` — see the README's environment-variable table).
    """

    # S3 settings — defaults target a local MinIO instance
    s3_endpoint: str = "http://localhost:9000"
    s3_access_key: str = "minioadmin"
    s3_secret_key: str = "minioadmin"
    s3_bucket: str = "document-bucket"
    s3_region: str = "us-east-1"

    # Service settings
    host: str = "0.0.0.0"
    port: int = 8082

    # File size limits (bytes), per document type
    max_file_size_pdf: int = 50 * 1024 * 1024  # 50MB
    max_file_size_docx: int = 25 * 1024 * 1024  # 25MB
    max_file_size_xlsx: int = 25 * 1024 * 1024  # 25MB
    max_file_size_jpg: int = 10 * 1024 * 1024  # 10MB
    max_file_size_jpeg: int = 10 * 1024 * 1024  # 10MB
    max_file_size_png: int = 10 * 1024 * 1024  # 10MB
    max_file_size_gif: int = 10 * 1024 * 1024  # 10MB
    max_file_size_default: int = 10 * 1024 * 1024  # 10MB fallback for other types

    # Logging
    log_level: str = "INFO"

    class Config:
        # pydantic settings config: also read overrides from a local .env file
        env_file = ".env"


# Module-level singleton imported throughout the app.
settings = Settings()
|
||||||
38
app/enums.py
Normal file
38
app/enums.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
class DocumentType(str, Enum):
    """Supported document types, keyed by canonical file extension."""

    PDF = "pdf"
    DOCX = "docx"
    XLSX = "xlsx"
    JPG = "jpg"
    JPEG = "jpeg"
    PNG = "png"
    GIF = "gif"

    @classmethod
    def from_mime_type(cls, mime_type: str) -> "DocumentType | None":
        """Map a MIME type to a DocumentType.

        Returns None for unrecognized MIME types — ``dict.get`` yields None on
        a miss, so the return is optional (the previous annotation claimed a
        non-optional return). Matching is case-insensitive.
        Note: "image/jpeg" maps to JPG, not JPEG.
        """
        mapping = {
            "application/pdf": cls.PDF,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": cls.DOCX,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": cls.XLSX,
            "image/jpeg": cls.JPG,
            "image/png": cls.PNG,
            "image/gif": cls.GIF,
        }
        return mapping.get(mime_type.lower())

    @classmethod
    def from_extension(cls, filename: str) -> "DocumentType | None":
        """Map a filename's extension (case-insensitive) to a DocumentType.

        Returns None for unknown extensions. A filename without a dot is
        treated as if the entire name were the extension.
        """
        ext = filename.split(".")[-1].lower()
        mapping = {
            "pdf": cls.PDF,
            "docx": cls.DOCX,
            "xlsx": cls.XLSX,
            "jpg": cls.JPG,
            "jpeg": cls.JPEG,
            "png": cls.PNG,
            "gif": cls.GIF,
        }
        return mapping.get(ext)
|
||||||
13
app/logger.py
Normal file
13
app/logger.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
import logging
|
||||||
|
from app.config import settings
|
||||||
|
|
||||||
|
def setup_logging():
    """Configure the root logger using the service settings.

    The level comes from ``settings.log_level`` (e.g. "INFO", "DEBUG");
    the format includes timestamp, logger name, level, and message.
    """
    configured_level = getattr(logging, settings.log_level.upper())
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=configured_level,
    )
|
||||||
|
|
||||||
|
def get_logger(name: str) -> logging.Logger:
    """Fetch (or lazily create) the logger registered under ``name``."""
    named_logger = logging.getLogger(name)
    return named_logger
|
||||||
82
app/main.py
Normal file
82
app/main.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from fastapi.openapi.utils import get_openapi
|
||||||
|
from app.routers import documents
|
||||||
|
from app.config import settings
|
||||||
|
from app.logger import setup_logging
|
||||||
|
from app.middleware.auth import AuthMiddleware
|
||||||
|
|
||||||
|
# Setup logging before anything else emits log records.
setup_logging()

# FastAPI application; the OpenAPI schema is served at the non-default path
# /openapi3.json and its generation is overridden by custom_openapi() below.
app = FastAPI(
    title="Document Service",
    version="1.0.0",
    description="Generic document management service with S3 storage and PDF field discovery",
    openapi_url="/openapi3.json",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Add auth middleware (mock for now — it attaches request.state.org_id;
# see app/middleware/auth.py and the README note on Zitadel integration).
app.add_middleware(AuthMiddleware)

app.add_middleware(
    CORSMiddleware,
    # NOTE(review): dev origin is hard-coded; should come from settings in prod.
    allow_origins=["http://localhost:3000"],
    allow_methods=["*"],
    allow_headers=["*"]
)

# Mount the /api/documents routes.
app.include_router(documents.router)
|
||||||
|
|
||||||
|
@app.on_event("startup")
async def startup_event():
    """Run startup tasks.

    Ensures the configured S3 bucket exists before the service accepts traffic.

    Raises:
        Exception: If S3 bucket initialization fails (service will fail to start)
    """
    # Lazy imports — presumably to avoid constructing the S3 client at module
    # import time; TODO confirm that is the intent.
    from app import s3
    from app.logger import get_logger

    logger = get_logger(__name__)
    logger.info("Starting up document service...")

    try:
        s3.ensure_bucket_exists()
        logger.info("S3 bucket initialization complete")
    except Exception as e:
        logger.error(f"Failed to initialize S3 bucket: {e}")
        # Re-raise to fail startup
        raise
|
||||||
|
|
||||||
|
@app.get("/health", tags=["health"])
def health():
    """Liveness probe endpoint; exempt from auth in AuthMiddleware."""
    return {"status": "ok"}
|
||||||
|
|
||||||
|
@app.get("/health/ready", tags=["health"])
def health_ready():
    """Health check for Kubernetes readiness probes."""
    return {"status": "ready"}
|
||||||
|
|
||||||
|
def custom_openapi():
    """Return the app's OpenAPI schema, generating and caching it on first use.

    Overrides FastAPI's default generator to pin OpenAPI 3.1.0 and inject a
    local-dev server entry.
    """
    if not app.openapi_schema:
        generated = get_openapi(
            title="Document Service",
            version="1.0.0",
            openapi_version="3.1.0",
            description="Generic document management service with S3 storage and PDF field discovery",
            routes=app.routes
        )
        generated["servers"] = [
            {"url": "http://localhost:8082", "description": "Local dev"}
        ]
        # Cache so subsequent requests reuse the generated schema.
        app.openapi_schema = generated
    return app.openapi_schema


# Install the override (FastAPI calls app.openapi() to serve /openapi3.json).
app.openapi = custom_openapi
|
||||||
0
app/middleware/__init__.py
Normal file
0
app/middleware/__init__.py
Normal file
16
app/middleware/auth.py
Normal file
16
app/middleware/auth.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from fastapi import Request
|
||||||
|
from starlette.middleware.base import BaseHTTPMiddleware
|
||||||
|
from starlette.responses import JSONResponse
|
||||||
|
from app.logger import get_logger
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
class AuthMiddleware(BaseHTTPMiddleware):
    """Mock authentication middleware.

    Attaches ``request.state.org_id`` to every non-health request. This is a
    placeholder for real JWT/Zitadel validation (see the README note); no
    token is actually checked yet.
    """

    async def dispatch(self, request: Request, call_next):
        # Skip auth for health endpoints. Fix: main.py exposes both /health
        # and /health/ready (Kubernetes probes) — the old check only matched
        # the exact path "/health", leaving the readiness probe on the auth
        # path once real auth lands.
        if request.url.path.startswith("/health"):
            return await call_next(request)

        # TODO: replace with real JWT validation; org_id is hard-coded for testing.
        request.state.org_id = "test"
        response = await call_next(request)
        return response
|
||||||
|
|
||||||
30
app/models.py
Normal file
30
app/models.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional
|
||||||
|
from app.enums import DocumentType
|
||||||
|
|
||||||
|
class DocumentMetadata(BaseModel):
    """Stored metadata describing a single uploaded document."""

    document_id: str = Field(..., description="UUID of the document")
    org_id: str = Field(..., description="Organization ID")
    document_type: DocumentType = Field(..., description="Type of document")
    filename: str = Field(..., description="Original filename")
    content_type: str = Field(..., description="MIME type")
    file_size: int = Field(..., description="File size in bytes")
    s3_key: str = Field(..., description="S3 key for the document")
    # NOTE(review): datetime.utcnow produces naive timestamps and is deprecated
    # in Python 3.12 — consider datetime.now(timezone.utc).
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
|
||||||
|
|
||||||
|
class UploadResponse(BaseModel):
    """Response body for the upload and rewrite endpoints."""

    document_id: str  # UUID of the created/replaced document
    metadata: DocumentMetadata
    download_url: str  # presigned S3 download URL
|
||||||
|
|
||||||
|
class DownloadUrlResponse(BaseModel):
    """Response body for the download-url endpoint."""

    download_url: str  # presigned S3 download URL
    s3_key: str
    expires_in: int  # URL lifetime; README example uses 3600 (seconds)
|
||||||
|
|
||||||
|
class FieldsResponse(BaseModel):
    """Response body for PDF form-field discovery."""

    document_id: str
    document_type: DocumentType
    # Entries produced by app.pdf.discover_fields:
    # {field, label, type, required, options}
    fields: list[dict]
|
||||||
105
app/pdf.py
Normal file
105
app/pdf.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
import os
|
||||||
|
from pypdf import PdfReader
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
def discover_fields(pdf_path: str) -> list[dict]:
    """
    Introspect a PDF and return all fillable AcroForm fields.

    Handles any form of AcroForm structure by trying three strategies in order,
    falling through when one raises or yields nothing:
      1. pypdf's ``PdfReader.get_fields()``
      2. walking /Root -> /AcroForm -> /Fields directly
      3. scanning every page's /Annots for /Widget annotations

    Returns a list of dicts with keys: field, label, type, required, options.
    Returns [] when no fields are found. ``required`` is always False here —
    the field flags are not inspected.

    NOTE(review): errors are reported via print(); consider app.logger for
    consistency with the rest of the service.
    """
    reader = PdfReader(pdf_path)

    # Try multiple methods to get fields
    fields = None

    # Method 1: Try get_fields() first
    try:
        fields = reader.get_fields()
    except Exception as e:
        print(f"get_fields() failed: {e}")
        fields = None

    # Method 2: Try to get fields from AcroForm directly
    if not fields:
        try:
            if "/AcroForm" in reader.trailer["/Root"]:
                acroform = reader.trailer["/Root"]["/AcroForm"]
                if "/Fields" in acroform:
                    fields = {}
                    field_array = acroform["/Fields"]
                    for field_ref in field_array:
                        try:
                            # Entries are indirect references; resolve to the
                            # actual field dictionary. /T is the field name.
                            field_obj = field_ref.get_object()
                            field_name = field_obj.get("/T", "")
                            if field_name:
                                fields[field_name] = field_obj
                        except Exception as e:
                            print(f"Error processing field: {e}")
                            continue
        except Exception as e:
            print(f"Direct AcroForm access failed: {e}")
            fields = None

    # Method 3: Try to get fields from page annotations
    if not fields:
        try:
            fields = {}
            for page in reader.pages:
                if "/Annots" in page:
                    for annot in page["/Annots"]:
                        try:
                            annot_obj = annot.get_object()
                            # Widget annotations are the on-page form controls.
                            if "/Subtype" in annot_obj and annot_obj["/Subtype"] == "/Widget":
                                field_name = annot_obj.get("/T", "")
                                if field_name and field_name not in fields:
                                    fields[field_name] = annot_obj
                        except Exception as e:
                            print(f"Error processing annotation: {e}")
                            continue
        except Exception as e:
            print(f"Page annotation access failed: {e}")
            fields = None

    if not fields:
        return []

    result = []
    for field_name, field_obj in fields.items():
        try:
            field_type = field_obj.get("/FT", "")
            options = []

            # /Ch = choice field (select/dropdown)
            if field_type == "/Ch":
                opt = field_obj.get("/Opt", [])
                if opt:
                    # /Opt entries may be strings or 2-element arrays; the
                    # second element is taken (presumably the display text —
                    # TODO confirm against the PDF spec).
                    options = [o if isinstance(o, str) else o[1] for o in opt]

            result.append({
                "field": field_name,
                # Human-friendly label derived from the field name.
                "label": field_name.replace("_", " ").title(),
                "type": _map_field_type(field_type, field_obj),
                "required": False,
                "options": options if options else None
            })
        except Exception as e:
            print(f"Error processing field {field_name}: {e}")
            continue

    return result
|
||||||
|
|
||||||
|
def _map_field_type(ft: str, field_obj: dict) -> str:
|
||||||
|
mapping = {
|
||||||
|
"/Tx": "string",
|
||||||
|
"/Btn": "boolean",
|
||||||
|
"/Ch": "select",
|
||||||
|
"/Sig": "string"
|
||||||
|
}
|
||||||
|
base = mapping.get(ft, "string")
|
||||||
|
|
||||||
|
# Check if it's a date field by name hint
|
||||||
|
field_name = field_obj.get("/T", "").lower()
|
||||||
|
if any(hint in field_name for hint in ["date", "fecha", "birth", "nacimiento"]):
|
||||||
|
return "date"
|
||||||
|
|
||||||
|
return base
|
||||||
1
app/routers/__init__.py
Normal file
1
app/routers/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
from app.routers import documents
|
||||||
355
app/routers/documents.py
Normal file
355
app/routers/documents.py
Normal file
@@ -0,0 +1,355 @@
|
|||||||
|
import os
|
||||||
|
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Request
|
||||||
|
from typing import Optional
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from app import s3, pdf, utils
|
||||||
|
from app.config import settings
|
||||||
|
from app.enums import DocumentType
|
||||||
|
from app.models import DocumentMetadata, UploadResponse, DownloadUrlResponse, FieldsResponse
|
||||||
|
from app.logger import get_logger
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/documents", tags=["documents"])
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
@router.post("/upload", response_model=UploadResponse)
async def upload_document(
    request: Request,
    file: UploadFile = File(...)
):
    """Upload a new document.

    The owning organization comes from ``request.state.org_id`` (set by the
    auth middleware). The file is stored in S3 under a freshly generated
    document ID, and a presigned download URL is returned.

    Raises:
        HTTPException: 415 for unsupported document types; 500 when the S3
            upload fails. ``utils.validate_file_size`` presumably raises 413
            for oversized files (defined in app.utils — not visible here).
    """
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Upload request - org_id: {org_id}, user_id: {user_id}, filename: {file.filename}")

    # Detect content type
    detected_content_type = utils.detect_content_type(file)
    logger.info(f"Detected content type: {detected_content_type}")

    # Detect document type
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        logger.error(f"Unsupported document type: {file.filename}")
        raise HTTPException(status_code=415, detail="Unsupported document type")

    # Get file size by seeking the underlying file to EOF, then rewind so the
    # upload below reads from the start.
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)

    # Validate file size
    utils.validate_file_size(file_size, document_type)

    # Generate document ID and S3 key (filename is sanitized for the key only;
    # metadata keeps the original filename).
    document_id = utils.generate_document_id()
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)

    # Prepare metadata (S3 object metadata values must be strings)
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": file.filename,
        "file_size": str(file_size),
        "created_at": datetime.utcnow().isoformat()
    }

    # Upload to S3
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File uploaded successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to upload file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to upload file: {e}")

    # Generate download URL
    download_url = s3.presigned_download_url(s3_key)

    # Create metadata response
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=file.filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.utcnow(),
        updated_at=datetime.utcnow()
    )

    logger.info(f"Upload completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
||||||
|
|
||||||
|
@router.put("/{document_id}", response_model=UploadResponse)
async def rewrite_document(
    request: Request,
    document_id: str,
    file: UploadFile = File(...)
):
    """Rewrite/replace an existing document.

    Overwrites the stored S3 object in place (same document_id), preserving
    the original ``created_at`` from the existing object's metadata.

    Raises:
        HTTPException: 415 for unsupported types, 404 when the document does
            not exist, 403 on organization mismatch, 500 when the S3 write fails.
    """
    org_id = request.state.org_id
    user_id = getattr(request.state, "user_id", "system")
    logger.info(f"Rewrite request - document_id: {document_id}, org_id: {org_id}, user_id: {user_id}")

    # Detect content type
    detected_content_type = utils.detect_content_type(file)

    # Detect document type
    document_type = utils.detect_document_type(file.filename, detected_content_type)
    if not document_type:
        raise HTTPException(status_code=415, detail="Unsupported document type")

    # Get file size (seek to EOF, then rewind for the upload)
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)

    # Validate file size
    utils.validate_file_size(file_size, document_type)

    # Generate S3 key
    sanitized_filename = utils.sanitize_filename(file.filename)
    s3_key = utils.document_s3_key(org_id, document_id, sanitized_filename)

    # Check if document exists
    # NOTE(review): the existence check uses the key derived from the NEW
    # upload's filename — replacing a document with a file of a different
    # name will 404 even though the document exists under its old key.
    # Confirm this is intended.
    if not s3.file_exists(s3_key):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    # Verify org_id matches
    existing_metadata = s3.get_file_metadata(s3_key)
    if existing_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Prepare metadata (values must be strings for S3 object metadata)
    metadata_dict = {
        "org_id": org_id,
        "document_type": document_type.value,
        "filename": file.filename,
        "file_size": str(file_size),
        "updated_at": datetime.utcnow().isoformat()
    }

    # Upload to S3 (overwrites existing)
    try:
        s3.upload_file(file, s3_key, detected_content_type, metadata_dict)
        logger.info(f"File rewritten successfully: {s3_key}")
    except Exception as e:
        logger.error(f"Failed to rewrite file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to rewrite file: {e}")

    # Generate download URL
    download_url = s3.presigned_download_url(s3_key)

    # Create metadata response; created_at is carried over from the existing
    # object (falling back to "now" if it was never recorded).
    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=org_id,
        document_type=document_type,
        filename=file.filename,
        content_type=detected_content_type,
        file_size=file_size,
        s3_key=s3_key,
        created_at=datetime.fromisoformat(existing_metadata.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.utcnow()
    )

    logger.info(f"Rewrite completed - document_id: {document_id}")
    return UploadResponse(document_id=document_id, metadata=metadata, download_url=download_url)
|
||||||
|
|
||||||
|
@router.get("/{document_id}", response_model=DocumentMetadata)
async def get_document(request: Request, document_id: str):
    """Get document metadata"""
    org_id = request.state.org_id
    logger.info(f"Get document request - document_id: {document_id}, org_id: {org_id}")

    # Locate the single stored object under this document's prefix.
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        listing = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1,
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # The prefix already embeds org_id, but the object's own metadata is
    # re-checked as a cross-tenant safety net.
    meta = s3.get_file_metadata(s3_key)
    if meta.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # HEAD the object for content type / size not stored in user metadata.
    try:
        head = client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except Exception as e:
        logger.error(f"Failed to get object info: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    metadata = DocumentMetadata(
        document_id=document_id,
        org_id=meta.get("org_id"),
        document_type=DocumentType(meta.get("document_type")),
        filename=meta.get("filename"),
        content_type=head.get("ContentType"),
        # Prefer the size recorded at upload time; fall back to S3's byte count.
        file_size=int(meta.get("file_size", head.get("ContentLength", 0))),
        s3_key=s3_key,
        # Missing timestamps default to "now" rather than failing the request.
        created_at=datetime.fromisoformat(meta.get("created_at", datetime.utcnow().isoformat())),
        updated_at=datetime.fromisoformat(meta.get("updated_at", datetime.utcnow().isoformat())),
    )

    logger.info(f"Get document completed - document_id: {document_id}")
    return metadata
|
||||||
|
|
||||||
|
@router.get("/{document_id}/download-url", response_model=DownloadUrlResponse)
async def get_download_url(request: Request, document_id: str, expires_in: int = 3600):
    """Get presigned download URL"""
    org_id = request.state.org_id
    logger.info(f"Download URL request - document_id: {document_id}, org_id: {org_id}")

    # Resolve the document's S3 key from its per-org prefix.
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        listing = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1,
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # Cross-tenant guard: object metadata must carry the caller's org_id.
    if s3.get_file_metadata(s3_key).get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    download_url = s3.presigned_download_url(s3_key, expires_in)

    logger.info(f"Download URL generated - document_id: {document_id}")
    return DownloadUrlResponse(download_url=download_url, s3_key=s3_key, expires_in=expires_in)
|
||||||
|
|
||||||
|
@router.get("/{document_id}/fields", response_model=FieldsResponse)
async def get_document_fields(request: Request, document_id: str):
    """Get PDF form fields (PDF only).

    Raises:
        HTTPException 404: document not found under the org's prefix
        HTTPException 403: object metadata carries a different org_id
        HTTPException 400: document is not a PDF
        HTTPException 500: S3 access or field discovery failed
    """
    org_id = request.state.org_id
    logger.info(f"Fields request - document_id: {document_id}, org_id: {org_id}")

    # List objects to find the document
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)

    try:
        response = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    if not response.get("Contents"):
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")

    s3_key = response["Contents"][0]["Key"]

    # Get metadata and verify org_id matches (cross-tenant guard).
    s3_metadata = s3.get_file_metadata(s3_key)
    if s3_metadata.get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    # Field discovery only makes sense for PDFs.
    document_type = s3_metadata.get("document_type")
    if document_type != DocumentType.PDF.value:
        logger.error(f"Document is not PDF: {document_type}")
        raise HTTPException(status_code=400, detail="Field discovery only supported for PDF documents")

    # Download to a temp file and discover the form fields.
    # BUG FIX: pdf_path must be initialized before the try block -- if
    # download_to_temp() raised, the finally clause previously hit a
    # NameError on pdf_path, masking the original exception.
    pdf_path = None
    try:
        pdf_path = s3.download_to_temp(s3_key)
        fields = pdf.discover_fields(pdf_path)
        logger.info(f"Fields discovered - document_id: {document_id}, count: {len(fields)}")
    except Exception as e:
        logger.error(f"Failed to discover fields: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to discover fields: {e}")
    finally:
        # Always remove the temp file once we are done with it.
        if pdf_path and os.path.exists(pdf_path):
            os.unlink(pdf_path)

    return FieldsResponse(
        document_id=document_id,
        document_type=DocumentType.PDF,
        fields=fields
    )
|
||||||
|
|
||||||
|
@router.delete("/{document_id}")
async def delete_document(request: Request, document_id: str):
    """Delete document"""
    org_id = request.state.org_id
    logger.info(f"Delete request - document_id: {document_id}, org_id: {org_id}")

    # Resolve the stored object under this document's per-org prefix.
    client = s3.get_client()
    prefix = utils.s3_path_prefix(org_id, document_id)
    try:
        listing = client.list_objects_v2(
            Bucket=settings.s3_bucket,
            Prefix=prefix,
            MaxKeys=1,
        )
    except Exception as e:
        logger.error(f"Failed to list objects: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve document")

    contents = listing.get("Contents")
    if not contents:
        logger.error(f"Document not found: {document_id}")
        raise HTTPException(status_code=404, detail="Document not found")
    s3_key = contents[0]["Key"]

    # Cross-tenant guard: object metadata must carry the caller's org_id.
    if s3.get_file_metadata(s3_key).get("org_id") != org_id:
        logger.error(f"Organization mismatch for document: {document_id}")
        raise HTTPException(status_code=403, detail="Organization mismatch")

    try:
        s3.delete_file(s3_key)
        logger.info(f"Document deleted - document_id: {document_id}")
    except Exception as e:
        logger.error(f"Failed to delete document: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to delete document: {e}")

    return {"message": "Document deleted successfully"}
|
||||||
101
app/s3.py
Normal file
101
app/s3.py
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
import boto3
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
from botocore.client import Config
|
||||||
|
from fastapi import UploadFile
|
||||||
|
from app.config import settings
|
||||||
|
from app.logger import get_logger
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
def get_client():
    """Build a boto3 S3 client configured from application settings."""
    # SigV4 signing is required by MinIO-style endpoints and modern AWS.
    return boto3.client(
        "s3",
        region_name=settings.s3_region,
        endpoint_url=settings.s3_endpoint,
        aws_access_key_id=settings.s3_access_key,
        aws_secret_access_key=settings.s3_secret_key,
        config=Config(signature_version="s3v4"),
    )
|
||||||
|
|
||||||
|
def ensure_bucket_exists() -> None:
    """Ensure the S3 bucket exists, create it if it doesn't exist.

    Raises:
        Exception: If bucket creation fails (service will fail to start)
    """
    client = get_client()
    try:
        client.head_bucket(Bucket=settings.s3_bucket)
        logger.info(f"Bucket '{settings.s3_bucket}' already exists")
    except client.exceptions.ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            try:
                create_kwargs = {"Bucket": settings.s3_bucket}
                # BUG FIX: the S3 API rejects a CreateBucketConfiguration
                # with LocationConstraint for us-east-1 (the default
                # region); only pass it for other regions.
                if settings.s3_region and settings.s3_region != "us-east-1":
                    create_kwargs["CreateBucketConfiguration"] = {
                        'LocationConstraint': settings.s3_region
                    }
                client.create_bucket(**create_kwargs)
                logger.info(f"Created bucket '{settings.s3_bucket}'")
            except Exception as create_error:
                logger.error(f"Failed to create bucket '{settings.s3_bucket}': {create_error}")
                raise
        else:
            # Any non-404 error (403, network, etc.) is fatal at startup.
            logger.error(f"Error checking bucket: {e}")
            raise
|
||||||
|
|
||||||
|
def upload_file(file: UploadFile, s3_key: str, content_type: str, metadata: dict = None) -> str:
    """Upload file to S3 with metadata; returns the key it was stored under."""
    # Only attach a Metadata block when the caller supplied one.
    extra_args = {"ContentType": content_type}
    if metadata:
        extra_args["Metadata"] = metadata

    get_client().upload_fileobj(
        file.file,
        settings.s3_bucket,
        s3_key,
        ExtraArgs=extra_args,
    )
    return s3_key
|
||||||
|
|
||||||
|
def delete_file(s3_key: str) -> None:
    """Delete file from S3 (no error if the key does not exist, per S3 semantics -- TODO confirm against callers)."""
    get_client().delete_object(Bucket=settings.s3_bucket, Key=s3_key)
|
||||||
|
|
||||||
|
def file_exists(s3_key: str) -> bool:
    """Check if file exists in S3"""
    client = get_client()
    # HEAD the object; any ClientError (404, 403, ...) is treated as "absent".
    try:
        client.head_object(Bucket=settings.s3_bucket, Key=s3_key)
    except client.exceptions.ClientError:
        return False
    return True
|
||||||
|
|
||||||
|
def get_file_metadata(s3_key: str) -> dict:
    """Get the user-defined metadata dict stored with an S3 object."""
    head = get_client().head_object(Bucket=settings.s3_bucket, Key=s3_key)
    # "Metadata" holds only the x-amz-meta-* user metadata, not system headers.
    return head.get("Metadata", {})
|
||||||
|
|
||||||
|
def download_to_temp(s3_key: str) -> str:
    """Download file from S3 to a temp file and return its path.

    The temp file keeps the object's extension so downstream tools can
    detect the type. The caller is responsible for deleting the file.

    Raises:
        Exception: whatever boto3 raises if the download fails; the
            partially-written temp file is removed in that case.
    """
    client = get_client()
    suffix = os.path.splitext(s3_key)[-1] or ".tmp"
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        client.download_fileobj(settings.s3_bucket, s3_key, tmp)
    except Exception:
        # BUG FIX: previously a failed download leaked the temp file.
        tmp.close()
        os.unlink(tmp.name)
        raise
    tmp.close()
    return tmp.name
|
||||||
|
|
||||||
|
def presigned_download_url(s3_key: str, expires_in: int = 3600) -> str:
    """Generate a presigned GET URL valid for expires_in seconds."""
    params = {"Bucket": settings.s3_bucket, "Key": s3_key}
    return get_client().generate_presigned_url(
        "get_object",
        Params=params,
        ExpiresIn=expires_in,
    )
|
||||||
66
app/utils.py
Normal file
66
app/utils.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
import uuid
|
||||||
|
import magic
|
||||||
|
from fastapi import HTTPException, UploadFile
|
||||||
|
from app.config import settings
|
||||||
|
from app.enums import DocumentType
|
||||||
|
|
||||||
|
def generate_document_id() -> str:
    """Generate UUID for document"""
    # uuid4 gives a random, collision-resistant identifier.
    return f"{uuid.uuid4()}"
|
||||||
|
|
||||||
|
def s3_path_prefix(org_id: str, document_id: str) -> str:
    """Generate S3 path prefix for document operations"""
    # The trailing slash matters: this string is used as a ListObjects prefix.
    return "documents/" + org_id + "/" + document_id + "/"
|
||||||
|
|
||||||
|
def detect_content_type(file: UploadFile) -> str:
    """Detect content type using python-magic"""
    # Sniff only the first 2 KiB -- enough for magic-number detection --
    # and rewind so later readers see the full stream from the start.
    stream = file.file
    stream.seek(0)
    head = stream.read(2048)
    stream.seek(0)

    return magic.Magic(mime=True).from_buffer(head)
|
||||||
|
|
||||||
|
def detect_document_type(filename: str, content_type: str) -> DocumentType:
    """Detect document type from filename and content type"""
    # The MIME type is checked first; the file extension is only a fallback.
    by_mime = DocumentType.from_mime_type(content_type)
    return by_mime or DocumentType.from_extension(filename)
|
||||||
|
|
||||||
|
def get_file_size_limit(document_type: DocumentType) -> int:
    """Get max file size for document type"""
    # Per-type caps come from settings; unmapped types use the default cap.
    per_type = {
        DocumentType.PDF: settings.max_file_size_pdf,
        DocumentType.DOCX: settings.max_file_size_docx,
        DocumentType.XLSX: settings.max_file_size_xlsx,
        DocumentType.JPG: settings.max_file_size_jpg,
        DocumentType.JPEG: settings.max_file_size_jpeg,
        DocumentType.PNG: settings.max_file_size_png,
        DocumentType.GIF: settings.max_file_size_gif,
    }
    return per_type.get(document_type, settings.max_file_size_default)
|
||||||
|
|
||||||
|
def validate_file_size(file_size: int, document_type: DocumentType) -> None:
    """Validate file size against limits; raises HTTP 413 when exceeded."""
    max_size = get_file_size_limit(document_type)
    if file_size <= max_size:
        return
    raise HTTPException(
        status_code=413,
        detail=f"File size {file_size} exceeds maximum {max_size} for {document_type.value}"
    )
|
||||||
|
|
||||||
|
def document_s3_key(org_id: str, document_id: str, filename: str) -> str:
    """Generate the full S3 object key for a document.

    BUG FIX: the filename parameter was previously ignored and the
    literal string "(unknown)" was appended to the prefix instead.
    NOTE(review): callers may want sanitize_filename() applied here --
    confirm whether they already sanitize before calling.
    """
    return f"{s3_path_prefix(org_id, document_id)}{filename}"
|
||||||
|
|
||||||
|
def sanitize_filename(filename: str) -> str:
    """Sanitize filename for S3"""
    # Path separators become underscores first; then every character
    # outside the explicit allow-list is dropped entirely.
    cleaned = filename.replace("/", "_").replace("\\", "_")
    allowed = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_"
    return "".join(ch for ch in cleaned if ch in allowed)
|
||||||
99
flake.lock
generated
Normal file
99
flake.lock
generated
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1776548001,
|
||||||
|
"narHash": "sha256-ZSK0NL4a1BwVbbTBoSnWgbJy9HeZFXLYQizjb2DPF24=",
|
||||||
|
"owner": "nixos",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "b12141ef619e0a9c1c84dc8c684040326f27cdcc",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nixos",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"pyproject-build-systems": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
],
|
||||||
|
"pyproject-nix": [
|
||||||
|
"pyproject-nix"
|
||||||
|
],
|
||||||
|
"uv2nix": [
|
||||||
|
"uv2nix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1776659114,
|
||||||
|
"narHash": "sha256-qapCOQmR++yZSY43dzrp3wCrkOTLpod+ONtJWBk6iKU=",
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "build-system-pkgs",
|
||||||
|
"rev": "ffaa2161dd5d63e0e94591f86b54fc239660fb2e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "build-system-pkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"pyproject-nix": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1776715674,
|
||||||
|
"narHash": "sha256-Gs1VnEkCkkRZxJQAC/Dhz0Jbfi22mFXChbtNg9w/Ybg=",
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "pyproject.nix",
|
||||||
|
"rev": "69f57f27e52a87c54e28138a75ec741cd46663c9",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "pyproject.nix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": "nixpkgs",
|
||||||
|
"pyproject-build-systems": "pyproject-build-systems",
|
||||||
|
"pyproject-nix": "pyproject-nix",
|
||||||
|
"uv2nix": "uv2nix"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"uv2nix": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
],
|
||||||
|
"pyproject-nix": [
|
||||||
|
"pyproject-nix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1776718528,
|
||||||
|
"narHash": "sha256-XeGmo/BhkFXd8vVyendr3X4mQmw7CEkeQcpy7AHbVcg=",
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "uv2nix",
|
||||||
|
"rev": "60982c30e16db3e0cba6c0ed13f0894b06ab2bf1",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "uv2nix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
||||||
143
flake.nix
Normal file
143
flake.nix
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
{
|
||||||
|
description = "document-service using uv2nix";
|
||||||
|
|
||||||
|
inputs = {
|
||||||
|
nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
||||||
|
|
||||||
|
pyproject-nix = {
|
||||||
|
url = "github:pyproject-nix/pyproject.nix";
|
||||||
|
inputs.nixpkgs.follows = "nixpkgs";
|
||||||
|
};
|
||||||
|
|
||||||
|
uv2nix = {
|
||||||
|
url = "github:pyproject-nix/uv2nix";
|
||||||
|
inputs.pyproject-nix.follows = "pyproject-nix";
|
||||||
|
inputs.nixpkgs.follows = "nixpkgs";
|
||||||
|
};
|
||||||
|
|
||||||
|
pyproject-build-systems = {
|
||||||
|
url = "github:pyproject-nix/build-system-pkgs";
|
||||||
|
inputs.pyproject-nix.follows = "pyproject-nix";
|
||||||
|
inputs.uv2nix.follows = "uv2nix";
|
||||||
|
inputs.nixpkgs.follows = "nixpkgs";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
outputs =
|
||||||
|
{
|
||||||
|
nixpkgs,
|
||||||
|
pyproject-nix,
|
||||||
|
uv2nix,
|
||||||
|
pyproject-build-systems,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
let
|
||||||
|
inherit (nixpkgs) lib;
|
||||||
|
forAllSystems = lib.genAttrs lib.systems.flakeExposed;
|
||||||
|
|
||||||
|
workspace = uv2nix.lib.workspace.loadWorkspace { workspaceRoot = ./.; };
|
||||||
|
|
||||||
|
overlay = workspace.mkPyprojectOverlay {
|
||||||
|
sourcePreference = "wheel";
|
||||||
|
};
|
||||||
|
|
||||||
|
editableOverlay = workspace.mkEditablePyprojectOverlay {
|
||||||
|
root = "$REPO_ROOT";
|
||||||
|
};
|
||||||
|
|
||||||
|
pythonSets = forAllSystems (
|
||||||
|
system:
|
||||||
|
let
|
||||||
|
pkgs = nixpkgs.legacyPackages.${system};
|
||||||
|
python = pkgs.python3;
|
||||||
|
in
|
||||||
|
(pkgs.callPackage pyproject-nix.build.packages {
|
||||||
|
inherit python;
|
||||||
|
}).overrideScope
|
||||||
|
(
|
||||||
|
lib.composeManyExtensions [
|
||||||
|
pyproject-build-systems.overlays.wheel
|
||||||
|
overlay
|
||||||
|
]
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
in
|
||||||
|
{
|
||||||
|
devShells = forAllSystems (
|
||||||
|
system:
|
||||||
|
let
|
||||||
|
pkgs = nixpkgs.legacyPackages.${system};
|
||||||
|
pythonSet = pythonSets.${system}.overrideScope editableOverlay;
|
||||||
|
virtualenv = pythonSet.mkVirtualEnv "document-service-dev-env" workspace.deps.all;
|
||||||
|
in
|
||||||
|
{
|
||||||
|
default = pkgs.mkShell {
|
||||||
|
packages = [
|
||||||
|
virtualenv
|
||||||
|
pkgs.uv
|
||||||
|
pkgs.pyright
|
||||||
|
pkgs.file
|
||||||
|
];
|
||||||
|
env = {
|
||||||
|
UV_NO_SYNC = "1";
|
||||||
|
UV_PYTHON = pythonSet.python.interpreter;
|
||||||
|
UV_PYTHON_DOWNLOADS = "never";
|
||||||
|
LD_LIBRARY_PATH = "${pkgs.file.out}/lib:$LD_LIBRARY_PATH";
|
||||||
|
};
|
||||||
|
shellHook = ''
|
||||||
|
unset PYTHONPATH
|
||||||
|
export REPO_ROOT=$(git rev-parse --show-toplevel)
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
packages = forAllSystems (system: let
|
||||||
|
pkgs = nixpkgs.legacyPackages.${system};
|
||||||
|
pythonSet = pythonSets.${system}.overrideScope editableOverlay;
|
||||||
|
virtualenv = pythonSet.mkVirtualEnv "document-service-env" workspace.deps.default;
|
||||||
|
|
||||||
|
# Create a derivation that includes the application code
|
||||||
|
appCode = pkgs.stdenv.mkDerivation {
|
||||||
|
name = "document-service-code";
|
||||||
|
src = ./.;
|
||||||
|
installPhase = ''
|
||||||
|
mkdir -p $out/app
|
||||||
|
cp -r app/* $out/app/
|
||||||
|
cp pyproject.toml $out/
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
in {
|
||||||
|
default = virtualenv;
|
||||||
|
dockerImage = pkgs.dockerTools.buildLayeredImage {
|
||||||
|
name = "document-service";
|
||||||
|
contents = [
|
||||||
|
virtualenv
|
||||||
|
pkgs.bashInteractive
|
||||||
|
pkgs.busybox
|
||||||
|
pkgs.shadow
|
||||||
|
pkgs.file
|
||||||
|
pkgs.git # Include git for version info
|
||||||
|
appCode # Include application code
|
||||||
|
];
|
||||||
|
config = {
|
||||||
|
Cmd = ["/bin/python" "-m" "uvicorn" "app.main:app" "--host" "0.0.0.0" "--port" "8082"];
|
||||||
|
Env = [
|
||||||
|
"PYTHONUNBUFFERED=1"
|
||||||
|
"PYTHONPATH=/app"
|
||||||
|
"S3_ENDPOINT"
|
||||||
|
"S3_ACCESS_KEY"
|
||||||
|
"S3_SECRET_KEY"
|
||||||
|
"S3_BUCKET"
|
||||||
|
"S3_REGION"
|
||||||
|
"HOST"
|
||||||
|
"PORT"
|
||||||
|
"LOG_LEVEL"
|
||||||
|
];
|
||||||
|
WorkingDir = "/app";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
};
|
||||||
|
}
|
||||||
14
ops/chart/Chart.yaml
Normal file
14
ops/chart/Chart.yaml
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
apiVersion: v2
|
||||||
|
name: document-service
|
||||||
|
description: Generic document management service
|
||||||
|
type: application
|
||||||
|
version: 1.0.0
|
||||||
|
appVersion: "1.0.0"
|
||||||
|
keywords:
|
||||||
|
- python
|
||||||
|
- fastapi
|
||||||
|
- document-management
|
||||||
|
dependencies:
|
||||||
|
- name: common
|
||||||
|
version: "4.6.2"
|
||||||
|
repository: https://bjw-s-labs.github.io/helm-charts/
|
||||||
78
ops/chart/values.yaml
Normal file
78
ops/chart/values.yaml
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
controllers:
|
||||||
|
main:
|
||||||
|
enabled: true
|
||||||
|
type: deployment
|
||||||
|
replicas: 1
|
||||||
|
containers:
|
||||||
|
main:
|
||||||
|
image:
|
||||||
|
repository: gitea.corredorconect.com/software-engineering/document-service
|
||||||
|
tag: '{{ $.Chart.AppVersion }}'
|
||||||
|
env:
|
||||||
|
LOG_LEVEL: info
|
||||||
|
PORT: "8082"
|
||||||
|
S3_ENDPOINT:
|
||||||
|
value: "http://minio:9000"
|
||||||
|
S3_ACCESS_KEY:
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-secrets'
|
||||||
|
key: s3AccessKey
|
||||||
|
S3_SECRET_KEY:
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-secrets'
|
||||||
|
key: s3SecretKey
|
||||||
|
S3_BUCKET:
|
||||||
|
value: "document-bucket"
|
||||||
|
S3_REGION:
|
||||||
|
value: "us-east-1"
|
||||||
|
probes:
|
||||||
|
liveness:
|
||||||
|
enabled: true
|
||||||
|
custom: true
|
||||||
|
spec:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8082
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 10
|
||||||
|
readiness:
|
||||||
|
enabled: true
|
||||||
|
custom: true
|
||||||
|
spec:
|
||||||
|
httpGet:
|
||||||
|
path: /health/ready
|
||||||
|
port: 8082
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
|
||||||
|
service:
|
||||||
|
main:
|
||||||
|
controller: main
|
||||||
|
type: ClusterIP
|
||||||
|
ports:
|
||||||
|
http:
|
||||||
|
port: 8082
|
||||||
|
protocol: HTTP
|
||||||
|
|
||||||
|
external-secret:
|
||||||
|
enabled: true
|
||||||
|
apiVersion: external-secrets.io/v1
|
||||||
|
kind: ExternalSecret
|
||||||
|
suffix: secrets
|
||||||
|
spec:
|
||||||
|
spec:
|
||||||
|
refreshInterval: 0s
|
||||||
|
secretStoreRef:
|
||||||
|
name: cluster-secrets-store
|
||||||
|
kind: ClusterSecretStore
|
||||||
|
target:
|
||||||
|
name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-secrets'
|
||||||
|
creationPolicy: Owner
|
||||||
|
dataFrom:
|
||||||
|
- sourceRef:
|
||||||
|
generatorRef:
|
||||||
|
apiVersion: generators.external-secrets.io/v1alpha1
|
||||||
|
kind: Password
|
||||||
|
name: '{{ include "bjw-s.common.lib.chart.names.fullname" $ }}-password-generator'
|
||||||
32
pyproject.toml
Normal file
32
pyproject.toml
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
[project]
|
||||||
|
name = "document-service"
|
||||||
|
version = "1.0.0"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
dependencies = [
|
||||||
|
"fastapi>=0.115.0",
|
||||||
|
"uvicorn[standard]>=0.30.0",
|
||||||
|
"pypdf>=4.3.1",
|
||||||
|
"boto3>=1.35.0",
|
||||||
|
"python-multipart>=0.0.9",
|
||||||
|
"pydantic>=2.8.0",
|
||||||
|
"pydantic-settings>=2.4.0",
|
||||||
|
"python-magic>=0.4.27",
|
||||||
|
]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["hatchling", "editables"]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
packages = ["app"]
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"ruff>=0.6.0",
|
||||||
|
"pytest>=8.0.0",
|
||||||
|
"pytest-asyncio>=0.23.0",
|
||||||
|
"httpx>=0.27.0",
|
||||||
|
"reportlab>=4.0.0",
|
||||||
|
"pypdf>=4.3.1",
|
||||||
|
"moto>=5.0.0",
|
||||||
|
]
|
||||||
68
tests/conftest.py
Normal file
68
tests/conftest.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
"""
|
||||||
|
Test configuration and fixtures for document-service tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import os
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from unittest.mock import Mock, patch
|
||||||
|
from moto import mock_aws
|
||||||
|
import boto3
|
||||||
|
|
||||||
|
from app.main import app
|
||||||
|
|
||||||
|
# Test data paths
|
||||||
|
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
|
||||||
|
|
||||||
|
@pytest.fixture
def test_client():
    """Create a test client with auth bypass."""
    # NOTE(review): no auth bypass is wired up here; presumably the
    # middleware is patched elsewhere (see auth_bypass_middleware) -- confirm.
    client = TestClient(app)
    return client
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_org_id():
    """Sample organization ID for testing."""
    # Same value auth_bypass_middleware assigns to request.state.org_id.
    return "test-org-123"
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_document_id():
    """Sample document ID for testing."""
    return "test-doc-456"
|
||||||
|
|
||||||
|
@pytest.fixture
def test_pdf_files():
    """Paths to test PDF files."""
    # Each key maps to "<name>.pdf" inside the fixtures directory.
    names = ["simple_form", "complex_form", "no_form", "large_form"]
    return {name: os.path.join(FIXTURES_DIR, f"{name}.pdf") for name in names}
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_s3_client():
    """Create a mock S3 client for testing."""
    # moto intercepts boto3 calls for the lifetime of the context manager,
    # so the bucket created below exists only in memory.
    with mock_aws():
        s3_client = boto3.client(
            "s3",
            region_name="us-east-1",
            aws_access_key_id="minioadmin",
            aws_secret_access_key="minioadmin",
        )
        s3_client.create_bucket(Bucket="document-bucket")
        yield s3_client
|
||||||
|
|
||||||
|
@pytest.fixture
def auth_bypass_middleware():
    """Fixture to bypass auth middleware in tests."""

    def bypass_auth(request):
        # Inject the org id the real auth middleware would extract.
        request.state.org_id = "test-org-123"
        return request

    return bypass_auth
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_auth_token():
    """Sample auth token for testing."""
    # Static HS256-style JWT whose payload carries org_id "test-org-123".
    return "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJvcmdfaWQiOiJ0ZXN0LW9yZy0xMjMifQ.test"
|
||||||
304
tests/fixtures/complex_form.pdf
vendored
Normal file
304
tests/fixtures/complex_form.pdf
vendored
Normal file
@@ -0,0 +1,304 @@
|
|||||||
|
%PDF-1.3
|
||||||
|
%âãÏÓ
|
||||||
|
1 0 obj
|
||||||
|
<<
|
||||||
|
/Producer (pypdf)
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
2 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Pages
|
||||||
|
/Count 1
|
||||||
|
/Kids [ 4 0 R ]
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
3 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Catalog
|
||||||
|
/Pages 2 0 R
|
||||||
|
/AcroForm <<
|
||||||
|
/Fields [ <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (first\137name)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 690 400 710 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (last\137name)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 640 400 660 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (email)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 590 400 610 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (phone)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 540 400 560 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (address)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 490 400 510 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (city)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 440 400 460 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (state)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 390 400 410 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (zip\137code)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 340 400 360 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Ch
|
||||||
|
/T (country)
|
||||||
|
/V ()
|
||||||
|
/Opt [ (USA) (Canada) (UK) (Germany) (France) ]
|
||||||
|
/Rect [ 200 290 400 310 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Btn
|
||||||
|
/T (gender)
|
||||||
|
/V (male)
|
||||||
|
/Rect [ 200 240 220 260 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Btn
|
||||||
|
/T (gender)
|
||||||
|
/V (female)
|
||||||
|
/Rect [ 300 240 320 260 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Btn
|
||||||
|
/T (reading)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 190 220 210 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Btn
|
||||||
|
/T (sports)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 160 220 180 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Btn
|
||||||
|
/T (music)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 130 220 150 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Btn
|
||||||
|
/T (travel)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 100 220 120 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Btn
|
||||||
|
/T (agree\137terms)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 140 220 160 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (signature)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 90 400 110 ]
|
||||||
|
/Ff 0
|
||||||
|
>> ]
|
||||||
|
>>
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
4 0 obj
|
||||||
|
<<
|
||||||
|
/Contents 5 0 R
|
||||||
|
/MediaBox [ 0 0 612 792 ]
|
||||||
|
/Resources <<
|
||||||
|
/Font 6 0 R
|
||||||
|
/ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||||
|
>>
|
||||||
|
/Rotate 0
|
||||||
|
/Trans <<
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
/Parent 2 0 R
|
||||||
|
/Annots [ <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (first\137name)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 690 400 710 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (last\137name)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 640 400 660 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (email)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 590 400 610 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (phone)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 540 400 560 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (address)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 490 400 510 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (city)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 440 400 460 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (state)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 390 400 410 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (zip\137code)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 340 400 360 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Ch
|
||||||
|
/T (country)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 290 400 310 ]
|
||||||
|
/Ff 0
|
||||||
|
/Opt [ (USA) (Canada) (UK) (Germany) (France) ]
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Btn
|
||||||
|
/T (gender)
|
||||||
|
/V (male)
|
||||||
|
/Rect [ 200 240 220 260 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Btn
|
||||||
|
/T (gender)
|
||||||
|
/V (female)
|
||||||
|
/Rect [ 300 240 320 260 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Btn
|
||||||
|
/T (reading)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 190 220 210 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Btn
|
||||||
|
/T (sports)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 160 220 180 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Btn
|
||||||
|
/T (music)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 130 220 150 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Btn
|
||||||
|
/T (travel)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 100 220 120 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Btn
|
||||||
|
/T (agree\137terms)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 140 220 160 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (signature)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 90 400 110 ]
|
||||||
|
/Ff 0
|
||||||
|
>> ]
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
5 0 obj
|
||||||
|
<<
|
||||||
|
/Filter [ /ASCII85Decode /FlateDecode ]
|
||||||
|
/Length 291
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
GasbV_+Fea&;KY%MZ9UrC9m8.oN"UdKHc".Gmj%B,>D(A;p`!tWO(4\)'k<]nE'P8R95j8f]2oKJNJY1f"tI,Dm8oIL>-,'An-7/XP_7&hmsPV2$VZlJVuKljga3q-e_fL*;+[hpAoJXWqmrLU,"s52O'g'kTenY-)^6!E]<t>XGGKULRl:>id?'u8b4h!>BX;G^/rC%S5.uq%27\VHe*eP7/%>f=QN:Hc+'*-ihD-.,/'o(;:.X+4s[#!Dq5i9,$f'o&NC;.U."[j3.eA/Se#D\)eRtd.%ou~>
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
6 0 obj
|
||||||
|
<<
|
||||||
|
/F1 7 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
7 0 obj
|
||||||
|
<<
|
||||||
|
/BaseFont /Helvetica
|
||||||
|
/Encoding /WinAnsiEncoding
|
||||||
|
/Name /F1
|
||||||
|
/Subtype /Type1
|
||||||
|
/Type /Font
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
xref
|
||||||
|
0 8
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000015 00000 n
|
||||||
|
0000000054 00000 n
|
||||||
|
0000000113 00000 n
|
||||||
|
0000001378 00000 n
|
||||||
|
0000003056 00000 n
|
||||||
|
0000003438 00000 n
|
||||||
|
0000003469 00000 n
|
||||||
|
trailer
|
||||||
|
<<
|
||||||
|
/Size 8
|
||||||
|
/Root 3 0 R
|
||||||
|
/Info 1 0 R
|
||||||
|
>>
|
||||||
|
startxref
|
||||||
|
3576
|
||||||
|
%%EOF
|
||||||
371
tests/fixtures/generate_test_pdfs.py
vendored
Normal file
371
tests/fixtures/generate_test_pdfs.py
vendored
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
"""
|
||||||
|
Generate test PDF files for document-service testing.
|
||||||
|
|
||||||
|
This script creates various test PDFs with actual AcroForm fields:
|
||||||
|
- Simple form PDF with basic form fields
|
||||||
|
- Complex form PDF with multiple field types
|
||||||
|
- No form PDF without form fields
|
||||||
|
- Large form PDF for size validation testing
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from reportlab.pdfgen import canvas
|
||||||
|
from reportlab.lib.pagesizes import letter
|
||||||
|
from reportlab.lib import colors
|
||||||
|
from pypdf import PdfReader, PdfWriter
|
||||||
|
from pypdf.generic import (
|
||||||
|
NameObject,
|
||||||
|
create_string_object,
|
||||||
|
NumberObject,
|
||||||
|
ArrayObject,
|
||||||
|
DictionaryObject,
|
||||||
|
BooleanObject,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Output directory
|
||||||
|
OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
def create_simple_form_pdf():
|
||||||
|
"""Create a simple PDF with basic form fields."""
|
||||||
|
output_path = os.path.join(OUTPUT_DIR, "simple_form.pdf")
|
||||||
|
|
||||||
|
# Create base PDF with reportlab
|
||||||
|
c = canvas.Canvas(output_path, pagesize=letter)
|
||||||
|
c.setFont("Helvetica", 16)
|
||||||
|
c.drawString(100, 750, "Simple Form Test")
|
||||||
|
|
||||||
|
c.setFont("Helvetica", 12)
|
||||||
|
c.drawString(100, 700, "Name:")
|
||||||
|
c.drawString(100, 650, "Email:")
|
||||||
|
c.drawString(100, 600, "Phone:")
|
||||||
|
c.drawString(100, 550, "Country:")
|
||||||
|
c.drawString(100, 500, "Birth Date:")
|
||||||
|
c.drawString(100, 450, "Agree to Terms:")
|
||||||
|
|
||||||
|
c.save()
|
||||||
|
|
||||||
|
# Add actual form fields using pypdf
|
||||||
|
reader = PdfReader(output_path)
|
||||||
|
writer = PdfWriter()
|
||||||
|
|
||||||
|
# Copy the page
|
||||||
|
page = reader.pages[0]
|
||||||
|
writer.add_page(page)
|
||||||
|
|
||||||
|
# Create form fields
|
||||||
|
fields = []
|
||||||
|
|
||||||
|
# Name field (text)
|
||||||
|
name_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Tx"),
|
||||||
|
NameObject("/T"): create_string_object("name"),
|
||||||
|
NameObject("/V"): create_string_object(""),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(690), NumberObject(400), NumberObject(710)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(name_field)
|
||||||
|
|
||||||
|
# Email field (text)
|
||||||
|
email_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Tx"),
|
||||||
|
NameObject("/T"): create_string_object("email"),
|
||||||
|
NameObject("/V"): create_string_object(""),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(640), NumberObject(400), NumberObject(660)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(email_field)
|
||||||
|
|
||||||
|
# Phone field (text)
|
||||||
|
phone_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Tx"),
|
||||||
|
NameObject("/T"): create_string_object("phone"),
|
||||||
|
NameObject("/V"): create_string_object(""),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(590), NumberObject(400), NumberObject(610)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(phone_field)
|
||||||
|
|
||||||
|
# Country field (dropdown/choice)
|
||||||
|
country_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Ch"),
|
||||||
|
NameObject("/T"): create_string_object("country"),
|
||||||
|
NameObject("/V"): create_string_object(""),
|
||||||
|
NameObject("/Opt"): ArrayObject([
|
||||||
|
create_string_object("USA"),
|
||||||
|
create_string_object("Canada"),
|
||||||
|
create_string_object("UK"),
|
||||||
|
create_string_object("Germany"),
|
||||||
|
create_string_object("France"),
|
||||||
|
]),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(540), NumberObject(400), NumberObject(560)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(country_field)
|
||||||
|
|
||||||
|
# Birth date field (text)
|
||||||
|
birth_date_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Tx"),
|
||||||
|
NameObject("/T"): create_string_object("birth_date"),
|
||||||
|
NameObject("/V"): create_string_object(""),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(490), NumberObject(400), NumberObject(510)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(birth_date_field)
|
||||||
|
|
||||||
|
# Agree terms field (checkbox)
|
||||||
|
agree_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Btn"),
|
||||||
|
NameObject("/T"): create_string_object("agree_terms"),
|
||||||
|
NameObject("/V"): NameObject("/Off"),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(440), NumberObject(220), NumberObject(460)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(agree_field)
|
||||||
|
|
||||||
|
# Add fields to the page
|
||||||
|
page = writer.pages[0]
|
||||||
|
if "/Annots" not in page:
|
||||||
|
page[NameObject("/Annots")] = ArrayObject()
|
||||||
|
|
||||||
|
for field in fields:
|
||||||
|
field_obj = DictionaryObject({
|
||||||
|
NameObject("/Subtype"): NameObject("/Widget"),
|
||||||
|
NameObject("/FT"): field[NameObject("/FT")],
|
||||||
|
NameObject("/T"): field[NameObject("/T")],
|
||||||
|
NameObject("/V"): field.get(NameObject("/V"), NameObject("")),
|
||||||
|
NameObject("/Rect"): field[NameObject("/Rect")],
|
||||||
|
NameObject("/Ff"): field.get(NameObject("/Ff"), NumberObject(0)),
|
||||||
|
})
|
||||||
|
|
||||||
|
if NameObject("/Opt") in field:
|
||||||
|
field_obj[NameObject("/Opt")] = field[NameObject("/Opt")]
|
||||||
|
|
||||||
|
page[NameObject("/Annots")].append(field_obj)
|
||||||
|
|
||||||
|
# Add AcroForm to the document
|
||||||
|
acroform = DictionaryObject({
|
||||||
|
NameObject("/Fields"): ArrayObject(fields),
|
||||||
|
})
|
||||||
|
writer._root_object[NameObject("/AcroForm")] = acroform
|
||||||
|
|
||||||
|
# Save the PDF
|
||||||
|
with open(output_path, "wb") as f:
|
||||||
|
writer.write(f)
|
||||||
|
|
||||||
|
print(f"Created: {output_path}")
|
||||||
|
|
||||||
|
def create_complex_form_pdf():
|
||||||
|
"""Create a complex PDF with multiple field types."""
|
||||||
|
output_path = os.path.join(OUTPUT_DIR, "complex_form.pdf")
|
||||||
|
|
||||||
|
# Create base PDF with reportlab
|
||||||
|
c = canvas.Canvas(output_path, pagesize=letter)
|
||||||
|
c.setFont("Helvetica", 16)
|
||||||
|
c.drawString(100, 750, "Complex Form Test")
|
||||||
|
|
||||||
|
c.setFont("Helvetica", 12)
|
||||||
|
c.drawString(100, 700, "First Name:")
|
||||||
|
c.drawString(100, 650, "Last Name:")
|
||||||
|
c.drawString(100, 600, "Email:")
|
||||||
|
c.drawString(100, 550, "Phone:")
|
||||||
|
c.drawString(100, 500, "Address:")
|
||||||
|
c.drawString(100, 450, "City:")
|
||||||
|
c.drawString(100, 400, "State:")
|
||||||
|
c.drawString(100, 350, "Zip Code:")
|
||||||
|
c.drawString(100, 300, "Country:")
|
||||||
|
c.drawString(100, 250, "Gender:")
|
||||||
|
c.drawString(100, 200, "Interests:")
|
||||||
|
c.drawString(100, 150, "Agree to Terms:")
|
||||||
|
c.drawString(100, 100, "Signature:")
|
||||||
|
|
||||||
|
c.save()
|
||||||
|
|
||||||
|
# Add actual form fields using pypdf
|
||||||
|
reader = PdfReader(output_path)
|
||||||
|
writer = PdfWriter()
|
||||||
|
|
||||||
|
# Copy the page
|
||||||
|
page = reader.pages[0]
|
||||||
|
writer.add_page(page)
|
||||||
|
|
||||||
|
# Create form fields
|
||||||
|
fields = []
|
||||||
|
|
||||||
|
# Text fields
|
||||||
|
text_fields = [
|
||||||
|
('first_name', 200, 690),
|
||||||
|
('last_name', 200, 640),
|
||||||
|
('email', 200, 590),
|
||||||
|
('phone', 200, 540),
|
||||||
|
('address', 200, 490),
|
||||||
|
('city', 200, 440),
|
||||||
|
('state', 200, 390),
|
||||||
|
('zip_code', 200, 340),
|
||||||
|
]
|
||||||
|
|
||||||
|
for name, x, y in text_fields:
|
||||||
|
field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Tx"),
|
||||||
|
NameObject("/T"): create_string_object(name),
|
||||||
|
NameObject("/V"): create_string_object(""),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(x), NumberObject(y), NumberObject(x + 200), NumberObject(y + 20)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(field)
|
||||||
|
|
||||||
|
# Country dropdown
|
||||||
|
country_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Ch"),
|
||||||
|
NameObject("/T"): create_string_object("country"),
|
||||||
|
NameObject("/V"): create_string_object(""),
|
||||||
|
NameObject("/Opt"): ArrayObject([
|
||||||
|
create_string_object("USA"),
|
||||||
|
create_string_object("Canada"),
|
||||||
|
create_string_object("UK"),
|
||||||
|
create_string_object("Germany"),
|
||||||
|
create_string_object("France"),
|
||||||
|
]),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(290), NumberObject(400), NumberObject(310)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(country_field)
|
||||||
|
|
||||||
|
# Radio buttons for gender
|
||||||
|
male_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Btn"),
|
||||||
|
NameObject("/T"): create_string_object("gender"),
|
||||||
|
NameObject("/V"): create_string_object("male"),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(240), NumberObject(220), NumberObject(260)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(male_field)
|
||||||
|
|
||||||
|
female_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Btn"),
|
||||||
|
NameObject("/T"): create_string_object("gender"),
|
||||||
|
NameObject("/V"): create_string_object("female"),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(300), NumberObject(240), NumberObject(320), NumberObject(260)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(female_field)
|
||||||
|
|
||||||
|
# Checkboxes for interests
|
||||||
|
interests = ['reading', 'sports', 'music', 'travel']
|
||||||
|
for i, interest in enumerate(interests):
|
||||||
|
field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Btn"),
|
||||||
|
NameObject("/T"): create_string_object(interest),
|
||||||
|
NameObject("/V"): NameObject("/Off"),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(190 - (i * 30)), NumberObject(220), NumberObject(210 - (i * 30))]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(field)
|
||||||
|
|
||||||
|
# Checkbox for agree terms
|
||||||
|
agree_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Btn"),
|
||||||
|
NameObject("/T"): create_string_object("agree_terms"),
|
||||||
|
NameObject("/V"): NameObject("/Off"),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(140), NumberObject(220), NumberObject(160)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(agree_field)
|
||||||
|
|
||||||
|
# Signature field
|
||||||
|
signature_field = DictionaryObject({
|
||||||
|
NameObject("/FT"): NameObject("/Tx"),
|
||||||
|
NameObject("/T"): create_string_object("signature"),
|
||||||
|
NameObject("/V"): create_string_object(""),
|
||||||
|
NameObject("/Rect"): ArrayObject([NumberObject(200), NumberObject(90), NumberObject(400), NumberObject(110)]),
|
||||||
|
NameObject("/Ff"): NumberObject(0),
|
||||||
|
})
|
||||||
|
fields.append(signature_field)
|
||||||
|
|
||||||
|
# Add fields to the page
|
||||||
|
page = writer.pages[0]
|
||||||
|
if "/Annots" not in page:
|
||||||
|
page[NameObject("/Annots")] = ArrayObject()
|
||||||
|
|
||||||
|
for field in fields:
|
||||||
|
field_obj = DictionaryObject({
|
||||||
|
NameObject("/Subtype"): NameObject("/Widget"),
|
||||||
|
NameObject("/FT"): field[NameObject("/FT")],
|
||||||
|
NameObject("/T"): field[NameObject("/T")],
|
||||||
|
NameObject("/V"): field.get(NameObject("/V"), NameObject("")),
|
||||||
|
NameObject("/Rect"): field[NameObject("/Rect")],
|
||||||
|
NameObject("/Ff"): field.get(NameObject("/Ff"), NumberObject(0)),
|
||||||
|
})
|
||||||
|
|
||||||
|
if NameObject("/Opt") in field:
|
||||||
|
field_obj[NameObject("/Opt")] = field[NameObject("/Opt")]
|
||||||
|
|
||||||
|
page[NameObject("/Annots")].append(field_obj)
|
||||||
|
|
||||||
|
# Add AcroForm to the document
|
||||||
|
acroform = DictionaryObject({
|
||||||
|
NameObject("/Fields"): ArrayObject(fields),
|
||||||
|
})
|
||||||
|
writer._root_object[NameObject("/AcroForm")] = acroform
|
||||||
|
|
||||||
|
# Save the PDF
|
||||||
|
with open(output_path, "wb") as f:
|
||||||
|
writer.write(f)
|
||||||
|
|
||||||
|
print(f"Created: {output_path}")
|
||||||
|
|
||||||
|
def create_no_form_pdf():
|
||||||
|
"""Create a PDF without form fields."""
|
||||||
|
output_path = os.path.join(OUTPUT_DIR, "no_form.pdf")
|
||||||
|
|
||||||
|
# Create simple PDF without form fields
|
||||||
|
c = canvas.Canvas(output_path, pagesize=letter)
|
||||||
|
c.setFont("Helvetica", 16)
|
||||||
|
c.drawString(100, 750, "No Form Test")
|
||||||
|
|
||||||
|
c.setFont("Helvetica", 12)
|
||||||
|
c.drawString(100, 700, "This PDF has no form fields.")
|
||||||
|
c.drawString(100, 650, "It is used for testing field discovery")
|
||||||
|
c.drawString(100, 600, "on documents without AcroForm fields.")
|
||||||
|
|
||||||
|
c.save()
|
||||||
|
|
||||||
|
print(f"Created: {output_path}")
|
||||||
|
|
||||||
|
def create_large_form_pdf():
|
||||||
|
"""Create a large PDF for size validation testing."""
|
||||||
|
output_path = os.path.join(OUTPUT_DIR, "large_form.pdf")
|
||||||
|
|
||||||
|
# Create a larger PDF with more content
|
||||||
|
c = canvas.Canvas(output_path, pagesize=letter)
|
||||||
|
c.setFont("Helvetica", 16)
|
||||||
|
c.drawString(100, 750, "Large Form Test")
|
||||||
|
|
||||||
|
c.setFont("Helvetica", 12)
|
||||||
|
y = 700
|
||||||
|
for i in range(50):
|
||||||
|
c.drawString(100, y, f"Field {i + 1}:")
|
||||||
|
y -= 50
|
||||||
|
if y < 50:
|
||||||
|
c.showPage()
|
||||||
|
y = 700
|
||||||
|
|
||||||
|
c.save()
|
||||||
|
|
||||||
|
print(f"Created: {output_path}")
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Generate all test PDF files."""
|
||||||
|
print("Generating test PDF files...")
|
||||||
|
print(f"Output directory: {OUTPUT_DIR}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
create_simple_form_pdf()
|
||||||
|
create_complex_form_pdf()
|
||||||
|
create_no_form_pdf()
|
||||||
|
create_large_form_pdf()
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("All test PDF files generated successfully!")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
125
tests/fixtures/large_form.pdf
vendored
Normal file
125
tests/fixtures/large_form.pdf
vendored
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
%PDF-1.3
|
||||||
|
%“Œ‹ž ReportLab Generated PDF document (opensource)
|
||||||
|
1 0 obj
|
||||||
|
<<
|
||||||
|
/F1 2 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
2 0 obj
|
||||||
|
<<
|
||||||
|
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
3 0 obj
|
||||||
|
<<
|
||||||
|
/Contents 10 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
|
||||||
|
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||||
|
>> /Rotate 0 /Trans <<
|
||||||
|
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
4 0 obj
|
||||||
|
<<
|
||||||
|
/Contents 11 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
|
||||||
|
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||||
|
>> /Rotate 0 /Trans <<
|
||||||
|
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
5 0 obj
|
||||||
|
<<
|
||||||
|
/Contents 12 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
|
||||||
|
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||||
|
>> /Rotate 0 /Trans <<
|
||||||
|
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
6 0 obj
|
||||||
|
<<
|
||||||
|
/Contents 13 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
|
||||||
|
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||||
|
>> /Rotate 0 /Trans <<
|
||||||
|
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
7 0 obj
|
||||||
|
<<
|
||||||
|
/PageMode /UseNone /Pages 9 0 R /Type /Catalog
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
8 0 obj
|
||||||
|
<<
|
||||||
|
/Author (anonymous) /CreationDate (D:19800101000000+00'00') /Creator (anonymous) /Keywords () /ModDate (D:19800101000000+00'00') /Producer (ReportLab PDF Library - \(opensource\))
|
||||||
|
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
9 0 obj
|
||||||
|
<<
|
||||||
|
/Count 4 /Kids [ 3 0 R 4 0 R 5 0 R 6 0 R ] /Type /Pages
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
10 0 obj
|
||||||
|
<<
|
||||||
|
/Filter [ /ASCII85Decode /FlateDecode ] /Length 217
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
Gas30YmS?5&;9"+:GJ\L`7rI@@Oq[]V;)ju4[h(2dJ$.fMDlYNi/6XZ9/-MBqIFpH"0bWR4+VY?&JE4dmBP4$H`s>o>Pd5_5(knN-9C@@=hbnO$/KG<T]uHC6SHeT%fQ2(61,2)kB&jPeh#ln*V7]`-(1#q7P]TrOr967OBGd6R>k'EA?N"sbgn1*RGt<48$Z/.<iqdC<HBN;BdXTjQboF?~>endstream
|
||||||
|
endobj
|
||||||
|
11 0 obj
|
||||||
|
<<
|
||||||
|
/Filter [ /ASCII85Decode /FlateDecode ] /Length 179
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
Gas30^C%h3*5qB\:N<.Pcs3$Hl<(9Sj6mHT",_O,eK?ILEeIs/+25o1W?$HFlO(jerB`1_*amY9`!,>fg-:(O.:HsM<c")brI"e6WCOT4gHTe]6:XPR3Z2,/H>lia7mi26F)k6[R>)2Tc&QO]0JmRQ33#uf(:EGYU/pYb,%W<I+0;`+EW~>endstream
|
||||||
|
endobj
|
||||||
|
12 0 obj
|
||||||
|
<<
|
||||||
|
/Filter [ /ASCII85Decode /FlateDecode ] /Length 182
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
Gas30YmS?%'SYMZ:N8jHd+m]ZXcA"(*:Fj!$As93eK>>CO@)QnnF80POP6tcHWu&Bi%Q$",OR8C45u,jFR@u"e5F01DQMJaO6&5D+&?+Z'=%F%qt`rY;O"3#"KbqRMK6*1l<JI#\QT.g>jW9fl6'd&lDQ+4eQPFB=)/[R?*6VZ`^9D([>Kog~>endstream
|
||||||
|
endobj
|
||||||
|
13 0 obj
|
||||||
|
<<
|
||||||
|
/Filter [ /ASCII85Decode /FlateDecode ] /Length 147
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
Gas3+3spL'$q8S#<P23]FJ9Y&V4a)bG2NT>h1+`5('Z%;U^2`KE+.t@o*+c<HmDMhfg)&^AATHdpsVmX3RhL!69O]%\U_jUJK0dDLK7_Y[]$?TK6gh*/?5bY6!78.Ms>%mcr*lWqbfg@lpOeX~>endstream
|
||||||
|
endobj
|
||||||
|
xref
|
||||||
|
0 14
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000061 00000 n
|
||||||
|
0000000092 00000 n
|
||||||
|
0000000199 00000 n
|
||||||
|
0000000393 00000 n
|
||||||
|
0000000587 00000 n
|
||||||
|
0000000781 00000 n
|
||||||
|
0000000975 00000 n
|
||||||
|
0000001043 00000 n
|
||||||
|
0000001304 00000 n
|
||||||
|
0000001381 00000 n
|
||||||
|
0000001689 00000 n
|
||||||
|
0000001959 00000 n
|
||||||
|
0000002232 00000 n
|
||||||
|
trailer
|
||||||
|
<<
|
||||||
|
/ID
|
||||||
|
[<30157dc3b9cf65b8d1eaf3493559908e><30157dc3b9cf65b8d1eaf3493559908e>]
|
||||||
|
% ReportLab generated PDF document -- digest (opensource)
|
||||||
|
|
||||||
|
/Info 8 0 R
|
||||||
|
/Root 7 0 R
|
||||||
|
/Size 14
|
||||||
|
>>
|
||||||
|
startxref
|
||||||
|
2470
|
||||||
|
%%EOF
|
||||||
68
tests/fixtures/no_form.pdf
vendored
Normal file
68
tests/fixtures/no_form.pdf
vendored
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
%PDF-1.3
|
||||||
|
%“Œ‹ž ReportLab Generated PDF document (opensource)
|
||||||
|
1 0 obj
|
||||||
|
<<
|
||||||
|
/F1 2 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
2 0 obj
|
||||||
|
<<
|
||||||
|
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
3 0 obj
|
||||||
|
<<
|
||||||
|
/Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources <<
|
||||||
|
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||||
|
>> /Rotate 0 /Trans <<
|
||||||
|
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
4 0 obj
|
||||||
|
<<
|
||||||
|
/PageMode /UseNone /Pages 6 0 R /Type /Catalog
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
5 0 obj
|
||||||
|
<<
|
||||||
|
/Author (anonymous) /CreationDate (D:19800101000000+00'00') /Creator (anonymous) /Keywords () /ModDate (D:19800101000000+00'00') /Producer (ReportLab PDF Library - \(opensource\))
|
||||||
|
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
6 0 obj
|
||||||
|
<<
|
||||||
|
/Count 1 /Kids [ 3 0 R ] /Type /Pages
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
7 0 obj
|
||||||
|
<<
|
||||||
|
/Filter [ /ASCII85Decode /FlateDecode ] /Length 226
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
Gas2Bb6l*?&4Q?lMRuh(2(>rm;UL(=iaR@%P12s;!_o]ip\#oA:h3rL(XCuYYkiVA702`\bERWLTF<pmA'bMe$GLl8m[Gp,mCZM>`irc(:k@<Q,.1t_;U3TSGL0f4RBV`'XKta+*A74'q:3;`A;r@nl60Fm[LVPtD`E'mGib0+5kmB/Rp3p#C+&@HQ1$r/^;:dZ/#koRn*nah\!>!7PW#)X61=m`OB9!~>endstream
|
||||||
|
endobj
|
||||||
|
xref
|
||||||
|
0 8
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000061 00000 n
|
||||||
|
0000000092 00000 n
|
||||||
|
0000000199 00000 n
|
||||||
|
0000000392 00000 n
|
||||||
|
0000000460 00000 n
|
||||||
|
0000000721 00000 n
|
||||||
|
0000000780 00000 n
|
||||||
|
trailer
|
||||||
|
<<
|
||||||
|
/ID
|
||||||
|
[<30157dc3b9cf65b8d1eaf3493559908e><30157dc3b9cf65b8d1eaf3493559908e>]
|
||||||
|
% ReportLab generated PDF document -- digest (opensource)
|
||||||
|
|
||||||
|
/Info 5 0 R
|
||||||
|
/Root 4 0 R
|
||||||
|
/Size 8
|
||||||
|
>>
|
||||||
|
startxref
|
||||||
|
1096
|
||||||
|
%%EOF
|
||||||
161
tests/fixtures/simple_form.pdf
vendored
Normal file
161
tests/fixtures/simple_form.pdf
vendored
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
%PDF-1.3
|
||||||
|
%âãÏÓ
|
||||||
|
1 0 obj
|
||||||
|
<<
|
||||||
|
/Producer (pypdf)
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
2 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Pages
|
||||||
|
/Count 1
|
||||||
|
/Kids [ 4 0 R ]
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
3 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Catalog
|
||||||
|
/Pages 2 0 R
|
||||||
|
/AcroForm <<
|
||||||
|
/Fields [ <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (name)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 690 400 710 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (email)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 640 400 660 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (phone)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 590 400 610 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Ch
|
||||||
|
/T (country)
|
||||||
|
/V ()
|
||||||
|
/Opt [ (USA) (Canada) (UK) (Germany) (France) ]
|
||||||
|
/Rect [ 200 540 400 560 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Tx
|
||||||
|
/T (birth\137date)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 490 400 510 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/FT /Btn
|
||||||
|
/T (agree\137terms)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 440 220 460 ]
|
||||||
|
/Ff 0
|
||||||
|
>> ]
|
||||||
|
>>
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
4 0 obj
|
||||||
|
<<
|
||||||
|
/Contents 5 0 R
|
||||||
|
/MediaBox [ 0 0 612 792 ]
|
||||||
|
/Resources <<
|
||||||
|
/Font 6 0 R
|
||||||
|
/ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||||
|
>>
|
||||||
|
/Rotate 0
|
||||||
|
/Trans <<
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
/Parent 2 0 R
|
||||||
|
/Annots [ <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (name)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 690 400 710 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (email)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 640 400 660 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (phone)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 590 400 610 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Ch
|
||||||
|
/T (country)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 540 400 560 ]
|
||||||
|
/Ff 0
|
||||||
|
/Opt [ (USA) (Canada) (UK) (Germany) (France) ]
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Tx
|
||||||
|
/T (birth\137date)
|
||||||
|
/V ()
|
||||||
|
/Rect [ 200 490 400 510 ]
|
||||||
|
/Ff 0
|
||||||
|
>> <<
|
||||||
|
/Subtype /Widget
|
||||||
|
/FT /Btn
|
||||||
|
/T (agree\137terms)
|
||||||
|
/V /Off
|
||||||
|
/Rect [ 200 440 220 460 ]
|
||||||
|
/Ff 0
|
||||||
|
>> ]
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
5 0 obj
|
||||||
|
<<
|
||||||
|
/Filter [ /ASCII85Decode /FlateDecode ]
|
||||||
|
/Length 214
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
Gas3/_$YcZ&-h():[oO-KC+O7Fj&337*rSs`0Q/<`k!1:qntBjLh1!*5Q?*5,9cn2L]>4V7T^E=1'1`)j"LZXOAkYndii(Rd4^iHO@!??#S:KhY5-Hn'\Y63F`n8+K,.t]c\@9%516]H[@*&9CT1O*F'1H9T&WS2DLGjN]UaM[f"?B)-YBck(&"KsZ*@fJ2kq(gmZ1he)\4'9")1e>M#~>
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
6 0 obj
|
||||||
|
<<
|
||||||
|
/F1 7 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
7 0 obj
|
||||||
|
<<
|
||||||
|
/BaseFont /Helvetica
|
||||||
|
/Encoding /WinAnsiEncoding
|
||||||
|
/Name /F1
|
||||||
|
/Subtype /Type1
|
||||||
|
/Type /Font
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
xref
|
||||||
|
0 8
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000015 00000 n
|
||||||
|
0000000054 00000 n
|
||||||
|
0000000113 00000 n
|
||||||
|
0000000637 00000 n
|
||||||
|
0000001387 00000 n
|
||||||
|
0000001692 00000 n
|
||||||
|
0000001723 00000 n
|
||||||
|
trailer
|
||||||
|
<<
|
||||||
|
/Size 8
|
||||||
|
/Root 3 0 R
|
||||||
|
/Info 1 0 R
|
||||||
|
>>
|
||||||
|
startxref
|
||||||
|
1830
|
||||||
|
%%EOF
|
||||||
464
tests/test_documents.py
Normal file
464
tests/test_documents.py
Normal file
@@ -0,0 +1,464 @@
|
|||||||
|
"""
|
||||||
|
Comprehensive test suite for document-service.
|
||||||
|
|
||||||
|
Tests document upload, retrieval, field discovery, and complete workflows.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import os
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from unittest.mock import Mock, patch
|
||||||
|
from moto import mock_aws
|
||||||
|
import boto3
|
||||||
|
|
||||||
|
from app.main import app
|
||||||
|
from app.pdf import discover_fields
|
||||||
|
|
||||||
|
# Test data paths
|
||||||
|
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
|
||||||
|
|
||||||
|
|
||||||
|
class TestHealthEndpoint:
|
||||||
|
"""Test health endpoint functionality."""
|
||||||
|
|
||||||
|
def test_health_endpoint(self, test_client):
|
||||||
|
"""Test health endpoint returns 200 OK."""
|
||||||
|
response = test_client.get("/health")
|
||||||
|
assert response.status_code == 200
|
||||||
|
assert response.json() == {"status": "ok"}
|
||||||
|
|
||||||
|
|
||||||
|
class TestDocumentUpload:
|
||||||
|
"""Test document upload functionality."""
|
||||||
|
|
||||||
|
def test_upload_simple_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
|
||||||
|
"""Test uploading a simple PDF with form fields."""
|
||||||
|
with open(test_pdf_files["simple_form"], "rb") as f:
|
||||||
|
files = {"file": ("simple_form.pdf", f, "application/pdf")}
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
files=files,
|
||||||
|
data=data,
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert response.status_code == 201
|
||||||
|
result = response.json()
|
||||||
|
assert "document_id" in result
|
||||||
|
assert "metadata" in result
|
||||||
|
assert "download_url" in result
|
||||||
|
assert result["metadata"]["document_type"] == "pdf"
|
||||||
|
assert result["metadata"]["filename"] == "simple_form.pdf"
|
||||||
|
|
||||||
|
def test_upload_complex_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
|
||||||
|
"""Test uploading a complex PDF with multiple field types."""
|
||||||
|
with open(test_pdf_files["complex_form"], "rb") as f:
|
||||||
|
files = {"file": ("complex_form.pdf", f, "application/pdf")}
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
files=files,
|
||||||
|
data=data,
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert response.status_code == 201
|
||||||
|
result = response.json()
|
||||||
|
assert "document_id" in result
|
||||||
|
assert result["metadata"]["document_type"] == "pdf"
|
||||||
|
|
||||||
|
def test_upload_no_form_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
|
||||||
|
"""Test uploading a PDF without form fields."""
|
||||||
|
with open(test_pdf_files["no_form"], "rb") as f:
|
||||||
|
files = {"file": ("no_form.pdf", f, "application/pdf")}
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
files=files,
|
||||||
|
data=data,
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert response.status_code == 201
|
||||||
|
result = response.json()
|
||||||
|
assert "document_id" in result
|
||||||
|
|
||||||
|
def test_upload_without_auth_returns_401(self, test_client, test_pdf_files):
|
||||||
|
"""Test upload without auth returns 401."""
|
||||||
|
with open(test_pdf_files["simple_form"], "rb") as f:
|
||||||
|
files = {"file": ("simple_form.pdf", f, "application/pdf")}
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
|
||||||
|
response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
files=files,
|
||||||
|
data=data
|
||||||
|
)
|
||||||
|
|
||||||
|
assert response.status_code == 401
|
||||||
|
assert "detail" in response.json()
|
||||||
|
|
||||||
|
def test_upload_with_invalid_auth_returns_401(self, test_client, test_pdf_files):
|
||||||
|
"""Test upload with invalid auth returns 401."""
|
||||||
|
with open(test_pdf_files["simple_form"], "rb") as f:
|
||||||
|
files = {"file": ("simple_form.pdf", f, "application/pdf")}
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
headers = {"Authorization": "Invalid token"}
|
||||||
|
|
||||||
|
response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
files=files,
|
||||||
|
data=data,
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert response.status_code == 401
|
||||||
|
|
||||||
|
def test_upload_missing_file_returns_400(self, test_client, sample_auth_token):
|
||||||
|
"""Test upload without file returns 400."""
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
data=data,
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert response.status_code == 422 # FastAPI validation error
|
||||||
|
|
||||||
|
|
||||||
|
class TestDocumentMetadata:
|
||||||
|
"""Test document metadata retrieval."""
|
||||||
|
|
||||||
|
def test_get_document_metadata_success(self, test_client, sample_auth_token):
|
||||||
|
"""Test getting document metadata successfully."""
|
||||||
|
# This test would require a document to be uploaded first
|
||||||
|
# For now, we'll test the endpoint structure
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
response = test_client.get(
|
||||||
|
"/api/documents/test-doc-456",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
# Will return 404 since document doesn't exist, but endpoint is accessible
|
||||||
|
assert response.status_code in [404, 403]
|
||||||
|
|
||||||
|
def test_get_document_without_auth_returns_401(self, test_client):
|
||||||
|
"""Test getting document without auth returns 401."""
|
||||||
|
response = test_client.get("/api/documents/test-doc-456")
|
||||||
|
|
||||||
|
assert response.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
|
class TestDownloadUrl:
|
||||||
|
"""Test download URL generation."""
|
||||||
|
|
||||||
|
def test_get_download_url_success(self, test_client, sample_auth_token):
|
||||||
|
"""Test getting download URL successfully."""
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
response = test_client.get(
|
||||||
|
"/api/documents/test-doc-456/download-url",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
# Will return 404 since document doesn't exist, but endpoint is accessible
|
||||||
|
assert response.status_code in [404, 403]
|
||||||
|
|
||||||
|
def test_get_download_url_without_auth_returns_401(self, test_client):
|
||||||
|
"""Test getting download URL without auth returns 401."""
|
||||||
|
response = test_client.get("/api/documents/test-doc-456/download-url")
|
||||||
|
|
||||||
|
assert response.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
|
class TestPDFFieldDiscovery:
|
||||||
|
"""Test PDF field discovery functionality."""
|
||||||
|
|
||||||
|
def test_get_pdf_fields_simple_form(self, test_client, test_pdf_files, sample_auth_token):
|
||||||
|
"""Test getting PDF fields from simple form."""
|
||||||
|
# First upload the document
|
||||||
|
with open(test_pdf_files["simple_form"], "rb") as f:
|
||||||
|
files = {"file": ("simple_form.pdf", f, "application/pdf")}
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
upload_response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
files=files,
|
||||||
|
data=data,
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
if upload_response.status_code == 201:
|
||||||
|
document_id = upload_response.json()["document_id"]
|
||||||
|
|
||||||
|
# Get fields
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
response = test_client.get(
|
||||||
|
f"/api/documents/{document_id}/fields",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
result = response.json()
|
||||||
|
assert "fields" in result
|
||||||
|
assert len(result["fields"]) == 6 # name, email, phone, country, birth_date, agree_terms
|
||||||
|
|
||||||
|
# Check field types
|
||||||
|
field_types = {f["field"]: f["type"] for f in result["fields"]}
|
||||||
|
assert field_types["name"] == "string"
|
||||||
|
assert field_types["email"] == "string"
|
||||||
|
assert field_types["phone"] == "string"
|
||||||
|
assert field_types["country"] == "select"
|
||||||
|
assert field_types["birth_date"] == "date"
|
||||||
|
assert field_types["agree_terms"] == "boolean"
|
||||||
|
|
||||||
|
def test_get_pdf_fields_complex_form(self, test_client, test_pdf_files, sample_auth_token):
|
||||||
|
"""Test getting PDF fields from complex form."""
|
||||||
|
# First upload the document
|
||||||
|
with open(test_pdf_files["complex_form"], "rb") as f:
|
||||||
|
files = {"file": ("complex_form.pdf", f, "application/pdf")}
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
upload_response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
files=files,
|
||||||
|
data=data,
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
if upload_response.status_code == 201:
|
||||||
|
document_id = upload_response.json()["document_id"]
|
||||||
|
|
||||||
|
# Get fields
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
response = test_client.get(
|
||||||
|
f"/api/documents/{document_id}/fields",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
result = response.json()
|
||||||
|
assert "fields" in result
|
||||||
|
assert len(result["fields"]) == 16 # All fields from complex form
|
||||||
|
|
||||||
|
def test_get_pdf_fields_no_form_returns_empty_list(self, test_client, test_pdf_files, sample_auth_token):
|
||||||
|
"""Test getting PDF fields from PDF without form fields."""
|
||||||
|
# First upload the document
|
||||||
|
with open(test_pdf_files["no_form"], "rb") as f:
|
||||||
|
files = {"file": ("no_form.pdf", f, "application/pdf")}
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
upload_response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
files=files,
|
||||||
|
data=data,
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
if upload_response.status_code == 201:
|
||||||
|
document_id = upload_response.json()["document_id"]
|
||||||
|
|
||||||
|
# Get fields
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
response = test_client.get(
|
||||||
|
f"/api/documents/{document_id}/fields",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
result = response.json()
|
||||||
|
assert "fields" in result
|
||||||
|
assert len(result["fields"]) == 0
|
||||||
|
|
||||||
|
def test_get_pdf_fields_without_auth_returns_401(self, test_client):
|
||||||
|
"""Test getting PDF fields without auth returns 401."""
|
||||||
|
response = test_client.get("/api/documents/test-doc-456/fields")
|
||||||
|
|
||||||
|
assert response.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
|
class TestDocumentDeletion:
|
||||||
|
"""Test document deletion functionality."""
|
||||||
|
|
||||||
|
def test_delete_document_success(self, test_client, sample_auth_token):
|
||||||
|
"""Test deleting document successfully."""
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
response = test_client.delete(
|
||||||
|
"/api/documents/test-doc-456",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
# Will return 404 since document doesn't exist, but endpoint is accessible
|
||||||
|
assert response.status_code in [404, 403]
|
||||||
|
|
||||||
|
def test_delete_document_without_auth_returns_401(self, test_client):
|
||||||
|
"""Test deleting document without auth returns 401."""
|
||||||
|
response = test_client.delete("/api/documents/test-doc-456")
|
||||||
|
|
||||||
|
assert response.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
|
class TestPDFFieldDiscoveryDirect:
|
||||||
|
"""Test PDF field discovery directly (without API)."""
|
||||||
|
|
||||||
|
def test_simple_form_pdf_fields(self, test_pdf_files):
|
||||||
|
"""Test field discovery on simple form PDF."""
|
||||||
|
fields = discover_fields(test_pdf_files["simple_form"])
|
||||||
|
|
||||||
|
assert len(fields) == 6
|
||||||
|
field_names = [f["field"] for f in fields]
|
||||||
|
assert "name" in field_names
|
||||||
|
assert "email" in field_names
|
||||||
|
assert "phone" in field_names
|
||||||
|
assert "country" in field_names
|
||||||
|
assert "birth_date" in field_names
|
||||||
|
assert "agree_terms" in field_names
|
||||||
|
|
||||||
|
# Check field types
|
||||||
|
field_types = {f["field"]: f["type"] for f in fields}
|
||||||
|
assert field_types["name"] == "string"
|
||||||
|
assert field_types["email"] == "string"
|
||||||
|
assert field_types["phone"] == "string"
|
||||||
|
assert field_types["country"] == "select"
|
||||||
|
assert field_types["birth_date"] == "date"
|
||||||
|
assert field_types["agree_terms"] == "boolean"
|
||||||
|
|
||||||
|
def test_complex_form_pdf_fields(self, test_pdf_files):
|
||||||
|
"""Test field discovery on complex form PDF."""
|
||||||
|
fields = discover_fields(test_pdf_files["complex_form"])
|
||||||
|
|
||||||
|
assert len(fields) == 16
|
||||||
|
field_names = [f["field"] for f in fields]
|
||||||
|
|
||||||
|
# Check for expected fields
|
||||||
|
assert "first_name" in field_names
|
||||||
|
assert "last_name" in field_names
|
||||||
|
assert "email" in field_names
|
||||||
|
assert "country" in field_names
|
||||||
|
assert "gender" in field_names
|
||||||
|
assert "agree_terms" in field_names
|
||||||
|
assert "signature" in field_names
|
||||||
|
|
||||||
|
# Check field types
|
||||||
|
field_types = {f["field"]: f["type"] for f in fields}
|
||||||
|
assert field_types["first_name"] == "string"
|
||||||
|
assert field_types["country"] == "select"
|
||||||
|
assert field_types["gender"] == "boolean"
|
||||||
|
assert field_types["agree_terms"] == "boolean"
|
||||||
|
assert field_types["signature"] == "string"
|
||||||
|
|
||||||
|
def test_no_form_pdf_fields(self, test_pdf_files):
|
||||||
|
"""Test field discovery on PDF without form fields."""
|
||||||
|
fields = discover_fields(test_pdf_files["no_form"])
|
||||||
|
|
||||||
|
assert len(fields) == 0
|
||||||
|
|
||||||
|
def test_large_form_pdf_fields(self, test_pdf_files):
|
||||||
|
"""Test field discovery on large PDF without form fields."""
|
||||||
|
fields = discover_fields(test_pdf_files["large_form"])
|
||||||
|
|
||||||
|
assert len(fields) == 0
|
||||||
|
|
||||||
|
def test_pdf_field_labels_generated_correctly(self, test_pdf_files):
|
||||||
|
"""Test that field labels are generated correctly."""
|
||||||
|
fields = discover_fields(test_pdf_files["simple_form"])
|
||||||
|
|
||||||
|
field_labels = {f["field"]: f["label"] for f in fields}
|
||||||
|
assert field_labels["name"] == "Name"
|
||||||
|
assert field_labels["email"] == "Email"
|
||||||
|
assert field_labels["phone"] == "Phone"
|
||||||
|
assert field_labels["country"] == "Country"
|
||||||
|
assert field_labels["birth_date"] == "Birth Date"
|
||||||
|
assert field_labels["agree_terms"] == "Agree Terms"
|
||||||
|
|
||||||
|
def test_pdf_field_options_extracted_correctly(self, test_pdf_files):
|
||||||
|
"""Test that dropdown options are extracted correctly."""
|
||||||
|
fields = discover_fields(test_pdf_files["simple_form"])
|
||||||
|
|
||||||
|
country_field = next(f for f in fields if f["field"] == "country")
|
||||||
|
assert country_field["type"] == "select"
|
||||||
|
assert country_field["options"] is not None
|
||||||
|
assert len(country_field["options"]) == 5
|
||||||
|
assert "USA" in country_field["options"]
|
||||||
|
assert "Canada" in country_field["options"]
|
||||||
|
assert "UK" in country_field["options"]
|
||||||
|
assert "Germany" in country_field["options"]
|
||||||
|
assert "France" in country_field["options"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestCompleteWorkflow:
|
||||||
|
"""Test complete document lifecycle workflows."""
|
||||||
|
|
||||||
|
def test_complete_document_lifecycle(self, test_client, test_pdf_files, sample_auth_token):
|
||||||
|
"""Test complete document lifecycle: upload, get metadata, get fields, delete."""
|
||||||
|
# Upload document
|
||||||
|
with open(test_pdf_files["simple_form"], "rb") as f:
|
||||||
|
files = {"file": ("simple_form.pdf", f, "application/pdf")}
|
||||||
|
data = {"org_id": "test-org-123"}
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
|
||||||
|
upload_response = test_client.post(
|
||||||
|
"/api/documents/upload",
|
||||||
|
files=files,
|
||||||
|
data=data,
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
if upload_response.status_code == 201:
|
||||||
|
document_id = upload_response.json()["document_id"]
|
||||||
|
|
||||||
|
# Get metadata
|
||||||
|
headers = {"Authorization": sample_auth_token}
|
||||||
|
metadata_response = test_client.get(
|
||||||
|
f"/api/documents/{document_id}",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get fields
|
||||||
|
fields_response = test_client.get(
|
||||||
|
f"/api/documents/{document_id}/fields",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get download URL
|
||||||
|
download_response = test_client.get(
|
||||||
|
f"/api/documents/{document_id}/download-url",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
# Delete document
|
||||||
|
delete_response = test_client.delete(
|
||||||
|
f"/api/documents/{document_id}",
|
||||||
|
params={"org_id": "test-org-123"},
|
||||||
|
headers=headers
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify all operations succeeded
|
||||||
|
assert upload_response.status_code == 201
|
||||||
|
assert metadata_response.status_code in [200, 404] # May be 404 if S3 not available
|
||||||
|
assert fields_response.status_code in [200, 404]
|
||||||
|
assert download_response.status_code in [200, 404]
|
||||||
|
assert delete_response.status_code in [200, 404]
|
||||||
Reference in New Issue
Block a user