Initial commit of document-service
This commit is contained in:
464
tests/test_documents.py
Normal file
464
tests/test_documents.py
Normal file
@@ -0,0 +1,464 @@
|
||||
"""
|
||||
Comprehensive test suite for document-service.
|
||||
|
||||
Tests document upload, retrieval, field discovery, and complete workflows.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import os
|
||||
from fastapi.testclient import TestClient
|
||||
from unittest.mock import Mock, patch
|
||||
from moto import mock_aws
|
||||
import boto3
|
||||
|
||||
from app.main import app
|
||||
from app.pdf import discover_fields
|
||||
|
||||
# Test data paths
|
||||
# Path of the "fixtures" directory located alongside this test module,
# derived from this file's own location.
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
|
||||
|
||||
|
||||
class TestHealthEndpoint:
    """Checks for the ``/health`` endpoint."""

    def test_health_endpoint(self, test_client):
        """``GET /health`` must answer 200 with the body ``{"status": "ok"}``."""
        health_reply = test_client.get("/health")

        assert health_reply.status_code == 200
        assert health_reply.json() == {"status": "ok"}
|
||||
|
||||
|
||||
class TestDocumentUpload:
    """Tests for ``POST /api/documents/upload``."""

    @staticmethod
    def _post_pdf(test_client, pdf_path, filename, headers=None):
        """Upload the PDF at ``pdf_path`` and return the HTTP response.

        Args:
            test_client: Client object used to issue the request.
            pdf_path: Filesystem path of the PDF to send.
            filename: Name reported for the uploaded file in the multipart body.
            headers: Optional request headers. When ``None`` the ``headers``
                argument is left out of the call entirely, so the
                unauthenticated test sends the same request it always did.

        Returns:
            The response returned by ``test_client.post``.
        """
        # The file is kept open while the request is made and closed afterwards.
        with open(pdf_path, "rb") as pdf:
            request_kwargs = {
                "files": {"file": (filename, pdf, "application/pdf")},
                "data": {"org_id": "test-org-123"},
            }
            if headers is not None:
                request_kwargs["headers"] = headers
            return test_client.post("/api/documents/upload", **request_kwargs)

    def test_upload_simple_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a simple PDF with form fields."""
        response = self._post_pdf(
            test_client,
            test_pdf_files["simple_form"],
            "simple_form.pdf",
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert "metadata" in result
        assert "download_url" in result
        assert result["metadata"]["document_type"] == "pdf"
        assert result["metadata"]["filename"] == "simple_form.pdf"

    def test_upload_complex_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a complex PDF with multiple field types."""
        response = self._post_pdf(
            test_client,
            test_pdf_files["complex_form"],
            "complex_form.pdf",
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result
        assert result["metadata"]["document_type"] == "pdf"

    def test_upload_no_form_pdf_success(self, test_client, test_pdf_files, sample_auth_token):
        """Test uploading a PDF without form fields."""
        response = self._post_pdf(
            test_client,
            test_pdf_files["no_form"],
            "no_form.pdf",
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code == 201
        result = response.json()
        assert "document_id" in result

    def test_upload_without_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload without an Authorization header returns 401."""
        response = self._post_pdf(
            test_client,
            test_pdf_files["simple_form"],
            "simple_form.pdf",
        )

        assert response.status_code == 401
        assert "detail" in response.json()

    def test_upload_with_invalid_auth_returns_401(self, test_client, test_pdf_files):
        """Test upload with an invalid Authorization header returns 401."""
        response = self._post_pdf(
            test_client,
            test_pdf_files["simple_form"],
            "simple_form.pdf",
            headers={"Authorization": "Invalid token"},
        )

        assert response.status_code == 401

    def test_upload_missing_file_returns_400(self, test_client, sample_auth_token):
        """Test upload without a file is rejected with 422.

        The method name says 400 (kept so existing test IDs do not change),
        but the status actually asserted is 422, the FastAPI validation error.
        """
        response = test_client.post(
            "/api/documents/upload",
            data={"org_id": "test-org-123"},
            headers={"Authorization": sample_auth_token},
        )

        assert response.status_code == 422  # FastAPI validation error
|
||||
|
||||
|
||||
class TestDocumentMetadata:
    """Checks for ``GET /api/documents/{document_id}``."""

    def test_get_document_metadata_success(self, test_client, sample_auth_token):
        """Authenticated metadata lookup for a document that was never uploaded.

        No document is created beforehand, so this only exercises the
        endpoint structure: the accepted outcomes are 404 or 403 rather
        than a successful payload.
        """
        auth_headers = {"Authorization": sample_auth_token}

        reply = test_client.get(
            "/api/documents/test-doc-456",
            params={"org_id": "test-org-123"},
            headers=auth_headers,
        )

        # The document does not exist, so a 404 (or 403) is the expected answer.
        assert reply.status_code in (404, 403)

    def test_get_document_without_auth_returns_401(self, test_client):
        """Metadata lookup without an Authorization header must return 401."""
        reply = test_client.get("/api/documents/test-doc-456")

        assert reply.status_code == 401
|
||||
|
||||
|
||||
class TestDownloadUrl:
    """Checks for ``GET /api/documents/{document_id}/download-url``."""

    def test_get_download_url_success(self, test_client, sample_auth_token):
        """Authenticated download-URL request for a document that does not exist.

        Nothing is uploaded first, so the accepted outcomes are 404 or 403.
        """
        auth_headers = {"Authorization": sample_auth_token}

        reply = test_client.get(
            "/api/documents/test-doc-456/download-url",
            params={"org_id": "test-org-123"},
            headers=auth_headers,
        )

        # The document does not exist, so a 404 (or 403) is the expected answer.
        assert reply.status_code in (404, 403)

    def test_get_download_url_without_auth_returns_401(self, test_client):
        """Download-URL request without an Authorization header must return 401."""
        reply = test_client.get("/api/documents/test-doc-456/download-url")

        assert reply.status_code == 401
|
||||
|
||||
|
||||
class TestPDFFieldDiscovery:
    """Tests for ``GET /api/documents/{document_id}/fields``."""

    @staticmethod
    def _upload_and_fetch_fields(test_client, pdf_path, filename, auth_token):
        """Upload a PDF, then request its discovered fields.

        The upload is asserted to return 201. Previously the callers wrapped
        every check in ``if upload_response.status_code == 201:``, which let
        the tests pass without asserting anything when the upload failed.

        Args:
            test_client: Client object used to issue the requests.
            pdf_path: Filesystem path of the PDF to upload.
            filename: Name reported for the uploaded file.
            auth_token: Value sent in the ``Authorization`` header.

        Returns:
            The response of the ``/fields`` request for the new document.

        Raises:
            AssertionError: If the upload does not return status 201.
        """
        headers = {"Authorization": auth_token}

        # The file is kept open while the upload request is made.
        with open(pdf_path, "rb") as pdf:
            upload_response = test_client.post(
                "/api/documents/upload",
                files={"file": (filename, pdf, "application/pdf")},
                data={"org_id": "test-org-123"},
                headers=headers,
            )

        assert upload_response.status_code == 201, (
            f"upload of {filename} failed with status {upload_response.status_code}"
        )
        document_id = upload_response.json()["document_id"]

        return test_client.get(
            f"/api/documents/{document_id}/fields",
            params={"org_id": "test-org-123"},
            headers=headers,
        )

    def test_get_pdf_fields_simple_form(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from simple form."""
        response = self._upload_and_fetch_fields(
            test_client,
            test_pdf_files["simple_form"],
            "simple_form.pdf",
            sample_auth_token,
        )

        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        # name, email, phone, country, birth_date, agree_terms
        assert len(result["fields"]) == 6

        # Check field types
        field_types = {f["field"]: f["type"] for f in result["fields"]}
        assert field_types["name"] == "string"
        assert field_types["email"] == "string"
        assert field_types["phone"] == "string"
        assert field_types["country"] == "select"
        assert field_types["birth_date"] == "date"
        assert field_types["agree_terms"] == "boolean"

    def test_get_pdf_fields_complex_form(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from complex form."""
        response = self._upload_and_fetch_fields(
            test_client,
            test_pdf_files["complex_form"],
            "complex_form.pdf",
            sample_auth_token,
        )

        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        assert len(result["fields"]) == 16  # All fields from complex form

    def test_get_pdf_fields_no_form_returns_empty_list(self, test_client, test_pdf_files, sample_auth_token):
        """Test getting PDF fields from PDF without form fields."""
        response = self._upload_and_fetch_fields(
            test_client,
            test_pdf_files["no_form"],
            "no_form.pdf",
            sample_auth_token,
        )

        assert response.status_code == 200
        result = response.json()
        assert "fields" in result
        assert len(result["fields"]) == 0

    def test_get_pdf_fields_without_auth_returns_401(self, test_client):
        """Test getting PDF fields without auth returns 401."""
        response = test_client.get("/api/documents/test-doc-456/fields")

        assert response.status_code == 401
|
||||
|
||||
|
||||
class TestDocumentDeletion:
    """Checks for ``DELETE /api/documents/{document_id}``."""

    def test_delete_document_success(self, test_client, sample_auth_token):
        """Authenticated delete of a document that does not exist.

        Nothing is uploaded first, so the accepted outcomes are 404 or 403.
        """
        auth_headers = {"Authorization": sample_auth_token}

        reply = test_client.delete(
            "/api/documents/test-doc-456",
            params={"org_id": "test-org-123"},
            headers=auth_headers,
        )

        # The document does not exist, so a 404 (or 403) is the expected answer.
        assert reply.status_code in (404, 403)

    def test_delete_document_without_auth_returns_401(self, test_client):
        """Delete without an Authorization header must return 401."""
        reply = test_client.delete("/api/documents/test-doc-456")

        assert reply.status_code == 401
|
||||
|
||||
|
||||
class TestPDFFieldDiscoveryDirect:
    """Exercises ``discover_fields`` directly, without going through the API."""

    def test_simple_form_pdf_fields(self, test_pdf_files):
        """The simple form yields six fields with the expected names and types."""
        discovered = discover_fields(test_pdf_files["simple_form"])

        assert len(discovered) == 6
        names = [entry["field"] for entry in discovered]
        for expected_name in ("name", "email", "phone", "country", "birth_date", "agree_terms"):
            assert expected_name in names

        # Map each field name to its reported type and compare one by one.
        type_by_name = {entry["field"]: entry["type"] for entry in discovered}
        expected_types = {
            "name": "string",
            "email": "string",
            "phone": "string",
            "country": "select",
            "birth_date": "date",
            "agree_terms": "boolean",
        }
        for field_name, expected_type in expected_types.items():
            assert type_by_name[field_name] == expected_type

    def test_complex_form_pdf_fields(self, test_pdf_files):
        """The complex form yields sixteen fields, including the ones checked here."""
        discovered = discover_fields(test_pdf_files["complex_form"])

        assert len(discovered) == 16
        names = [entry["field"] for entry in discovered]

        # A subset of the field names is checked explicitly.
        for expected_name in (
            "first_name",
            "last_name",
            "email",
            "country",
            "gender",
            "agree_terms",
            "signature",
        ):
            assert expected_name in names

        # Type checks for a subset of the fields.
        type_by_name = {entry["field"]: entry["type"] for entry in discovered}
        expected_types = {
            "first_name": "string",
            "country": "select",
            "gender": "boolean",
            "agree_terms": "boolean",
            "signature": "string",
        }
        for field_name, expected_type in expected_types.items():
            assert type_by_name[field_name] == expected_type

    def test_no_form_pdf_fields(self, test_pdf_files):
        """A PDF without form fields yields no fields."""
        discovered = discover_fields(test_pdf_files["no_form"])

        assert len(discovered) == 0

    def test_large_form_pdf_fields(self, test_pdf_files):
        """The ``large_form`` fixture PDF yields no fields."""
        discovered = discover_fields(test_pdf_files["large_form"])

        assert len(discovered) == 0

    def test_pdf_field_labels_generated_correctly(self, test_pdf_files):
        """Each simple-form field carries the expected label."""
        discovered = discover_fields(test_pdf_files["simple_form"])

        label_by_name = {entry["field"]: entry["label"] for entry in discovered}
        expected_labels = {
            "name": "Name",
            "email": "Email",
            "phone": "Phone",
            "country": "Country",
            "birth_date": "Birth Date",
            "agree_terms": "Agree Terms",
        }
        for field_name, expected_label in expected_labels.items():
            assert label_by_name[field_name] == expected_label

    def test_pdf_field_options_extracted_correctly(self, test_pdf_files):
        """The ``country`` select field exposes its five options."""
        discovered = discover_fields(test_pdf_files["simple_form"])

        country = next(entry for entry in discovered if entry["field"] == "country")
        assert country["type"] == "select"
        assert country["options"] is not None
        assert len(country["options"]) == 5
        for expected_option in ("USA", "Canada", "UK", "Germany", "France"):
            assert expected_option in country["options"]
|
||||
|
||||
|
||||
class TestCompleteWorkflow:
    """Test complete document lifecycle workflows."""

    def test_complete_document_lifecycle(self, test_client, test_pdf_files, sample_auth_token):
        """Test complete document lifecycle: upload, get metadata, get fields, delete.

        The upload is asserted to return 201 before anything else runs.
        Previously every later step, including the upload assertion itself,
        sat behind ``if upload_response.status_code == 201:``, so a failed
        upload made the test pass without checking anything.
        """
        headers = {"Authorization": sample_auth_token}
        org_params = {"org_id": "test-org-123"}

        # Upload document; the file is kept open while the request is made.
        with open(test_pdf_files["simple_form"], "rb") as f:
            upload_response = test_client.post(
                "/api/documents/upload",
                files={"file": ("simple_form.pdf", f, "application/pdf")},
                data={"org_id": "test-org-123"},
                headers=headers,
            )

        assert upload_response.status_code == 201
        document_id = upload_response.json()["document_id"]

        # Get metadata
        metadata_response = test_client.get(
            f"/api/documents/{document_id}",
            params=org_params,
            headers=headers,
        )

        # Get fields
        fields_response = test_client.get(
            f"/api/documents/{document_id}/fields",
            params=org_params,
            headers=headers,
        )

        # Get download URL
        download_response = test_client.get(
            f"/api/documents/{document_id}/download-url",
            params=org_params,
            headers=headers,
        )

        # Delete document
        delete_response = test_client.delete(
            f"/api/documents/{document_id}",
            params=org_params,
            headers=headers,
        )

        # Follow-up operations tolerate 404 (may be 404 if S3 not available).
        assert metadata_response.status_code in [200, 404]
        assert fields_response.status_code in [200, 404]
        assert download_response.status_code in [200, 404]
        assert delete_response.status_code in [200, 404]
|
||||
Reference in New Issue
Block a user