Pydantic Is All You Need - Jason Liu#
from datetime import datetime
from typing import Any, Dict, List
from pydantic import BaseModel, Field, ValidationError, ValidationInfo, field_validator
from rich.pretty import pprint
from typing_extensions import Self
# Example model: every annotated attribute below becomes a validated field.
# (Kept as `#` comments on purpose: no docstring is added to the model itself.)
class User(BaseModel):
    # Required integer; description and examples are carried into the JSON schema.
    id: int = Field(..., description="The user id", examples=[1, 2, 3])
    # Required string limited to between 2 and 50 characters.
    name: str = Field(..., min_length=2, max_length=50)
    # Required string; typed as plain `str`, so no email-format check is declared.
    email: str = Field(..., description="Email of the user")
    # Required datetime; no default is given.
    birth_date: datetime
    # Not required: defaults to True when omitted.
    is_active: bool = True


# Print the generated JSON schema, then the per-field metadata of the model.
pprint(User.model_json_schema())
pprint(User.model_fields)
{ │ 'properties': { │ │ 'id': {'description': 'The user id', 'examples': [1, 2, 3], 'title': 'Id', 'type': 'integer'}, │ │ 'name': {'maxLength': 50, 'minLength': 2, 'title': 'Name', 'type': 'string'}, │ │ 'email': {'description': 'Email of the user', 'title': 'Email', 'type': 'string'}, │ │ 'birth_date': {'format': 'date-time', 'title': 'Birth Date', 'type': 'string'}, │ │ 'is_active': {'default': True, 'title': 'Is Active', 'type': 'boolean'} │ }, │ 'required': ['id', 'name', 'email', 'birth_date'], │ 'title': 'User', │ 'type': 'object' }
{ │ 'id': FieldInfo(annotation=int, required=True, description='The user id', examples=[1, 2, 3]), │ 'name': FieldInfo(annotation=str, required=True, metadata=[MinLen(min_length=2), MaxLen(max_length=50)]), │ 'email': FieldInfo(annotation=str, required=True, description='Email of the user'), │ 'birth_date': FieldInfo(annotation=datetime, required=True), │ 'is_active': FieldInfo(annotation=bool, required=False, default=True) }
# Container model showing nested generics and a list of `User` models.
class Users(BaseModel):
    # Required mapping from string keys to lists of integers.
    random_attribute: Dict[str, List[int]] = Field(..., description="A random attribute.")
    # Required list whose items are `User` models.
    users: list[User]


# Print the JSON schema and the field metadata of the container model.
pprint(Users.model_json_schema())
pprint(Users.model_fields)
{ │ '$defs': { │ │ 'User': { │ │ │ 'properties': { │ │ │ │ 'id': {'description': 'The user id', 'examples': [1, 2, 3], 'title': 'Id', 'type': 'integer'}, │ │ │ │ 'name': {'maxLength': 50, 'minLength': 2, 'title': 'Name', 'type': 'string'}, │ │ │ │ 'email': {'description': 'Email of the user', 'title': 'Email', 'type': 'string'}, │ │ │ │ 'birth_date': {'format': 'date-time', 'title': 'Birth Date', 'type': 'string'}, │ │ │ │ 'is_active': {'default': True, 'title': 'Is Active', 'type': 'boolean'} │ │ │ }, │ │ │ 'required': ['id', 'name', 'email', 'birth_date'], │ │ │ 'title': 'User', │ │ │ 'type': 'object' │ │ } │ }, │ 'properties': { │ │ 'random_attribute': { │ │ │ 'additionalProperties': {'items': {'type': 'integer'}, 'type': 'array'}, │ │ │ 'description': 'A random attribute.', │ │ │ 'title': 'Random Attribute', │ │ │ 'type': 'object' │ │ }, │ │ 'users': {'items': {'$ref': '#/$defs/User'}, 'title': 'Users', 'type': 'array'} │ }, │ 'required': ['random_attribute', 'users'], │ 'title': 'Users', │ 'type': 'object' }
{ │ 'random_attribute': FieldInfo( │ │ annotation=Dict[str, List[int]], │ │ required=True, │ │ description='A random attribute.' │ ), │ 'users': FieldInfo(annotation=list[User], required=True) }
Below is a showcase of how pydantic coerces, parses and validates user inputs.
# Build the same user twice -- once from loosely typed input and once from
# already-correct types -- then check that both produce an equal model.
try:
    user = User(
        id="123",  # String input but coerced to int
        name="Alice",  # String input with correct length
        email="alice@example.com",  # String input
        birth_date="1990-01-01T00:00:00",  # String input but parsed to datetime
        is_active="yes"  # String input but coerced to bool
    )
    pprint(user)
    user_all_input_types_correct = User(
        id=123,
        name="Alice",
        email="alice@example.com",
        birth_date=datetime(1990, 1, 1),
        is_active=True
    )
    pprint(user_all_input_types_correct)
    # Coerced and natively typed inputs must compare equal after validation.
    assert user == user_all_input_types_correct
except ValidationError as exc:
    # Reached only if one of the two constructions above fails validation.
    print("Validation error:\n")
    pprint(exc)
User( │ id=123, │ name='Alice', │ email='alice@example.com', │ birth_date=datetime.datetime(1990, 1, 1, 0, 0), │ is_active=True )
User( │ id=123, │ name='Alice', │ email='alice@example.com', │ birth_date=datetime.datetime(1990, 1, 1, 0, 0), │ is_active=True )
Below is a failing case where parsing and validation fail, which shows that actual type checking and data validation are taking place.
# Feed deliberately bad values so that validation fails and the collected
# errors are printed by the `except` branch.
try:
    user = User(
        id="abc",  # Can't be parsed to int
        name=[1, 2, 3],  # A list, not a string
        email="not_an_email",  # Still a string, and `email` is declared as plain `str`
        birth_date="invalid_date",  # Not a parseable datetime
        is_active=None  # None is not a boolean
    )
    pprint(user)
except ValidationError as exc:
    print("Validation error:\n")
    pprint(exc)
Validation error:
4 validation errors for User id Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='abc', input_type=str] │ For further information visit https://errors.pydantic.dev/2.5/v/int_parsing name Input should be a valid string [type=string_type, input_value=[1, 2, 3], input_type=list] │ For further information visit https://errors.pydantic.dev/2.5/v/string_type birth_date Input should be a valid datetime, invalid character in year [type=datetime_parsing, input_value='invalid_date', input_type=str] │ For further information visit https://errors.pydantic.dev/2.5/v/datetime_parsing is_active Input should be a valid boolean [type=bool_type, input_value=None, input_type=NoneType] │ For further information visit https://errors.pydantic.dev/2.5/v/bool_type
Field Validators#
Before#
# Baseline model without custom validators; the sections that follow add one
# validator mode at a time.
class ComplexUser(BaseModel):
    id: int  # the example below passes a prefixed string such as "ID-12345"
    name: str
    code: str
    status: str
Suppose that every `id` in your company starts with the prefix `ID-`, followed
by a unique integer. The internal parser cannot coerce the string `ID-12345`
into an integer. Since the integer following `ID-` is unique, we can add a
`field_validator` that extracts the integer part and validates it. We want a
`before` field validator because our custom validation/parsing/coercion logic
must run before the default pydantic parsing logic.
# Without a custom validator the prefixed id cannot be parsed as an integer,
# so construction fails and the validation error is printed.
try:
    model = ComplexUser(id="ID-12345", name="Prefixed ID", code="CODE_456", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
1 validation error for ComplexUser id Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='ID-12345', input_type=str] │ For further information visit https://errors.pydantic.dev/2.5/v/int_parsing
To add the before
validator, we can use the field_validator
decorator.
# ComplexUser with a `before` validator on `id`.
class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    # mode='before': runs on the raw input, ahead of pydantic's own parsing of `id`.
    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls: Self, v: Any) -> int:
        """Turn a string of the form ``ID-<digits>`` into an ``int``.

        Input that is not a string starting with ``ID-`` is returned unchanged.
        ``int()`` raises ``ValueError`` when the text after the prefix is not
        an integer literal.
        """
        # NOTE(review): `cls` receives the class, so `Self` is not an accurate
        # annotation, and `-> int` does not cover the pass-through branch.
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])  # drop the 3-character "ID-" prefix
        return v


# The prefixed id is now accepted because the validator strips the prefix first.
try:
    model = ComplexUser(id="ID-12345", name="John Doe", code="CODE_456", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
ComplexUser(id=12345, name='John Doe', code='CODE_456', status='inactive')
So we see that when the default pydantic parsing may fail, we can add
before
field validators to handle the parsing and validation of the raw
input data first, before the default pydantic parsing logic takes over.
After#
In a similar vein, we can also add after
field validators to handle the
parsed data after the default pydantic parsing logic has taken place.
The after
field validator is useful for post-processing or additional
validation on parsed data. Due to the nature of the after
field validator,
the parsed data is guaranteed to be of the correct type and is up to you
to post-process it.
Consider the case where you want to capitalize the name
field after it
has been parsed. We will use .title()
because we want to capitalize the
first letter of each word in the string and not just the first letter of
the entire string.
# ComplexUser with a `before` validator on `id` and an `after` validator on `name`.
class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    # mode='before': runs on the raw input, ahead of pydantic's own parsing of `id`.
    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls: Self, v: Any) -> int:
        """Turn a string of the form ``ID-<digits>`` into an ``int``.

        Input that is not a string starting with ``ID-`` is returned unchanged.
        """
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])  # drop the 3-character "ID-" prefix
        return v

    # mode="after": runs once pydantic has already parsed `name`.
    @field_validator('name', mode="after")
    @classmethod
    def capitalize_name(cls: Self, v: str) -> str:
        """Return the parsed name with the first letter of every word capitalized."""
        print(f"Capitalizing name: {v}")
        return v.title()  # .title() rather than .capitalize(): every word, not just the first


# A lowercase name goes in; the printed model shows it title-cased.
try:
    model = ComplexUser(id="ID-12345", name="john doe", code="CODE_456", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
Capitalizing name: john doe
ComplexUser(id=12345, name='John Doe', code='CODE_456', status='inactive')
We see that when the user inputs an all-lowercase string such as `john doe`, the
`after` field validator capitalizes the first letter of each word in the string.
However, since it happens after the validation internally, we can actually do
naughty things like changing the value of the field to something else. For example,
no one is stopping me from just returning a list of integers in the after
field
capitalize_name
validator.
# ComplexUser whose `after` validator deliberately returns a value of the wrong type.
class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    # mode='before': runs on the raw input, ahead of pydantic's own parsing of `id`.
    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls: Self, v: Any) -> int:
        """Turn a string of the form ``ID-<digits>`` into an ``int``.

        Input that is not a string starting with ``ID-`` is returned unchanged.
        """
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])  # drop the 3-character "ID-" prefix
        return v

    # mode="after": runs once pydantic has already parsed `name`.
    @field_validator('name', mode="after")
    @classmethod
    def capitalize_name(cls: Self, v: str) -> str:
        """Deliberately misbehaving validator: ignores ``v`` and returns a list."""
        print(f"Capitalizing name: {v}")
        # Intentional for the demonstration: this contradicts the declared
        # `-> str`, yet the printed model ends up with name=[1, 2, 3].
        return [1,2,3]


# Construction succeeds even though `name` ends up holding a list.
try:
    model = ComplexUser(id="ID-12345", name="john doe", code="CODE_456", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
Capitalizing name: john doe
ComplexUser(id=12345, name=[1, 2, 3], code='CODE_456', status='inactive')
And the code still runs without any errors. So be careful when using `after` field
validators, as they can change the value of the field to something of a different type.
Plain#
A `plain` validator completely replaces Pydantic's internal validation and is responsible for all type checking and validation. No other validators are called after it, which is useful when you need full control over the validation logic.
# ComplexUser with an additional `plain` validator on `code`.
class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    # mode='before': runs on the raw input, ahead of pydantic's own parsing of `id`.
    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls: Self, v: Any) -> int:
        """Turn a string of the form ``ID-<digits>`` into an ``int``.

        Input that is not a string starting with ``ID-`` is returned unchanged.
        """
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])  # drop the 3-character "ID-" prefix
        return v

    # mode="after": runs once pydantic has already parsed `name`.
    @field_validator('name', mode="after")
    @classmethod
    def capitalize_name(cls: Self, v: str) -> str:
        """Deliberately misbehaving validator: ignores ``v`` and returns a list."""
        print(f"Capitalizing name: {v}")
        # Intentional for the demonstration; contradicts the declared `-> str`.
        return [1,2,3]

    # mode='plain': this function is the only validation applied to `code`,
    # so it performs the type check itself.
    @field_validator('code', mode='plain')
    @classmethod
    def validate_code(cls: Self, v: Any) -> str:
        """Accept only strings that start with ``CODE_``.

        Raises:
            ValueError: If ``v`` is not a string or lacks the ``CODE_`` prefix.
        """
        if not isinstance(v, str) or not v.startswith('CODE_'):
            raise ValueError("Code must be a string starting with 'CODE_'")
        return v


# First attempt: "AAA" lacks the required prefix, so the error is printed.
try:
    model = ComplexUser(id="ID-12345", name="john doe", code="AAA", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
# Second attempt: "CODE_AAA" satisfies the plain validator and the model is printed.
try:
    model = ComplexUser(id="ID-12345", name="john doe", code="CODE_AAA", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
Capitalizing name: john doe
1 validation error for ComplexUser code Value error, Code must be a string starting with 'CODE_' [type=value_error, input_value='AAA', input_type=str] │ For further information visit https://errors.pydantic.dev/2.5/v/value_error
Preprocessing ID: ID-12345
Capitalizing name: john doe
ComplexUser(id=12345, name=[1, 2, 3], code='CODE_AAA', status='inactive')
Wrap#
See discussion here
to get a glimpse of how to use wrap
validator.
Can run code before and after Pydantic’s internal validation
Receives a handler function to call the inner validator
Can modify input before validation and output after validation
Can catch and handle validation errors from inner validators
# ComplexUser with all four validator modes: before, after, plain and wrap.
class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    # mode='before': runs on the raw input, ahead of pydantic's own parsing of `id`.
    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls: Self, v: Any) -> int:
        """Turn a string of the form ``ID-<digits>`` into an ``int``.

        Input that is not a string starting with ``ID-`` is returned unchanged.
        """
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])  # drop the 3-character "ID-" prefix
        return v

    # mode="after": runs once pydantic has already parsed `name`.
    @field_validator('name', mode="after")
    @classmethod
    def capitalize_name(cls: Self, v: str) -> str:
        """Deliberately misbehaving validator: ignores ``v`` and returns a list."""
        print(f"Capitalizing name: {v}")
        # Intentional for the demonstration; contradicts the declared `-> str`.
        return [1,2,3]

    # mode='plain': this function is the only validation applied to `code`.
    @field_validator('code', mode='plain')
    @classmethod
    def validate_code(cls: Self, v: Any) -> str:
        """Accept only strings that start with ``CODE_``.

        Raises:
            ValueError: If ``v`` is not a string or lacks the ``CODE_`` prefix.
        """
        if not isinstance(v, str) or not v.startswith('CODE_'):
            raise ValueError("Code must be a string starting with 'CODE_'")
        return v

    # mode='wrap': receives `handler`, which invokes the inner validation, so
    # code can run both before and after it.
    @field_validator('status', mode='wrap')
    @classmethod
    def validate_status(cls, value: Any, handler: Any, info: ValidationInfo) -> str:
        """Upper-case string input, run the inner validator, then restrict the result.

        Args:
            value: Raw input for ``status``.
            handler: Callable that runs the inner validation on a value.
            info: Validation context; not used in this body.

        Returns:
            The validated status, either ``'ACTIVE'`` or ``'INACTIVE'``.

        Raises:
            ValueError: If the inner validation raises ``ValueError`` or the
                validated value is not one of the two allowed statuses.
        """
        # pre-processing: normalize case so "inactive" and "INACTIVE" are equivalent
        if isinstance(value, str):
            value = value.upper()
        # inner validator
        try:
            validated = handler(value)
            pprint(validated)  # shows the value as returned by the inner validation
        except ValueError as exc:
            raise ValueError(f"Invalid status: {exc}") from exc
        # post-processing: only two statuses are accepted
        if validated not in ['ACTIVE', 'INACTIVE']:
            raise ValueError("Status must be either 'ACTIVE' or 'INACTIVE'")
        return validated


# "inactive" is upper-cased by the wrap validator before the inner validation runs.
try:
    model = ComplexUser(id="ID-12345", name="john doe", code="CODE_AAA", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
Capitalizing name: john doe
'INACTIVE'
ComplexUser(id=12345, name=[1, 2, 3], code='CODE_AAA', status='INACTIVE')
Dynamic Model Creation#
Pydantic allows you to dynamically create models at runtime using the
create_model
function. This is useful when you want to create a model based on
some configuration or input.
import functools
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Type, TypeVar, Union, get_args, get_origin
from pydantic import BaseModel, Field, create_model
from rich.pretty import pprint
from typing_extensions import ParamSpec
# https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators
P = ParamSpec("P")
T = TypeVar("T")
def trace(func: Callable[P, T]) -> Callable[P, T]:
"""Decorator to log function calls."""
@functools.wraps(func) # This copies the metadata of `func` to `wrapper`
def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
"""Me is wrapper."""
result = func(*args, **kwargs)
print(f"{func.__name__}({args!r}, {kwargs!r}) " f"-> {result!r}")
return result
return wrapper
# Alias for "a type or None"; it is not referenced by the code shown in this file.
Annotation = Union[Type[Any], None]


# @trace
def generate_dynamic_model(schema: Dict[str, Dict[str, Any]], model_name: str = "DynamicModel") -> Type[BaseModel]:
    """Build a pydantic model class at runtime from a dict-based schema.

    Args:
        schema: Maps each field name to a spec dict. ``"type"`` is mandatory.
            ``"optional"``, ``"default"``, ``"properties"``, ``"items"`` and
            ``"additionalProperties"`` steer how the field is built. Every
            other key (for example ``"description"``) is forwarded to
            ``Field`` as keyword metadata.
        model_name: Name given to the generated model class.

    Returns:
        The model class produced by ``create_model``.

    Raises:
        KeyError: If a field spec has no ``"type"`` entry.
    """
    # Structural keys are consumed here or by `resolve_field_type`, so they
    # must not reach `Field`. "additionalProperties" is read by the dict
    # branch of `resolve_field_type`; without excluding it, a nested schema
    # holding Python types would be passed to `Field` as an unknown keyword.
    structural_keys = {"type", "default", "optional", "properties", "items", "additionalProperties"}

    fields: Dict[str, Any] = {}
    for field_name, field_info in schema.items():
        field_type = resolve_field_type(field_info["type"], field_info, field_name)
        if field_info.get("optional", False):
            field_type = Optional[field_type]
        # `...` (Ellipsis) is the default used when the spec provides none.
        default_value = field_info.get("default", ...)
        field_metadata = {k: v for k, v in field_info.items() if k not in structural_keys}
        fields[field_name] = (
            field_type,
            Field(default=default_value, **field_metadata),
        )
    return create_model(model_name, **fields)
def resolve_field_type(field_type: Type[Any], field_info: Dict[str, Any], field_name: str) -> Union[Type[Any], Any]:
    """Translate a schema ``"type"`` entry into an annotation for ``create_model``.

    Args:
        field_type: The declared type; may be a plain class or a ``typing``
            generic such as ``List[...]``, ``Dict[...]`` or ``Union[...]``.
        field_info: The spec dict belonging to ``field_type``. It supplies
            ``"properties"`` for nested models, ``"items"`` for list elements
            and ``"additionalProperties"`` for dict values.
        field_name: Field name; its title-cased form names any nested model.

    Returns:
        ``field_type`` with every nested ``object``/``dict`` placeholder
        replaced by a generated model. Types that match no branch (for
        example ``Tuple[...]`` or ``Literal[...]``) are returned unchanged.
    """
    # `object` itself, or `dict` / a dict subclass, stands for a nested model
    # whose fields are described under "properties".
    if field_type is object or (isinstance(field_type, type) and issubclass(field_type, dict)):
        return generate_dynamic_model(
            schema=field_info.get("properties", {}),
            model_name=f"{field_name.title()}",
        )

    container = get_origin(field_type)
    params = get_args(field_type)

    if container is list:
        if not params:
            return List[Any]
        element = resolve_field_type(params[0], field_info.get("items", {}), f"{field_name}")
        return List[element]  # type: ignore[valid-type]

    if container is dict:
        key = params[0] if params else Any
        value = resolve_field_type(
            params[1] if len(params) > 1 else Any,
            field_info.get("additionalProperties", {}),
            f"{field_name}",
        )
        return Dict[key, value]  # type: ignore[valid-type]

    if container is Union:
        # NOTE(review): members are resolved against an empty spec, so an
        # `object` member here yields a model without fields -- confirm intent.
        members = tuple(resolve_field_type(member, {}, f"{field_name}") for member in params)
        return Union[members]

    return field_type
Schema 1. Basic Schema#
# One field per kind of annotation the generator is exercised with: plain
# types, generics, tuples, unions, Any, Literal and Optional.
# Every entry supplies "default": None, so none of the generated fields is
# required, including those marked "optional": False.
basic_schema = {
    "a": {"type": int, "optional": True, "default": None},
    "b": {"type": str, "optional": True, "default": None},
    "c": {"type": List[str], "optional": False, "default": None},
    "d": {"type": Dict[str, int], "optional": False, "default": None},
    "e": {"type": Tuple[int, str, List[str]], "optional": False, "default": None},
    "f": {"type": Union[int, str], "optional": False, "default": None},
    "g": {"type": Union[int, None], "optional": False, "default": None},
    "h": {"type": Any, "optional": False, "default": None},
    "i": {"type": Literal["a", "b", 3], "optional": False, "default": None},
    "j": {"type": Optional[int], "optional": True, "default": None},
}
# Generate the model class, then instantiate it with one valid value per field.
DynamicModel = generate_dynamic_model(basic_schema, model_name="DynamicModel")
dynamic_model = DynamicModel(
    a=1,
    b="hello",
    c=["world"],
    d={"key": 1},
    e=(1, "hello", ["world"]),
    f=1,
    g=100,
    h=1.2,
    i="a",
    j=None,
)
# Show the instance, its field metadata, its JSON schema and its dict dump.
pprint(dynamic_model)
pprint(dynamic_model.model_fields)
pprint(dynamic_model.model_json_schema())
pprint(dynamic_model.model_dump())
DynamicModel( │ a=1, │ b='hello', │ c=['world'], │ d={'key': 1}, │ e=(1, 'hello', ['world']), │ f=1, │ g=100, │ h=1.2, │ i='a', │ j=None )
{ │ 'a': FieldInfo(annotation=Union[int, NoneType], required=False), │ 'b': FieldInfo(annotation=Union[str, NoneType], required=False), │ 'c': FieldInfo(annotation=List[str], required=False), │ 'd': FieldInfo(annotation=Dict[str, int], required=False), │ 'e': FieldInfo(annotation=Tuple[int, str, List[str]], required=False), │ 'f': FieldInfo(annotation=Union[int, str], required=False), │ 'g': FieldInfo(annotation=Union[int, NoneType], required=False), │ 'h': FieldInfo(annotation=Any, required=False), │ 'i': FieldInfo(annotation=Literal['a', 'b', 3], required=False), │ 'j': FieldInfo(annotation=Union[int, NoneType], required=False) }
{ │ 'properties': { │ │ 'a': {'anyOf': [{'type': 'integer'}, {'type': 'null'}], 'default': None, 'title': 'A'}, │ │ 'b': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'B'}, │ │ 'c': {'default': None, 'items': {'type': 'string'}, 'title': 'C', 'type': 'array'}, │ │ 'd': {'additionalProperties': {'type': 'integer'}, 'default': None, 'title': 'D', 'type': 'object'}, │ │ 'e': { │ │ │ 'default': None, │ │ │ 'maxItems': 3, │ │ │ 'minItems': 3, │ │ │ 'prefixItems': [ │ │ │ │ {'type': 'integer'}, │ │ │ │ {'type': 'string'}, │ │ │ │ {'items': {'type': 'string'}, 'type': 'array'} │ │ │ ], │ │ │ 'title': 'E', │ │ │ 'type': 'array' │ │ }, │ │ 'f': {'anyOf': [{'type': 'integer'}, {'type': 'string'}], 'default': None, 'title': 'F'}, │ │ 'g': {'anyOf': [{'type': 'integer'}, {'type': 'null'}], 'default': None, 'title': 'G'}, │ │ 'h': {'default': None, 'title': 'H'}, │ │ 'i': {'default': None, 'enum': ['a', 'b', 3], 'title': 'I'}, │ │ 'j': {'anyOf': [{'type': 'integer'}, {'type': 'null'}], 'default': None, 'title': 'J'} │ }, │ 'title': 'DynamicModel', │ 'type': 'object' }
{ │ 'a': 1, │ 'b': 'hello', │ 'c': ['world'], │ 'd': {'key': 1}, │ 'e': (1, 'hello', ['world']), │ 'f': 1, │ 'g': 100, │ 'h': 1.2, │ 'i': 'a', │ 'j': None }
Schema 2. User Schema With Nested Properties#
# Schema with nested objects: a field whose "type" is `object` carries its own
# "properties", and a `List[object]` field describes its elements under "items".
user_schema = {
    "name": {"type": str, "description": "The user's full name"},
    "age": {
        "type": int,
        "optional": False,
        "default": None,
        "description": "The user's age in years",
    },
    "address": {
        "type": object,
        "properties": {
            "street": {"type": str, "description": "Street name and number"},
            "city": {"type": str, "description": "City name"},
            "postal_code": {"type": int, "description": "Postal code or ZIP code"},
        },
        "description": "The user's mailing address",
        # Optional but without a default: the annotation is wrapped in Optional.
        "optional": True,
    },
    "phone_numbers": {"type": List[str], "description": "List of user's phone numbers"},
    "tags": {
        "type": List[object],
        "items": {
            "properties": {
                "name": {"type": str, "description": "Tag name"},
                "value": {"type": int, "description": "Tag value"},
            }
        },
        "description": "List of user tags",
    },
    "pet": {
        "type": object,
        "properties": {
            "name": {"type": str, "description": "Pet name"},
            "age": {"type": int, "description": "Pet age in years"},
        },
        "description": "User's pet",
    },
}
# NOTE: this rebinds the name `User`, replacing the hand-written `User` class
# defined earlier in the file.
User = generate_dynamic_model(user_schema, model_name="User")
# Nested values are given as plain dicts.
user = User(
    name="John Doe",
    age=20,
    address={"street": "123 Main St", "city": "Anytown", "postal_code": 12345},
    phone_numbers=["1234567890", "0987654321"],
    tags=[{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
    pet={"name": "Fluffy", "age": 2},
)
# Show the instance, its dict dump, its field metadata and its JSON schema.
pprint(user)
pprint(user.model_dump())
pprint(user.model_fields)
pprint(user.model_json_schema())
User( │ name='John Doe', │ age=20, │ address=Address(street='123 Main St', city='Anytown', postal_code=12345), │ phone_numbers=['1234567890', '0987654321'], │ tags=[Tags(name='tag1', value=1), Tags(name='tag2', value=2)], │ pet=Pet(name='Fluffy', age=2) )
{ │ 'name': 'John Doe', │ 'age': 20, │ 'address': {'street': '123 Main St', 'city': 'Anytown', 'postal_code': 12345}, │ 'phone_numbers': ['1234567890', '0987654321'], │ 'tags': [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}], │ 'pet': {'name': 'Fluffy', 'age': 2} }
{ │ 'name': FieldInfo(annotation=str, required=True, description="The user's full name"), │ 'age': FieldInfo(annotation=int, required=False, description="The user's age in years"), │ 'address': FieldInfo( │ │ annotation=Union[Address, NoneType], │ │ required=True, │ │ description="The user's mailing address" │ ), │ 'phone_numbers': FieldInfo(annotation=List[str], required=True, description="List of user's phone numbers"), │ 'tags': FieldInfo(annotation=List[__main__.Tags], required=True, description='List of user tags'), │ 'pet': FieldInfo(annotation=Pet, required=True, description="User's pet") }
{ │ '$defs': { │ │ 'Address': { │ │ │ 'properties': { │ │ │ │ 'street': {'description': 'Street name and number', 'title': 'Street', 'type': 'string'}, │ │ │ │ 'city': {'description': 'City name', 'title': 'City', 'type': 'string'}, │ │ │ │ 'postal_code': { │ │ │ │ │ 'description': 'Postal code or ZIP code', │ │ │ │ │ 'title': 'Postal Code', │ │ │ │ │ 'type': 'integer' │ │ │ │ } │ │ │ }, │ │ │ 'required': ['street', 'city', 'postal_code'], │ │ │ 'title': 'Address', │ │ │ 'type': 'object' │ │ }, │ │ 'Pet': { │ │ │ 'properties': { │ │ │ │ 'name': {'description': 'Pet name', 'title': 'Name', 'type': 'string'}, │ │ │ │ 'age': {'description': 'Pet age in years', 'title': 'Age', 'type': 'integer'} │ │ │ }, │ │ │ 'required': ['name', 'age'], │ │ │ 'title': 'Pet', │ │ │ 'type': 'object' │ │ }, │ │ 'Tags': { │ │ │ 'properties': { │ │ │ │ 'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'}, │ │ │ │ 'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'} │ │ │ }, │ │ │ 'required': ['name', 'value'], │ │ │ 'title': 'Tags', │ │ │ 'type': 'object' │ │ } │ }, │ 'properties': { │ │ 'name': {'description': "The user's full name", 'title': 'Name', 'type': 'string'}, │ │ 'age': {'default': None, 'description': "The user's age in years", 'title': 'Age', 'type': 'integer'}, │ │ 'address': { │ │ │ 'anyOf': [{'$ref': '#/$defs/Address'}, {'type': 'null'}], │ │ │ 'description': "The user's mailing address" │ │ }, │ │ 'phone_numbers': { │ │ │ 'description': "List of user's phone numbers", │ │ │ 'items': {'type': 'string'}, │ │ │ 'title': 'Phone Numbers', │ │ │ 'type': 'array' │ │ }, │ │ 'tags': { │ │ │ 'description': 'List of user tags', │ │ │ 'items': {'$ref': '#/$defs/Tags'}, │ │ │ 'title': 'Tags', │ │ │ 'type': 'array' │ │ }, │ │ 'pet': {'allOf': [{'$ref': '#/$defs/Pet'}], 'description': "User's pet"} │ }, │ 'required': ['name', 'address', 'phone_numbers', 'tags', 'pet'], │ 'title': 'User', │ 'type': 'object' }
What We Did Not Handle?#
First, we note that we can handle nested lists such as `List[List[object]]`,
where `object` is a complex type that is not one of the basic types (i.e.
`int`, `str`, `list`, `dict`, `tuple`). Here, we add `tagss` to the schema,
which we expect to become `List[List[Tagss]]`; this is handled by the
following branch:
elif origin is list:
if not args:
return List[Any]
item_type = resolve_field_type(args[0], field_info.get("items", {}), f"{field_name}")
return List[item_type] # type: ignore[valid-type]
# Add a doubly nested list field: each level of "items" describes the element
# type one level further down, ending in an `object` with its own "properties".
user_schema.update(
    {
        "tagss": {
            "type": List[List[object]],
            "optional": False,
            "items": {
                "type": List[object],
                "items": {
                    "type": object,
                    "properties": {
                        "name": {"type": str, "description": "Tag name"},
                        "value": {"type": int, "description": "Tag value"},
                    },
                },
            },
        },
    }
)
# Regenerate the model from the extended schema (rebinds `User` again).
User = generate_dynamic_model(user_schema, model_name="User")
user = User(
    name="John Doe",
    age=20,
    address={"street": "123 Main St", "city": "Anytown", "postal_code": 12345},
    phone_numbers=["1234567890", "0987654321"],
    tags=[{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
    pet={"name": "Fluffy", "age": 2},
    # A list of lists of tag dicts, matching List[List[object]].
    tagss=[
        [{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
        [{"name": "tag3", "value": 3}, {"name": "tag4", "value": 4}],
    ],
)
# Show the instance, its dict dump, its field metadata and its JSON schema.
pprint(user)
pprint(user.model_dump())
pprint(user.model_fields)
pprint(user.model_json_schema())
User( │ name='John Doe', │ age=20, │ address=Address(street='123 Main St', city='Anytown', postal_code=12345), │ phone_numbers=['1234567890', '0987654321'], │ tags=[Tags(name='tag1', value=1), Tags(name='tag2', value=2)], │ pet=Pet(name='Fluffy', age=2), │ tagss=[ │ │ [Tagss(name='tag1', value=1), Tagss(name='tag2', value=2)], │ │ [Tagss(name='tag3', value=3), Tagss(name='tag4', value=4)] │ ] )
{ │ 'name': 'John Doe', │ 'age': 20, │ 'address': {'street': '123 Main St', 'city': 'Anytown', 'postal_code': 12345}, │ 'phone_numbers': ['1234567890', '0987654321'], │ 'tags': [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}], │ 'pet': {'name': 'Fluffy', 'age': 2}, │ 'tagss': [ │ │ [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}], │ │ [{'name': 'tag3', 'value': 3}, {'name': 'tag4', 'value': 4}] │ ] }
{ │ 'name': FieldInfo(annotation=str, required=True, description="The user's full name"), │ 'age': FieldInfo(annotation=int, required=False, description="The user's age in years"), │ 'address': FieldInfo( │ │ annotation=Union[Address, NoneType], │ │ required=True, │ │ description="The user's mailing address" │ ), │ 'phone_numbers': FieldInfo(annotation=List[str], required=True, description="List of user's phone numbers"), │ 'tags': FieldInfo(annotation=List[__main__.Tags], required=True, description='List of user tags'), │ 'pet': FieldInfo(annotation=Pet, required=True, description="User's pet"), │ 'tagss': FieldInfo(annotation=List[List[__main__.Tagss]], required=True) }
{ │ '$defs': { │ │ 'Address': { │ │ │ 'properties': { │ │ │ │ 'street': {'description': 'Street name and number', 'title': 'Street', 'type': 'string'}, │ │ │ │ 'city': {'description': 'City name', 'title': 'City', 'type': 'string'}, │ │ │ │ 'postal_code': { │ │ │ │ │ 'description': 'Postal code or ZIP code', │ │ │ │ │ 'title': 'Postal Code', │ │ │ │ │ 'type': 'integer' │ │ │ │ } │ │ │ }, │ │ │ 'required': ['street', 'city', 'postal_code'], │ │ │ 'title': 'Address', │ │ │ 'type': 'object' │ │ }, │ │ 'Pet': { │ │ │ 'properties': { │ │ │ │ 'name': {'description': 'Pet name', 'title': 'Name', 'type': 'string'}, │ │ │ │ 'age': {'description': 'Pet age in years', 'title': 'Age', 'type': 'integer'} │ │ │ }, │ │ │ 'required': ['name', 'age'], │ │ │ 'title': 'Pet', │ │ │ 'type': 'object' │ │ }, │ │ 'Tags': { │ │ │ 'properties': { │ │ │ │ 'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'}, │ │ │ │ 'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'} │ │ │ }, │ │ │ 'required': ['name', 'value'], │ │ │ 'title': 'Tags', │ │ │ 'type': 'object' │ │ }, │ │ 'Tagss': { │ │ │ 'properties': { │ │ │ │ 'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'}, │ │ │ │ 'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'} │ │ │ }, │ │ │ 'required': ['name', 'value'], │ │ │ 'title': 'Tagss', │ │ │ 'type': 'object' │ │ } │ }, │ 'properties': { │ │ 'name': {'description': "The user's full name", 'title': 'Name', 'type': 'string'}, │ │ 'age': {'default': None, 'description': "The user's age in years", 'title': 'Age', 'type': 'integer'}, │ │ 'address': { │ │ │ 'anyOf': [{'$ref': '#/$defs/Address'}, {'type': 'null'}], │ │ │ 'description': "The user's mailing address" │ │ }, │ │ 'phone_numbers': { │ │ │ 'description': "List of user's phone numbers", │ │ │ 'items': {'type': 'string'}, │ │ │ 'title': 'Phone Numbers', │ │ │ 'type': 'array' │ │ }, │ │ 'tags': { │ │ │ 'description': 'List of user tags', │ │ │ 'items': 
{'$ref': '#/$defs/Tags'}, │ │ │ 'title': 'Tags', │ │ │ 'type': 'array' │ │ }, │ │ 'pet': {'allOf': [{'$ref': '#/$defs/Pet'}], 'description': "User's pet"}, │ │ 'tagss': { │ │ │ 'items': {'items': {'$ref': '#/$defs/Tagss'}, 'type': 'array'}, │ │ │ 'title': 'Tagss', │ │ │ 'type': 'array' │ │ } │ }, │ 'required': ['name', 'address', 'phone_numbers', 'tags', 'pet', 'tagss'], │ 'title': 'User', │ 'type': 'object' }
We can also define a model class in Python and reference it directly as a field type in the dict schema.
# A hand-written model that is later used as a field type inside the dict schema.
# Kept free of a docstring on purpose: Pydantic would emit it as a schema description.
class Movie(BaseModel):
    # Title of the film.
    title: str
    # Year associated with the film.
    year: int
    # Genre labels, e.g. ["Action", "Sci-Fi"].
    genre: List[str]
# Add a `movies` field whose element type is the hand-written Movie model.
user_schema["movies"] = {
    "type": List[Movie],
    "optional": False,
    "default": None,
}

# Rebuild the dynamic model so that it picks up the new field.
User = generate_dynamic_model(user_schema, model_name="User")

# Nested dict payloads are used for the generated sub-models; the movies are
# passed as ready-made Movie instances.
user = User(
    name="John Doe",
    age=20,
    address={"street": "123 Main St", "city": "Anytown", "postal_code": 12345},
    phone_numbers=["1234567890", "0987654321"],
    tags=[{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
    pet={"name": "Fluffy", "age": 2},
    tagss=[
        [{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
        [{"name": "tag3", "value": 3}, {"name": "tag4", "value": 4}],
    ],
    movies=[
        Movie(title="The Matrix", year=1999, genre=["Action", "Sci-Fi"]),
        Movie(title="The Matrix Reloaded", year=2003, genre=["Action", "Sci-Fi"]),
    ],
)

# Show the instance, its dumped dict, the field metadata and the JSON schema.
pprint(user)
pprint(user.model_dump())
pprint(user.model_fields)
pprint(user.model_json_schema())
User( │ name='John Doe', │ age=20, │ address=Address(street='123 Main St', city='Anytown', postal_code=12345), │ phone_numbers=['1234567890', '0987654321'], │ tags=[Tags(name='tag1', value=1), Tags(name='tag2', value=2)], │ pet=Pet(name='Fluffy', age=2), │ tagss=[ │ │ [Tagss(name='tag1', value=1), Tagss(name='tag2', value=2)], │ │ [Tagss(name='tag3', value=3), Tagss(name='tag4', value=4)] │ ], │ movies=[ │ │ Movie(title='The Matrix', year=1999, genre=['Action', 'Sci-Fi']), │ │ Movie(title='The Matrix Reloaded', year=2003, genre=['Action', 'Sci-Fi']) │ ] )
{ │ 'name': 'John Doe', │ 'age': 20, │ 'address': {'street': '123 Main St', 'city': 'Anytown', 'postal_code': 12345}, │ 'phone_numbers': ['1234567890', '0987654321'], │ 'tags': [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}], │ 'pet': {'name': 'Fluffy', 'age': 2}, │ 'tagss': [ │ │ [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}], │ │ [{'name': 'tag3', 'value': 3}, {'name': 'tag4', 'value': 4}] │ ], │ 'movies': [ │ │ {'title': 'The Matrix', 'year': 1999, 'genre': ['Action', 'Sci-Fi']}, │ │ {'title': 'The Matrix Reloaded', 'year': 2003, 'genre': ['Action', 'Sci-Fi']} │ ] }
{ │ 'name': FieldInfo(annotation=str, required=True, description="The user's full name"), │ 'age': FieldInfo(annotation=int, required=False, description="The user's age in years"), │ 'address': FieldInfo( │ │ annotation=Union[Address, NoneType], │ │ required=True, │ │ description="The user's mailing address" │ ), │ 'phone_numbers': FieldInfo(annotation=List[str], required=True, description="List of user's phone numbers"), │ 'tags': FieldInfo(annotation=List[__main__.Tags], required=True, description='List of user tags'), │ 'pet': FieldInfo(annotation=Pet, required=True, description="User's pet"), │ 'tagss': FieldInfo(annotation=List[List[__main__.Tagss]], required=True), │ 'movies': FieldInfo(annotation=List[__main__.Movie], required=False) }
{ │ '$defs': { │ │ 'Address': { │ │ │ 'properties': { │ │ │ │ 'street': {'description': 'Street name and number', 'title': 'Street', 'type': 'string'}, │ │ │ │ 'city': {'description': 'City name', 'title': 'City', 'type': 'string'}, │ │ │ │ 'postal_code': { │ │ │ │ │ 'description': 'Postal code or ZIP code', │ │ │ │ │ 'title': 'Postal Code', │ │ │ │ │ 'type': 'integer' │ │ │ │ } │ │ │ }, │ │ │ 'required': ['street', 'city', 'postal_code'], │ │ │ 'title': 'Address', │ │ │ 'type': 'object' │ │ }, │ │ 'Movie': { │ │ │ 'properties': { │ │ │ │ 'title': {'title': 'Title', 'type': 'string'}, │ │ │ │ 'year': {'title': 'Year', 'type': 'integer'}, │ │ │ │ 'genre': {'items': {'type': 'string'}, 'title': 'Genre', 'type': 'array'} │ │ │ }, │ │ │ 'required': ['title', 'year', 'genre'], │ │ │ 'title': 'Movie', │ │ │ 'type': 'object' │ │ }, │ │ 'Pet': { │ │ │ 'properties': { │ │ │ │ 'name': {'description': 'Pet name', 'title': 'Name', 'type': 'string'}, │ │ │ │ 'age': {'description': 'Pet age in years', 'title': 'Age', 'type': 'integer'} │ │ │ }, │ │ │ 'required': ['name', 'age'], │ │ │ 'title': 'Pet', │ │ │ 'type': 'object' │ │ }, │ │ 'Tags': { │ │ │ 'properties': { │ │ │ │ 'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'}, │ │ │ │ 'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'} │ │ │ }, │ │ │ 'required': ['name', 'value'], │ │ │ 'title': 'Tags', │ │ │ 'type': 'object' │ │ }, │ │ 'Tagss': { │ │ │ 'properties': { │ │ │ │ 'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'}, │ │ │ │ 'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'} │ │ │ }, │ │ │ 'required': ['name', 'value'], │ │ │ 'title': 'Tagss', │ │ │ 'type': 'object' │ │ } │ }, │ 'properties': { │ │ 'name': {'description': "The user's full name", 'title': 'Name', 'type': 'string'}, │ │ 'age': {'default': None, 'description': "The user's age in years", 'title': 'Age', 'type': 'integer'}, │ │ 'address': { │ │ │ 'anyOf': [{'$ref': 
'#/$defs/Address'}, {'type': 'null'}], │ │ │ 'description': "The user's mailing address" │ │ }, │ │ 'phone_numbers': { │ │ │ 'description': "List of user's phone numbers", │ │ │ 'items': {'type': 'string'}, │ │ │ 'title': 'Phone Numbers', │ │ │ 'type': 'array' │ │ }, │ │ 'tags': { │ │ │ 'description': 'List of user tags', │ │ │ 'items': {'$ref': '#/$defs/Tags'}, │ │ │ 'title': 'Tags', │ │ │ 'type': 'array' │ │ }, │ │ 'pet': {'allOf': [{'$ref': '#/$defs/Pet'}], 'description': "User's pet"}, │ │ 'tagss': { │ │ │ 'items': {'items': {'$ref': '#/$defs/Tagss'}, 'type': 'array'}, │ │ │ 'title': 'Tagss', │ │ │ 'type': 'array' │ │ }, │ │ 'movies': {'default': None, 'items': {'$ref': '#/$defs/Movie'}, 'title': 'Movies', 'type': 'array'} │ }, │ 'required': ['name', 'address', 'phone_numbers', 'tags', 'pet', 'tagss'], │ 'title': 'User', │ 'type': 'object' }
What we have trouble with is the case below, where we define a dictionary whose values are themselves complex objects.
# A mapping field whose values are structured objects, described through
# `additionalProperties` instead of `properties`.
violated_schema = {
    "tagsss": {
        "type": Dict[str, object],
        "optional": False,
        "additionalProperties": {
            "type": object,
            "properties": {
                "name": {"type": str, "description": "Tag name"},
                "value": {"type": int, "description": "Tag value"},
            },
        },
    },
}

Violated = generate_dynamic_model(violated_schema, model_name="Violated")
violated = Violated(tagsss={"key": {"name": "tag1", "value": 1}})

pprint(violated)
pprint(violated.model_dump())
pprint(violated.model_fields)

# JSON-schema generation is the step that raises in the recorded output, so the
# exception is caught and printed instead of aborting the cell.
try:
    pprint(violated.model_json_schema())
except Exception as exc:
    pprint(exc)
Violated(tagsss={'key': Tagsss(name='tag1', value=1)})
{'tagsss': {'key': {'name': 'tag1', 'value': 1}}}
{ │ 'tagsss': FieldInfo( │ │ annotation=Dict[str, __main__.Tagsss], │ │ required=True, │ │ json_schema_extra={ │ │ │ 'additionalProperties': { │ │ │ │ 'type': <class 'object'>, │ │ │ │ 'properties': { │ │ │ │ │ 'name': {'type': <class 'str'>, 'description': 'Tag name'}, │ │ │ │ │ 'value': {'type': <class 'int'>, 'description': 'Tag value'} │ │ │ │ } │ │ │ } │ │ } │ ) }
PydanticSerializationError(Unable to serialize unknown type: <class 'type'>)
Why is the Pydantic model able to correctly infer the schema of the nested
dictionary, with arbitrary nesting, and still validate data against it, yet unable to
generate its JSON schema? I did not dig deep, but I suspect a good amount of
coercion is happening in the background. One big reason on our side, though, is that
our additionalProperties
is not being handled properly.
{
'tagsss': FieldInfo(
annotation=Dict[str, __main__.Tagsss],
required=True,
json_schema_extra={
'additionalProperties': {
'type': <class 'object'>,
'properties': {
'name': {'type': <class 'str'>, 'description': 'Tag name'},
'value': {'type': <class 'int'>, 'description': 'Tag value'}
}
}
}
)
}
Each type
entry should be a plain string, but here it is a Python class object. So
<class 'str'>
should just be the string str
. For now, we do not handle this.
Moreover, there are many, many scenarios involving nested complex objects that we do not handle. We do not reinvent the wheel here, as datamodel-code-generator does the job well enough. Note that creating a model from a JSON schema has been requested as a feature, and the author of Pydantic personally suggested trying datamodel-code-generator, since Pydantic does not support it yet.
What is get_origin and get_args?#
Suppose you want to check the type of a value such as a = [1, 2, 3].
You would generally call
type(a)
and get list
. Now consider a type alias such as
tp = List[str]
. Here type(tp)
does not return list; it returns
typing._GenericAlias
. To recover list
you have to read
tp.__origin__
, and to recover the type parameters you read tp.__args__
. The former is the origin of the type and
the latter is the arguments of the type, which in this case is (str,)
.
The get_origin
function does exactly this under the hood to get the origin of
the type. So get_origin(List[str])
is list
and get_args(List[str])
is
(str,)
.
type(List[str]), List[str].__origin__, List[str].__args__
(typing._GenericAlias, list, (str,))
get_origin(List[str]), get_args(List[str])
(list, (str,))
type(List[Dict[str, Any]]), List[Dict[str, Any]].__origin__, List[Dict[str, Any]].__args__
(typing._GenericAlias, list, (typing.Dict[str, typing.Any],))
get_origin(List[Dict[str, Any]]), get_args(List[Dict[str, Any]])
(list, (typing.Dict[str, typing.Any],))
Note that isinstance(get_origin(List[str]), list)
is False
because
get_origin(List[str])
returns the origin of the List type, which is list
, and isinstance(list, list)
is False
because list
is a type, not an instance of list
.
get_origin(List[Dict[str, Any]]), type(List[Dict[str, Any]]), isinstance(get_origin(List[Dict[str, Any]]), list), isinstance(list, list)
(list, typing._GenericAlias, False, False)
get_origin(List["object"]), get_args(List["object"])
(list, (ForwardRef('object'),))
From Yaml#
Currently we define our schema in Python. This is convenient because
we can use real types like List[str]
and Dict[str, int]
, but in a YAML-based
config these become plain strings, which we must convert back into types.
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
import yaml
# Plain names that map directly onto a Python type object. A schema "object" is
# represented as ``dict``; "None" and "null" both mean ``NoneType``.
_BASIC_TYPES = {
    "str": str,
    "int": int,
    "float": float,
    "bool": bool,
    "Any": Any,
    "None": type(None),
    "null": type(None),
    "object": dict,
}


def _split_type_args(args: str) -> List[str]:
    """Split ``args`` on the commas that sit outside any brackets or quotes."""
    pieces: List[str] = []
    depth = 0
    quote = ""
    start = 0
    for index, char in enumerate(args):
        if quote:
            # Inside a quoted literal: only the matching quote ends it.
            if char == quote:
                quote = ""
        elif char in "'\"":
            quote = char
        elif char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
        elif char == "," and depth == 0:
            pieces.append(args[start:index].strip())
            start = index + 1
    pieces.append(args[start:].strip())
    return pieces


def str_to_type(type_str: str) -> Any:
    """Convert a textual type annotation into the matching ``typing`` object.

    Supported forms are the basic names (``str``, ``int``, ``float``, ``bool``,
    ``Any``, ``None``/``null``, ``object``) and the generics ``List[...]``,
    ``Dict[..., ...]``, ``Tuple[...]``, ``Union[...]``, ``Optional[...]`` and
    ``Literal[...]``. Generic arguments may themselves be generics, e.g.
    ``Dict[str, List[int]]``.

    Args:
        type_str: The annotation as text; surrounding whitespace is ignored.

    Returns:
        The corresponding type object (e.g. ``typing.List[str]``).

    Raises:
        ValueError: If the text is not a recognised type, a ``Dict`` does not
            have exactly two arguments, or ``Literal`` arguments are not plain
            Python literals.
    """
    import ast

    type_str = type_str.strip()
    if type_str in _BASIC_TYPES:
        return _BASIC_TYPES[type_str]

    name, bracket, remainder = type_str.partition("[")
    if not bracket or not remainder.endswith("]"):
        raise ValueError(f"Unknown type: {type_str}")
    inner = remainder[:-1]

    if name == "List":
        return List[str_to_type(inner)]  # type: ignore[misc]
    if name == "Optional":
        return Optional[str_to_type(inner)]
    if name == "Dict":
        # Split only on the top-level comma so nested generics stay intact.
        args = _split_type_args(inner)
        if len(args) != 2:
            raise ValueError(f"Dict expects exactly two type arguments: {type_str}")
        return Dict[str_to_type(args[0]), str_to_type(args[1])]  # type: ignore[misc]
    if name == "Tuple":
        # "..." is kept as Ellipsis so that Tuple[int, ...] is expressible.
        return Tuple[
            tuple(
                Ellipsis if arg == "..." else str_to_type(arg)
                for arg in _split_type_args(inner)
            )
        ]
    if name == "Union":
        return Union[tuple(str_to_type(arg) for arg in _split_type_args(inner))]
    if name == "Literal":
        # literal_eval accepts only Python literals, so text coming from a
        # config file cannot execute arbitrary code the way eval() would.
        try:
            values = ast.literal_eval(inner.strip())
        except (ValueError, SyntaxError) as err:
            raise ValueError(f"Invalid Literal arguments: {type_str}") from err
        return Literal[values]

    raise ValueError(f"Unknown type: {type_str}")
def process_schema(schema: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``schema`` with textual ``type`` entries turned into types.

    For every field definition (a dict value) the ``type`` string is converted
    with ``str_to_type``, and nested field definitions found under
    ``properties`` and ``items.properties`` are processed recursively.

    The input is not modified: each field definition that is touched is copied
    first, so the raw schema can still be inspected or processed again.

    Args:
        schema: Mapping of field name to field definition.

    Returns:
        A new mapping with the same keys. Values that are not dicts are carried
        over unchanged.

    Raises:
        ValueError: Propagated from ``str_to_type`` for an unrecognised type string.
    """
    processed_schema: Dict[str, Any] = {}
    for key, value in schema.items():
        if not isinstance(value, dict):
            processed_schema[key] = value
            continue

        # Shallow copy so the caller's definition is never rewritten in place.
        field = dict(value)

        # Only strings need converting; an already-resolved type is left alone,
        # which makes a second pass over a processed schema harmless.
        declared = field.get("type")
        if isinstance(declared, str):
            field["type"] = str_to_type(declared)

        if "properties" in field:
            field["properties"] = process_schema(field["properties"])

        # `items` must be a mapping before looking for nested properties;
        # otherwise `in` would perform a substring test on a string.
        items = field.get("items")
        if isinstance(items, dict) and "properties" in items:
            field["items"] = {**items, "properties": process_schema(items["properties"])}

        processed_schema[key] = field
    return processed_schema
# Load the raw schema. The encoding is given explicitly so that reading the
# file does not depend on the platform's locale default.
with open("./assets/schema.yaml", "r", encoding="utf-8") as file:
    yaml_schema = yaml.safe_load(file)
pprint(yaml_schema)

# Turn the textual type names from the YAML file into real type objects.
processed_schema = process_schema(yaml_schema)
pprint(processed_schema)

User = generate_dynamic_model(processed_schema, "User")
user = User(
    name="John Doe",
    age=30,
    address={"street": "123 Main St", "city": "New York", "postal_code": 10001},
    phone_numbers=["123-456-7890", "098-765-4321"],
    tags=[{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
    # `species` is not declared in the schema; the recorded output shows the
    # resulting Pet without it.
    pet={"name": "Buddy", "age": 5, "species": "Dog"},
)
pprint(user)
{ │ 'name': {'type': 'str', 'description': "The user's full name"}, │ 'age': {'type': 'int', 'optional': False, 'default': None, 'description': "The user's age in years"}, │ 'address': { │ │ 'type': 'object', │ │ 'properties': { │ │ │ 'street': {'type': 'str', 'description': 'Street name and number'}, │ │ │ 'city': {'type': 'str', 'description': 'City name'}, │ │ │ 'postal_code': {'type': 'int', 'description': 'Postal code or ZIP code'} │ │ }, │ │ 'description': "The user's mailing address", │ │ 'optional': True │ }, │ 'phone_numbers': {'type': 'List[str]', 'description': "List of user's phone numbers"}, │ 'tags': { │ │ 'type': 'List[object]', │ │ 'items': { │ │ │ 'properties': { │ │ │ │ 'name': {'type': 'str', 'description': 'Tag name'}, │ │ │ │ 'value': {'type': 'int', 'description': 'Tag value'} │ │ │ } │ │ }, │ │ 'description': 'List of user tags' │ }, │ 'pet': { │ │ 'type': 'object', │ │ 'properties': { │ │ │ 'name': {'type': 'str', 'description': 'Pet name'}, │ │ │ 'age': {'type': 'int', 'description': 'Pet age in years'} │ │ }, │ │ 'description': "User's pet" │ } }
{ │ 'name': {'type': <class 'str'>, 'description': "The user's full name"}, │ 'age': {'type': <class 'int'>, 'optional': False, 'default': None, 'description': "The user's age in years"}, │ 'address': { │ │ 'type': <class 'dict'>, │ │ 'properties': { │ │ │ 'street': {'type': <class 'str'>, 'description': 'Street name and number'}, │ │ │ 'city': {'type': <class 'str'>, 'description': 'City name'}, │ │ │ 'postal_code': {'type': <class 'int'>, 'description': 'Postal code or ZIP code'} │ │ }, │ │ 'description': "The user's mailing address", │ │ 'optional': True │ }, │ 'phone_numbers': {'type': typing.List[str], 'description': "List of user's phone numbers"}, │ 'tags': { │ │ 'type': typing.List[dict], │ │ 'items': { │ │ │ 'properties': { │ │ │ │ 'name': {'type': <class 'str'>, 'description': 'Tag name'}, │ │ │ │ 'value': {'type': <class 'int'>, 'description': 'Tag value'} │ │ │ } │ │ }, │ │ 'description': 'List of user tags' │ }, │ 'pet': { │ │ 'type': <class 'dict'>, │ │ 'properties': { │ │ │ 'name': {'type': <class 'str'>, 'description': 'Pet name'}, │ │ │ 'age': {'type': <class 'int'>, 'description': 'Pet age in years'} │ │ }, │ │ 'description': "User's pet" │ } }
User( │ name='John Doe', │ age=30, │ address=Address(street='123 Main St', city='New York', postal_code=10001), │ phone_numbers=['123-456-7890', '098-765-4321'], │ tags=[Tags(name='tag1', value=1), Tags(name='tag2', value=2)], │ pet=Pet(name='Buddy', age=5) )