Pydantic Is All You Need - Jason Liu#

Twitter Handle LinkedIn Profile GitHub Profile Tag

from datetime import datetime
from typing import Any, Dict, List

from pydantic import BaseModel, Field, ValidationError, ValidationInfo, field_validator
from rich.pretty import pprint
from typing_extensions import Self
# Example model: a user record whose JSON schema and field metadata are printed below.
class User(BaseModel):
    # Required integer id; the description and examples appear in the JSON schema.
    id: int = Field(..., description="The user id", examples=[1, 2, 3])
    # Required string constrained to between 2 and 50 characters.
    name: str = Field(..., min_length=2, max_length=50)
    # Declared as a plain string, so no email-format check is applied.
    email: str = Field(..., description="Email of the user")
    # Required datetime with no default.
    birth_date: datetime
    # Optional flag that defaults to True when omitted.
    is_active: bool = True
pprint(User.model_json_schema())
pprint(User.model_fields)
{
'properties': {
│   │   'id': {'description': 'The user id', 'examples': [1, 2, 3], 'title': 'Id', 'type': 'integer'},
│   │   'name': {'maxLength': 50, 'minLength': 2, 'title': 'Name', 'type': 'string'},
│   │   'email': {'description': 'Email of the user', 'title': 'Email', 'type': 'string'},
│   │   'birth_date': {'format': 'date-time', 'title': 'Birth Date', 'type': 'string'},
│   │   'is_active': {'default': True, 'title': 'Is Active', 'type': 'boolean'}
},
'required': ['id', 'name', 'email', 'birth_date'],
'title': 'User',
'type': 'object'
}
{
'id': FieldInfo(annotation=int, required=True, description='The user id', examples=[1, 2, 3]),
'name': FieldInfo(annotation=str, required=True, metadata=[MinLen(min_length=2), MaxLen(max_length=50)]),
'email': FieldInfo(annotation=str, required=True, description='Email of the user'),
'birth_date': FieldInfo(annotation=datetime, required=True),
'is_active': FieldInfo(annotation=bool, required=False, default=True)
}
# Container model that nests a list of User models alongside a free-form mapping.
class Users(BaseModel):
    # Required mapping from string keys to lists of integers.
    random_attribute: Dict[str, List[int]] = Field(..., description="A random attribute.")
    # Required list of User models; the generated JSON schema references User via $defs.
    users: list[User]
pprint(Users.model_json_schema())
pprint(Users.model_fields)
{
'$defs': {
│   │   'User': {
│   │   │   'properties': {
│   │   │   │   'id': {'description': 'The user id', 'examples': [1, 2, 3], 'title': 'Id', 'type': 'integer'},
│   │   │   │   'name': {'maxLength': 50, 'minLength': 2, 'title': 'Name', 'type': 'string'},
│   │   │   │   'email': {'description': 'Email of the user', 'title': 'Email', 'type': 'string'},
│   │   │   │   'birth_date': {'format': 'date-time', 'title': 'Birth Date', 'type': 'string'},
│   │   │   │   'is_active': {'default': True, 'title': 'Is Active', 'type': 'boolean'}
│   │   │   },
│   │   │   'required': ['id', 'name', 'email', 'birth_date'],
│   │   │   'title': 'User',
│   │   │   'type': 'object'
│   │   }
},
'properties': {
│   │   'random_attribute': {
│   │   │   'additionalProperties': {'items': {'type': 'integer'}, 'type': 'array'},
│   │   │   'description': 'A random attribute.',
│   │   │   'title': 'Random Attribute',
│   │   │   'type': 'object'
│   │   },
│   │   'users': {'items': {'$ref': '#/$defs/User'}, 'title': 'Users', 'type': 'array'}
},
'required': ['random_attribute', 'users'],
'title': 'Users',
'type': 'object'
}
{
'random_attribute': FieldInfo(
│   │   annotation=Dict[str, List[int]],
│   │   required=True,
│   │   description='A random attribute.'
),
'users': FieldInfo(annotation=list[User], required=True)
}

Below is a showcase of how Pydantic coerces, parses, and validates user inputs.

try:
    user = User(
        id="123",                           # String input but coerced to int
        name="Alice",                       # String input with correct length
        email="alice@example.com",          # String input
        birth_date="1990-01-01T00:00:00",   # String input but parsed to datetime
        is_active="yes"                     # String input but coerced to bool
    )
    pprint(user)

    user_all_input_types_correct = User(
        id=123,
        name="Alice",
        email="alice@example.com",
        birth_date=datetime(1990, 1, 1),
        is_active=True
    )
    pprint(user_all_input_types_correct)
    assert user == user_all_input_types_correct
except ValidationError as exc:
    print("Validation error:\n")
    pprint(exc)
User(
id=123,
name='Alice',
email='alice@example.com',
birth_date=datetime.datetime(1990, 1, 1, 0, 0),
is_active=True
)
User(
id=123,
name='Alice',
email='alice@example.com',
birth_date=datetime.datetime(1990, 1, 1, 0, 0),
is_active=True
)

Below is a failing case in which parsing and validation fail, showing that real type checking and data validation are taking place. Note that email is declared as a plain str, so "not_an_email" is accepted and only four errors are reported.

try:
    user = User(
        id="abc",  # Can't be parsed to int
        name=[1, 2, 3],
        email="not_an_email",
        birth_date="invalid_date",
        is_active=None
    )
    pprint(user)
except ValidationError as exc:
    print("Validation error:\n")
    pprint(exc)
Validation error:
4 validation errors for User
id
  Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='abc', input_type=str]
For further information visit https://errors.pydantic.dev/2.5/v/int_parsing
name
  Input should be a valid string [type=string_type, input_value=[1, 2, 3], input_type=list]
For further information visit https://errors.pydantic.dev/2.5/v/string_type
birth_date
  Input should be a valid datetime, invalid character in year [type=datetime_parsing, input_value='invalid_date', input_type=str]
For further information visit https://errors.pydantic.dev/2.5/v/datetime_parsing
is_active
  Input should be a valid boolean [type=bool_type, input_value=None, input_type=NoneType]
For further information visit https://errors.pydantic.dev/2.5/v/bool_type

Field Validators#

Before#

# Baseline model with no custom validators: only Pydantic's default parsing applies.
class ComplexUser(BaseModel):
    id: int  # a prefixed value such as "ID-12345" cannot be parsed as an integer here
    name: str
    code: str
    status: str

Suppose that, in your company, every id starts with the prefix ID- followed by a unique integer. The internal parser cannot coerce the string ID-12345 into an integer. Since the integer following ID- is unique, we can add a field_validator that extracts the integer part and validates it. We want a before field validator because our custom validation/parsing/coercion logic must run before the default Pydantic parsing logic.

try:
    model = ComplexUser(id="ID-12345", name="Prefixed ID", code="CODE_456", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
1 validation error for ComplexUser
id
  Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='ID-12345', input_type=str]
For further information visit https://errors.pydantic.dev/2.5/v/int_parsing

To add the before validator, we can use the field_validator decorator.

class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls, v: Any) -> Any:
        """Strip the ``ID-`` prefix from a raw id before Pydantic parses it.

        Args:
            v: The raw, not-yet-validated input for ``id``.

        Returns:
            The integer after the prefix when ``v`` is a string starting with
            ``ID-``; otherwise ``v`` unchanged, so that Pydantic's default
            integer parsing handles (or rejects) it.
        """
        # `cls` is the model class itself, not an instance, so it is left
        # unannotated rather than typed as `Self`.
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])
        return v
try:
    model = ComplexUser(id="ID-12345", name="John Doe", code="CODE_456", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
ComplexUser(id=12345, name='John Doe', code='CODE_456', status='inactive')

So we see that when the default pydantic parsing may fail, we can add before field validators to handle the parsing and validation of the raw input data first, before the default pydantic parsing logic takes over.

After#

In a similar vein, we can also add after field validators to handle the parsed data after the default Pydantic parsing logic has run. An after field validator is useful for post-processing or additional validation of parsed data. Because it runs after parsing, the value it receives is guaranteed to be of the declared type, and it is up to you to post-process it.

Consider the case where you want to capitalize the name field after it has been parsed. We will use .title() because we want to capitalize the first letter of each word in the string and not just the first letter of the entire string.

class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls, v: Any) -> Any:
        """Strip the ``ID-`` prefix from a raw id before Pydantic parses it.

        Returns the integer after the prefix when ``v`` is a string starting
        with ``ID-``; otherwise returns ``v`` unchanged for default parsing.
        """
        # `cls` is the model class itself, not an instance, so it is left
        # unannotated rather than typed as `Self`.
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])
        return v

    @field_validator('name', mode="after")
    @classmethod
    def capitalize_name(cls, v: str) -> str:
        """Title-case the already-validated name (first letter of every word)."""
        print(f"Capitalizing name: {v}")
        return v.title()

try:
    model = ComplexUser(id="ID-12345", name="john doe", code="CODE_456", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
Capitalizing name: john doe
ComplexUser(id=12345, name='John Doe', code='CODE_456', status='inactive')

We see that when the user inputs an all-lowercase string such as john doe, the after field validator capitalizes the first letter of each word in the string.

However, since it happens after the validation internally, we can actually do naughty things like changing the value of the field to something else. For example, no one is stopping me from just returning a list of integers in the after field capitalize_name validator.

class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls, v: Any) -> Any:
        """Strip the ``ID-`` prefix from a raw id before Pydantic parses it.

        Returns the integer after the prefix when ``v`` is a string starting
        with ``ID-``; otherwise returns ``v`` unchanged for default parsing.
        """
        # `cls` is the model class itself, not an instance, so it is left
        # unannotated rather than typed as `Self`.
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])
        return v

    @field_validator('name', mode="after")
    @classmethod
    def capitalize_name(cls, v: str) -> str:
        """Demonstrate that an after validator's return value is not re-checked."""
        print(f"Capitalizing name: {v}")
        # Deliberately violates the declared `str` return type: the list is
        # stored on the model as-is because nothing validates it afterwards.
        return [1,2,3]

try:
    model = ComplexUser(id="ID-12345", name="john doe", code="CODE_456", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
Capitalizing name: john doe
ComplexUser(id=12345, name=[1, 2, 3], code='CODE_456', status='inactive')

And the code still runs without any errors. So be careful with after field validators: whatever they return is stored without further checks, so they can change the field's value to something of an entirely different type.

Plain#

A plain validator completely replaces Pydantic’s internal validation for the field and is responsible for all type checking and validation. No other validators are called after it, which makes it useful when you need full control over the validation logic.

class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls, v: Any) -> Any:
        """Strip the ``ID-`` prefix from a raw id before Pydantic parses it.

        Returns the integer after the prefix when ``v`` is a string starting
        with ``ID-``; otherwise returns ``v`` unchanged for default parsing.
        """
        # `cls` is the model class itself, not an instance, so it is left
        # unannotated rather than typed as `Self`.
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])
        return v

    @field_validator('name', mode="after")
    @classmethod
    def capitalize_name(cls, v: str) -> str:
        """Demonstrate that an after validator's return value is not re-checked."""
        print(f"Capitalizing name: {v}")
        # Deliberately violates the declared `str` return type: the list is
        # stored on the model as-is because nothing validates it afterwards.
        return [1,2,3]

    @field_validator('code', mode='plain')
    @classmethod
    def validate_code(cls, v: Any) -> str:
        """Validate ``code`` entirely by hand (plain mode).

        Args:
            v: The raw input for ``code``.

        Returns:
            ``v`` unchanged when it is a string starting with ``CODE_``.

        Raises:
            ValueError: If ``v`` is not a string or lacks the ``CODE_`` prefix.
        """
        if not isinstance(v, str) or not v.startswith('CODE_'):
            raise ValueError("Code must be a string starting with 'CODE_'")
        return v

try:
    model = ComplexUser(id="ID-12345", name="john doe", code="AAA", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)


try:
    model = ComplexUser(id="ID-12345", name="john doe", code="CODE_AAA", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
Capitalizing name: john doe
1 validation error for ComplexUser
code
  Value error, Code must be a string starting with 'CODE_' [type=value_error, input_value='AAA', input_type=str]
For further information visit https://errors.pydantic.dev/2.5/v/value_error
Preprocessing ID: ID-12345
Capitalizing name: john doe
ComplexUser(id=12345, name=[1, 2, 3], code='CODE_AAA', status='inactive')

Wrap#

See discussion here to get a glimpse of how to use wrap validator.

  • Can run code before and after Pydantic’s internal validation

  • Receives a handler function to call the inner validator

  • Can modify input before validation and output after validation

  • Can catch and handle validation errors from inner validators

class ComplexUser(BaseModel):
    id: int
    name: str
    code: str
    status: str

    @field_validator('id', mode='before')
    @classmethod
    def preprocess_id(cls, v: Any) -> Any:
        """Strip the ``ID-`` prefix from a raw id before Pydantic parses it.

        Returns the integer after the prefix when ``v`` is a string starting
        with ``ID-``; otherwise returns ``v`` unchanged for default parsing.
        """
        # `cls` is the model class itself, not an instance, so it is left
        # unannotated rather than typed as `Self`.
        if isinstance(v, str) and v.startswith('ID-'):
            print(f"Preprocessing ID: {v}")
            return int(v[3:])
        return v

    @field_validator('name', mode="after")
    @classmethod
    def capitalize_name(cls, v: str) -> str:
        """Demonstrate that an after validator's return value is not re-checked."""
        print(f"Capitalizing name: {v}")
        # Deliberately violates the declared `str` return type: the list is
        # stored on the model as-is because nothing validates it afterwards.
        return [1,2,3]

    @field_validator('code', mode='plain')
    @classmethod
    def validate_code(cls, v: Any) -> str:
        """Validate ``code`` entirely by hand (plain mode).

        Returns ``v`` unchanged when it is a string starting with ``CODE_``
        and raises ``ValueError`` otherwise.
        """
        if not isinstance(v, str) or not v.startswith('CODE_'):
            raise ValueError("Code must be a string starting with 'CODE_'")
        return v

    @field_validator('status', mode='wrap')
    @classmethod
    def validate_status(cls, value: Any, handler: Any, info: ValidationInfo) -> str:
        """Normalise and validate ``status`` around Pydantic's inner validation.

        Args:
            value: The raw input for ``status``.
            handler: Callable that runs the inner validation on a value.
            info: Validation context supplied by Pydantic (unused here).

        Returns:
            The validated status, which is either ``'ACTIVE'`` or ``'INACTIVE'``.

        Raises:
            ValueError: If the inner validation fails, or if the validated
                value is not one of the two allowed statuses.
        """
        # pre-processing: upper-case strings so the comparison below is case-insensitive
        if isinstance(value, str):
            value = value.upper()

        # inner validator: only the handler call can raise, so keep the try minimal
        try:
            validated = handler(value)
        except ValueError as exc:
            raise ValueError(f"Invalid status: {exc}") from exc
        pprint(validated)

        # post-processing
        if validated not in ('ACTIVE', 'INACTIVE'):
            raise ValueError("Status must be either 'ACTIVE' or 'INACTIVE'")

        return validated
try:
    model = ComplexUser(id="ID-12345", name="john doe", code="CODE_AAA", status="inactive")
    pprint(model)
except ValidationError as exc:
    pprint(exc)
Preprocessing ID: ID-12345
Capitalizing name: john doe
'INACTIVE'
ComplexUser(id=12345, name=[1, 2, 3], code='CODE_AAA', status='INACTIVE')

Dynamic Model Creation#

Pydantic allows you to dynamically create models at runtime using the create_model function. This is useful when you want to create a model based on some configuration or input.

import functools
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Type, TypeVar, Union, get_args, get_origin

from pydantic import BaseModel, Field, create_model
from rich.pretty import pprint
from typing_extensions import ParamSpec

# https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators
P = ParamSpec("P")
T = TypeVar("T")


def trace(func: Callable[P, T]) -> Callable[P, T]:
    """Decorator to log function calls."""

    @functools.wraps(func)  # This copies the metadata of `func` to `wrapper`
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
        """Me is wrapper."""
        result = func(*args, **kwargs)
        print(f"{func.__name__}({args!r}, {kwargs!r}) " f"-> {result!r}")
        return result

    return wrapper


Annotation = Union[Type[Any], None]


# @trace
def generate_dynamic_model(schema: Dict[str, Dict[str, Any]], model_name: str = "DynamicModel") -> Type[BaseModel]:
    """Build a Pydantic model class at runtime from a dict-based schema.

    Each entry in ``schema`` maps a field name to a spec dict. These keys are
    interpreted here or by ``resolve_field_type``:

    * ``type`` (required): the Python type or typing annotation of the field.
    * ``optional``: when truthy, the resolved type is wrapped in ``Optional``.
    * ``default``: the default value; when absent the field is required.
    * ``properties`` / ``items`` / ``additionalProperties``: nested specs used
      while resolving object, list and dict types.

    Every other key (for example ``description``) is forwarded to ``Field``.

    Args:
        schema: Mapping of field name to field spec.
        model_name: Name given to the generated model class.

    Returns:
        The newly created ``BaseModel`` subclass.

    Raises:
        KeyError: If a field spec has no ``type`` key.
    """
    # Structural keys describe the type rather than the field, so they must
    # not reach `Field`. `additionalProperties` is listed because
    # `resolve_field_type` reads it as a nested spec; forwarding it would pass
    # it to `Field` as an unsupported extra keyword argument.
    structural_keys = ("type", "default", "optional", "properties", "items", "additionalProperties")

    fields: Dict[str, Any] = {}
    for field_name, field_info in schema.items():
        field_type = resolve_field_type(field_info["type"], field_info, field_name)

        if field_info.get("optional", False):
            field_type = Optional[field_type]

        # Ellipsis is Pydantic's marker for "required, no default".
        default_value = field_info.get("default", ...)
        field_metadata = {k: v for k, v in field_info.items() if k not in structural_keys}
        fields[field_name] = (
            field_type,
            Field(default=default_value, **field_metadata),
        )

    return create_model(model_name, **fields)


def resolve_field_type(field_type: Type[Any], field_info: Dict[str, Any], field_name: str) -> Union[Type[Any], Any]:
    """Turn a schema ``type`` entry into an annotation usable by ``create_model``.

    ``object`` and ``dict`` subclasses become nested dynamic models built from
    ``field_info["properties"]``. ``List[...]``, ``Dict[...]`` and
    ``Union[...]`` annotations are resolved recursively. Anything else is
    returned unchanged.

    Args:
        field_type: The type or typing annotation declared in the schema.
        field_info: The spec dict for this field; supplies ``properties``,
            ``items`` and ``additionalProperties`` for nested types.
        field_name: Field name, used (title-cased) to name nested models.

    Returns:
        The resolved annotation.
    """
    origin = get_origin(field_type)
    args = get_args(field_type)

    # Bare `object` / dict subclasses stand for "nested object with properties".
    if field_type is object or (isinstance(field_type, type) and issubclass(field_type, dict)):
        nested_schema = field_info.get("properties", {})
        return generate_dynamic_model(schema=nested_schema, model_name=f"{field_name.title()}")

    if origin is list:
        if not args:
            return List[Any]
        item_type = resolve_field_type(args[0], field_info.get("items", {}), f"{field_name}")
        return List[item_type]  # type: ignore[valid-type]

    if origin is dict:
        key_type = args[0] if args else Any
        value_type = resolve_field_type(
            args[1] if len(args) > 1 else Any,
            field_info.get("additionalProperties", {}),
            f"{field_name}",
        )
        return Dict[key_type, value_type]  # type: ignore[valid-type]

    if origin is Union:
        # Pass the field spec through to every member, so that e.g.
        # Optional[object] or Optional[List[object]] keeps its "properties" /
        # "items" instead of degrading to an empty nested model.
        resolved_args = tuple(resolve_field_type(arg, field_info, f"{field_name}") for arg in args)
        return Union[resolved_args]

    return field_type

Schema 1. Basic Schema#

basic_schema = {
    "a": {"type": int, "optional": True, "default": None},
    "b": {"type": str, "optional": True, "default": None},
    "c": {"type": List[str], "optional": False, "default": None},
    "d": {"type": Dict[str, int], "optional": False, "default": None},
    "e": {"type": Tuple[int, str, List[str]], "optional": False, "default": None},
    "f": {"type": Union[int, str], "optional": False, "default": None},
    "g": {"type": Union[int, None], "optional": False, "default": None},
    "h": {"type": Any, "optional": False, "default": None},
    "i": {"type": Literal["a", "b", 3], "optional": False, "default": None},
    "j": {"type": Optional[int], "optional": True, "default": None},
}

DynamicModel = generate_dynamic_model(basic_schema, model_name="DynamicModel")
dynamic_model = DynamicModel(
    a=1,
    b="hello",
    c=["world"],
    d={"key": 1},
    e=(1, "hello", ["world"]),
    f=1,
    g=100,
    h=1.2,
    i="a",
    j=None,
)
pprint(dynamic_model)
pprint(dynamic_model.model_fields)
pprint(dynamic_model.model_json_schema())
pprint(dynamic_model.model_dump())
DynamicModel(
a=1,
b='hello',
c=['world'],
d={'key': 1},
e=(1, 'hello', ['world']),
f=1,
g=100,
h=1.2,
i='a',
j=None
)
{
'a': FieldInfo(annotation=Union[int, NoneType], required=False),
'b': FieldInfo(annotation=Union[str, NoneType], required=False),
'c': FieldInfo(annotation=List[str], required=False),
'd': FieldInfo(annotation=Dict[str, int], required=False),
'e': FieldInfo(annotation=Tuple[int, str, List[str]], required=False),
'f': FieldInfo(annotation=Union[int, str], required=False),
'g': FieldInfo(annotation=Union[int, NoneType], required=False),
'h': FieldInfo(annotation=Any, required=False),
'i': FieldInfo(annotation=Literal['a', 'b', 3], required=False),
'j': FieldInfo(annotation=Union[int, NoneType], required=False)
}
{
'properties': {
│   │   'a': {'anyOf': [{'type': 'integer'}, {'type': 'null'}], 'default': None, 'title': 'A'},
│   │   'b': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'B'},
│   │   'c': {'default': None, 'items': {'type': 'string'}, 'title': 'C', 'type': 'array'},
│   │   'd': {'additionalProperties': {'type': 'integer'}, 'default': None, 'title': 'D', 'type': 'object'},
│   │   'e': {
│   │   │   'default': None,
│   │   │   'maxItems': 3,
│   │   │   'minItems': 3,
│   │   │   'prefixItems': [
│   │   │   │   {'type': 'integer'},
│   │   │   │   {'type': 'string'},
│   │   │   │   {'items': {'type': 'string'}, 'type': 'array'}
│   │   │   ],
│   │   │   'title': 'E',
│   │   │   'type': 'array'
│   │   },
│   │   'f': {'anyOf': [{'type': 'integer'}, {'type': 'string'}], 'default': None, 'title': 'F'},
│   │   'g': {'anyOf': [{'type': 'integer'}, {'type': 'null'}], 'default': None, 'title': 'G'},
│   │   'h': {'default': None, 'title': 'H'},
│   │   'i': {'default': None, 'enum': ['a', 'b', 3], 'title': 'I'},
│   │   'j': {'anyOf': [{'type': 'integer'}, {'type': 'null'}], 'default': None, 'title': 'J'}
},
'title': 'DynamicModel',
'type': 'object'
}
{
'a': 1,
'b': 'hello',
'c': ['world'],
'd': {'key': 1},
'e': (1, 'hello', ['world']),
'f': 1,
'g': 100,
'h': 1.2,
'i': 'a',
'j': None
}

Schema 2. User Schema With Nested Properties#

user_schema = {
    "name": {"type": str, "description": "The user's full name"},
    "age": {
        "type": int,
        "optional": False,
        "default": None,
        "description": "The user's age in years",
    },
    "address": {
        "type": object,
        "properties": {
            "street": {"type": str, "description": "Street name and number"},
            "city": {"type": str, "description": "City name"},
            "postal_code": {"type": int, "description": "Postal code or ZIP code"},
        },
        "description": "The user's mailing address",
        "optional": True,
    },
    "phone_numbers": {"type": List[str], "description": "List of user's phone numbers"},
    "tags": {
        "type": List[object],
        "items": {
            "properties": {
                "name": {"type": str, "description": "Tag name"},
                "value": {"type": int, "description": "Tag value"},
            }
        },
        "description": "List of user tags",
    },
    "pet": {
        "type": object,
        "properties": {
            "name": {"type": str, "description": "Pet name"},
            "age": {"type": int, "description": "Pet age in years"},
        },
        "description": "User's pet",
    },
}

User = generate_dynamic_model(user_schema, model_name="User")

user = User(
    name="John Doe",
    age=20,
    address={"street": "123 Main St", "city": "Anytown", "postal_code": 12345},
    phone_numbers=["1234567890", "0987654321"],
    tags=[{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
    pet={"name": "Fluffy", "age": 2},
)

pprint(user)

pprint(user.model_dump())
pprint(user.model_fields)
pprint(user.model_json_schema())
User(
name='John Doe',
age=20,
address=Address(street='123 Main St', city='Anytown', postal_code=12345),
phone_numbers=['1234567890', '0987654321'],
tags=[Tags(name='tag1', value=1), Tags(name='tag2', value=2)],
pet=Pet(name='Fluffy', age=2)
)
{
'name': 'John Doe',
'age': 20,
'address': {'street': '123 Main St', 'city': 'Anytown', 'postal_code': 12345},
'phone_numbers': ['1234567890', '0987654321'],
'tags': [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}],
'pet': {'name': 'Fluffy', 'age': 2}
}
{
'name': FieldInfo(annotation=str, required=True, description="The user's full name"),
'age': FieldInfo(annotation=int, required=False, description="The user's age in years"),
'address': FieldInfo(
│   │   annotation=Union[Address, NoneType],
│   │   required=True,
│   │   description="The user's mailing address"
),
'phone_numbers': FieldInfo(annotation=List[str], required=True, description="List of user's phone numbers"),
'tags': FieldInfo(annotation=List[__main__.Tags], required=True, description='List of user tags'),
'pet': FieldInfo(annotation=Pet, required=True, description="User's pet")
}
{
'$defs': {
│   │   'Address': {
│   │   │   'properties': {
│   │   │   │   'street': {'description': 'Street name and number', 'title': 'Street', 'type': 'string'},
│   │   │   │   'city': {'description': 'City name', 'title': 'City', 'type': 'string'},
│   │   │   │   'postal_code': {
│   │   │   │   │   'description': 'Postal code or ZIP code',
│   │   │   │   │   'title': 'Postal Code',
│   │   │   │   │   'type': 'integer'
│   │   │   │   }
│   │   │   },
│   │   │   'required': ['street', 'city', 'postal_code'],
│   │   │   'title': 'Address',
│   │   │   'type': 'object'
│   │   },
│   │   'Pet': {
│   │   │   'properties': {
│   │   │   │   'name': {'description': 'Pet name', 'title': 'Name', 'type': 'string'},
│   │   │   │   'age': {'description': 'Pet age in years', 'title': 'Age', 'type': 'integer'}
│   │   │   },
│   │   │   'required': ['name', 'age'],
│   │   │   'title': 'Pet',
│   │   │   'type': 'object'
│   │   },
│   │   'Tags': {
│   │   │   'properties': {
│   │   │   │   'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'},
│   │   │   │   'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'}
│   │   │   },
│   │   │   'required': ['name', 'value'],
│   │   │   'title': 'Tags',
│   │   │   'type': 'object'
│   │   }
},
'properties': {
│   │   'name': {'description': "The user's full name", 'title': 'Name', 'type': 'string'},
│   │   'age': {'default': None, 'description': "The user's age in years", 'title': 'Age', 'type': 'integer'},
│   │   'address': {
│   │   │   'anyOf': [{'$ref': '#/$defs/Address'}, {'type': 'null'}],
│   │   │   'description': "The user's mailing address"
│   │   },
│   │   'phone_numbers': {
│   │   │   'description': "List of user's phone numbers",
│   │   │   'items': {'type': 'string'},
│   │   │   'title': 'Phone Numbers',
│   │   │   'type': 'array'
│   │   },
│   │   'tags': {
│   │   │   'description': 'List of user tags',
│   │   │   'items': {'$ref': '#/$defs/Tags'},
│   │   │   'title': 'Tags',
│   │   │   'type': 'array'
│   │   },
│   │   'pet': {'allOf': [{'$ref': '#/$defs/Pet'}], 'description': "User's pet"}
},
'required': ['name', 'address', 'phone_numbers', 'tags', 'pet'],
'title': 'User',
'type': 'object'
}

What We Did Not Handle?#

First, note that we can already handle nested lists such as List[List[object]], where object is a complex type rather than one of the basic types (i.e. int, str, list, dict, tuple). Here, we add tagss to the schema and expect it to become List[List[Tagss]]; this is handled by the recursive list branch shown below:

elif origin is list:
    if not args:
        return List[Any]
    item_type = resolve_field_type(args[0], field_info.get("items", {}), f"{field_name}")
    return List[item_type]  # type: ignore[valid-type]
user_schema.update(
    {
        "tagss": {
            "type": List[List[object]],
            "optional": False,
            "items": {
                "type": List[object],
                "items": {
                    "type": object,
                    "properties": {
                        "name": {"type": str, "description": "Tag name"},
                        "value": {"type": int, "description": "Tag value"},
                    },
                },
            },
        },
    }
)

User = generate_dynamic_model(user_schema, model_name="User")

user = User(
    name="John Doe",
    age=20,
    address={"street": "123 Main St", "city": "Anytown", "postal_code": 12345},
    phone_numbers=["1234567890", "0987654321"],
    tags=[{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
    pet={"name": "Fluffy", "age": 2},
    tagss=[
        [{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
        [{"name": "tag3", "value": 3}, {"name": "tag4", "value": 4}],
    ],
)

pprint(user)

pprint(user.model_dump())
pprint(user.model_fields)
pprint(user.model_json_schema())
User(
name='John Doe',
age=20,
address=Address(street='123 Main St', city='Anytown', postal_code=12345),
phone_numbers=['1234567890', '0987654321'],
tags=[Tags(name='tag1', value=1), Tags(name='tag2', value=2)],
pet=Pet(name='Fluffy', age=2),
tagss=[
│   │   [Tagss(name='tag1', value=1), Tagss(name='tag2', value=2)],
│   │   [Tagss(name='tag3', value=3), Tagss(name='tag4', value=4)]
]
)
{
'name': 'John Doe',
'age': 20,
'address': {'street': '123 Main St', 'city': 'Anytown', 'postal_code': 12345},
'phone_numbers': ['1234567890', '0987654321'],
'tags': [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}],
'pet': {'name': 'Fluffy', 'age': 2},
'tagss': [
│   │   [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}],
│   │   [{'name': 'tag3', 'value': 3}, {'name': 'tag4', 'value': 4}]
]
}
{
'name': FieldInfo(annotation=str, required=True, description="The user's full name"),
'age': FieldInfo(annotation=int, required=False, description="The user's age in years"),
'address': FieldInfo(
│   │   annotation=Union[Address, NoneType],
│   │   required=True,
│   │   description="The user's mailing address"
),
'phone_numbers': FieldInfo(annotation=List[str], required=True, description="List of user's phone numbers"),
'tags': FieldInfo(annotation=List[__main__.Tags], required=True, description='List of user tags'),
'pet': FieldInfo(annotation=Pet, required=True, description="User's pet"),
'tagss': FieldInfo(annotation=List[List[__main__.Tagss]], required=True)
}
{
'$defs': {
│   │   'Address': {
│   │   │   'properties': {
│   │   │   │   'street': {'description': 'Street name and number', 'title': 'Street', 'type': 'string'},
│   │   │   │   'city': {'description': 'City name', 'title': 'City', 'type': 'string'},
│   │   │   │   'postal_code': {
│   │   │   │   │   'description': 'Postal code or ZIP code',
│   │   │   │   │   'title': 'Postal Code',
│   │   │   │   │   'type': 'integer'
│   │   │   │   }
│   │   │   },
│   │   │   'required': ['street', 'city', 'postal_code'],
│   │   │   'title': 'Address',
│   │   │   'type': 'object'
│   │   },
│   │   'Pet': {
│   │   │   'properties': {
│   │   │   │   'name': {'description': 'Pet name', 'title': 'Name', 'type': 'string'},
│   │   │   │   'age': {'description': 'Pet age in years', 'title': 'Age', 'type': 'integer'}
│   │   │   },
│   │   │   'required': ['name', 'age'],
│   │   │   'title': 'Pet',
│   │   │   'type': 'object'
│   │   },
│   │   'Tags': {
│   │   │   'properties': {
│   │   │   │   'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'},
│   │   │   │   'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'}
│   │   │   },
│   │   │   'required': ['name', 'value'],
│   │   │   'title': 'Tags',
│   │   │   'type': 'object'
│   │   },
│   │   'Tagss': {
│   │   │   'properties': {
│   │   │   │   'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'},
│   │   │   │   'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'}
│   │   │   },
│   │   │   'required': ['name', 'value'],
│   │   │   'title': 'Tagss',
│   │   │   'type': 'object'
│   │   }
},
'properties': {
│   │   'name': {'description': "The user's full name", 'title': 'Name', 'type': 'string'},
│   │   'age': {'default': None, 'description': "The user's age in years", 'title': 'Age', 'type': 'integer'},
│   │   'address': {
│   │   │   'anyOf': [{'$ref': '#/$defs/Address'}, {'type': 'null'}],
│   │   │   'description': "The user's mailing address"
│   │   },
│   │   'phone_numbers': {
│   │   │   'description': "List of user's phone numbers",
│   │   │   'items': {'type': 'string'},
│   │   │   'title': 'Phone Numbers',
│   │   │   'type': 'array'
│   │   },
│   │   'tags': {
│   │   │   'description': 'List of user tags',
│   │   │   'items': {'$ref': '#/$defs/Tags'},
│   │   │   'title': 'Tags',
│   │   │   'type': 'array'
│   │   },
│   │   'pet': {'allOf': [{'$ref': '#/$defs/Pet'}], 'description': "User's pet"},
│   │   'tagss': {
│   │   │   'items': {'items': {'$ref': '#/$defs/Tagss'}, 'type': 'array'},
│   │   │   'title': 'Tagss',
│   │   │   'type': 'array'
│   │   }
},
'required': ['name', 'address', 'phone_numbers', 'tags', 'pet', 'tagss'],
'title': 'User',
'type': 'object'
}

We can also define a model in python and pass it to the json/dict schema.

# Plain Pydantic model that is referenced directly as a field type
# (List[Movie]) instead of being described through nested "properties".
# Documented with `#` comments on purpose: Pydantic copies a class docstring
# into the model's JSON schema as its `description`.
class Movie(BaseModel):
    title: str
    year: int
    genre: List[str]  # several genre labels per movie

# Add a `movies` field whose "type" is an already-defined Pydantic model
# wrapped in List[...], rather than an "object" with nested "properties".
# NOTE(review): "optional" is False but a default is supplied; the resulting
# field is reported as required=False in the model_fields output below.
user_schema.update(
    {
        "movies": {
            "type": List[Movie],
            "optional": False,
            "default": None,
        }
    }
)

# Rebuild the dynamic model so that it includes the new field.
User = generate_dynamic_model(user_schema, model_name="User")
# Nested dicts are passed for the generated sub-models (address, tags, pet,
# tagss); `movies` is given as real Movie instances.
user = User(
    name="John Doe",
    age=20,
    address={"street": "123 Main St", "city": "Anytown", "postal_code": 12345},
    phone_numbers=["1234567890", "0987654321"],
    tags=[{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
    pet={"name": "Fluffy", "age": 2},
    tagss=[
        [{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
        [{"name": "tag3", "value": 3}, {"name": "tag4", "value": 4}],
    ],
    movies=[Movie(title="The Matrix", year=1999, genre=["Action", "Sci-Fi"]), Movie(title="The Matrix Reloaded", year=2003, genre=["Action", "Sci-Fi"])],
)

# Show the instance, its dict dump, the field metadata and the JSON schema.
pprint(user)
pprint(user.model_dump())
pprint(user.model_fields)
pprint(user.model_json_schema())
User(
name='John Doe',
age=20,
address=Address(street='123 Main St', city='Anytown', postal_code=12345),
phone_numbers=['1234567890', '0987654321'],
tags=[Tags(name='tag1', value=1), Tags(name='tag2', value=2)],
pet=Pet(name='Fluffy', age=2),
tagss=[
│   │   [Tagss(name='tag1', value=1), Tagss(name='tag2', value=2)],
│   │   [Tagss(name='tag3', value=3), Tagss(name='tag4', value=4)]
],
movies=[
│   │   Movie(title='The Matrix', year=1999, genre=['Action', 'Sci-Fi']),
│   │   Movie(title='The Matrix Reloaded', year=2003, genre=['Action', 'Sci-Fi'])
]
)
{
'name': 'John Doe',
'age': 20,
'address': {'street': '123 Main St', 'city': 'Anytown', 'postal_code': 12345},
'phone_numbers': ['1234567890', '0987654321'],
'tags': [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}],
'pet': {'name': 'Fluffy', 'age': 2},
'tagss': [
│   │   [{'name': 'tag1', 'value': 1}, {'name': 'tag2', 'value': 2}],
│   │   [{'name': 'tag3', 'value': 3}, {'name': 'tag4', 'value': 4}]
],
'movies': [
│   │   {'title': 'The Matrix', 'year': 1999, 'genre': ['Action', 'Sci-Fi']},
│   │   {'title': 'The Matrix Reloaded', 'year': 2003, 'genre': ['Action', 'Sci-Fi']}
]
}
{
'name': FieldInfo(annotation=str, required=True, description="The user's full name"),
'age': FieldInfo(annotation=int, required=False, description="The user's age in years"),
'address': FieldInfo(
│   │   annotation=Union[Address, NoneType],
│   │   required=True,
│   │   description="The user's mailing address"
),
'phone_numbers': FieldInfo(annotation=List[str], required=True, description="List of user's phone numbers"),
'tags': FieldInfo(annotation=List[__main__.Tags], required=True, description='List of user tags'),
'pet': FieldInfo(annotation=Pet, required=True, description="User's pet"),
'tagss': FieldInfo(annotation=List[List[__main__.Tagss]], required=True),
'movies': FieldInfo(annotation=List[__main__.Movie], required=False)
}
{
'$defs': {
│   │   'Address': {
│   │   │   'properties': {
│   │   │   │   'street': {'description': 'Street name and number', 'title': 'Street', 'type': 'string'},
│   │   │   │   'city': {'description': 'City name', 'title': 'City', 'type': 'string'},
│   │   │   │   'postal_code': {
│   │   │   │   │   'description': 'Postal code or ZIP code',
│   │   │   │   │   'title': 'Postal Code',
│   │   │   │   │   'type': 'integer'
│   │   │   │   }
│   │   │   },
│   │   │   'required': ['street', 'city', 'postal_code'],
│   │   │   'title': 'Address',
│   │   │   'type': 'object'
│   │   },
│   │   'Movie': {
│   │   │   'properties': {
│   │   │   │   'title': {'title': 'Title', 'type': 'string'},
│   │   │   │   'year': {'title': 'Year', 'type': 'integer'},
│   │   │   │   'genre': {'items': {'type': 'string'}, 'title': 'Genre', 'type': 'array'}
│   │   │   },
│   │   │   'required': ['title', 'year', 'genre'],
│   │   │   'title': 'Movie',
│   │   │   'type': 'object'
│   │   },
│   │   'Pet': {
│   │   │   'properties': {
│   │   │   │   'name': {'description': 'Pet name', 'title': 'Name', 'type': 'string'},
│   │   │   │   'age': {'description': 'Pet age in years', 'title': 'Age', 'type': 'integer'}
│   │   │   },
│   │   │   'required': ['name', 'age'],
│   │   │   'title': 'Pet',
│   │   │   'type': 'object'
│   │   },
│   │   'Tags': {
│   │   │   'properties': {
│   │   │   │   'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'},
│   │   │   │   'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'}
│   │   │   },
│   │   │   'required': ['name', 'value'],
│   │   │   'title': 'Tags',
│   │   │   'type': 'object'
│   │   },
│   │   'Tagss': {
│   │   │   'properties': {
│   │   │   │   'name': {'description': 'Tag name', 'title': 'Name', 'type': 'string'},
│   │   │   │   'value': {'description': 'Tag value', 'title': 'Value', 'type': 'integer'}
│   │   │   },
│   │   │   'required': ['name', 'value'],
│   │   │   'title': 'Tagss',
│   │   │   'type': 'object'
│   │   }
},
'properties': {
│   │   'name': {'description': "The user's full name", 'title': 'Name', 'type': 'string'},
│   │   'age': {'default': None, 'description': "The user's age in years", 'title': 'Age', 'type': 'integer'},
│   │   'address': {
│   │   │   'anyOf': [{'$ref': '#/$defs/Address'}, {'type': 'null'}],
│   │   │   'description': "The user's mailing address"
│   │   },
│   │   'phone_numbers': {
│   │   │   'description': "List of user's phone numbers",
│   │   │   'items': {'type': 'string'},
│   │   │   'title': 'Phone Numbers',
│   │   │   'type': 'array'
│   │   },
│   │   'tags': {
│   │   │   'description': 'List of user tags',
│   │   │   'items': {'$ref': '#/$defs/Tags'},
│   │   │   'title': 'Tags',
│   │   │   'type': 'array'
│   │   },
│   │   'pet': {'allOf': [{'$ref': '#/$defs/Pet'}], 'description': "User's pet"},
│   │   'tagss': {
│   │   │   'items': {'items': {'$ref': '#/$defs/Tagss'}, 'type': 'array'},
│   │   │   'title': 'Tagss',
│   │   │   'type': 'array'
│   │   },
│   │   'movies': {'default': None, 'items': {'$ref': '#/$defs/Movie'}, 'title': 'Movies', 'type': 'array'}
},
'required': ['name', 'address', 'phone_numbers', 'tags', 'pet', 'tagss'],
'title': 'User',
'type': 'object'
}

What we have trouble with is the case below, where we define a complex dictionary whose values are themselves complex objects.

# Field spec for `tagsss`: a mapping with string keys whose values are objects.
# The shape of each value is described under "additionalProperties", which is
# a key of the `tagsss` spec itself (a sibling of "type" and "optional").
violated_schema = {
    "tagsss": {
        "type": Dict[str, object],
        "optional": False,
        "additionalProperties": {
            "type": object,
            "properties": {
                "name": {"type": str, "description": "Tag name"},
                "value": {"type": int, "description": "Tag value"},
            },
        },
    },
}

# Build and instantiate the model. Validation succeeds: the nested dict value
# ends up as a generated Tagsss instance (see the printed output below).
Violated = generate_dynamic_model(violated_schema, model_name="Violated")
violated = Violated(tagsss={"key": {"name": "tag1", "value": 1}})
pprint(violated)
pprint(violated.model_dump())
pprint(violated.model_fields)

# Generating the JSON schema is the step that fails here, so the exception is
# caught and printed to keep the demonstration running.
try:
    pprint(violated.model_json_schema())
except Exception as exc:
    pprint(exc)
Violated(tagsss={'key': Tagsss(name='tag1', value=1)})
{'tagsss': {'key': {'name': 'tag1', 'value': 1}}}
{
'tagsss': FieldInfo(
│   │   annotation=Dict[str, __main__.Tagsss],
│   │   required=True,
│   │   json_schema_extra={
│   │   │   'additionalProperties': {
│   │   │   │   'type': <class 'object'>,
│   │   │   │   'properties': {
│   │   │   │   │   'name': {'type': <class 'str'>, 'description': 'Tag name'},
│   │   │   │   │   'value': {'type': <class 'int'>, 'description': 'Tag value'}
│   │   │   │   }
│   │   │   }
│   │   }
)
}
PydanticSerializationError(Unable to serialize unknown type: <class 'type'>)

Why is the Pydantic model able to correctly infer the schema of a nested dictionary with arbitrary nesting, and validate data against it, yet unable to serialize that schema? I did not dig deep, but I suspect there’s a good amount of coercion happening in the background. One big reason, however, lies in our own code: our additionalProperties is not being handled properly.

{
   'tagsss': FieldInfo(
      annotation=Dict[str, __main__.Tagsss],
      required=True,
      json_schema_extra={
         'additionalProperties': {
            'type': <class 'object'>,
            'properties': {
               'name': {'type': <class 'str'>, 'description': 'Tag name'},
               'value': {'type': <class 'int'>, 'description': 'Tag value'}
            }
         }
      }
   )
}

The values under type should be plain strings, but here they are Python class objects. So <class 'str'> should just be the string str. For now, we do not handle this.

Moreover, there are many scenarios involving nested complex objects that we do not handle. We do not reinvent the wheel, as datamodel-code-generator does the job well enough. Note that creating a model from a JSON schema has been requested, and the author of Pydantic personally suggested trying datamodel-code-generator, as Pydantic does not support it yet.

What is get_origin and get_args?#

Suppose you want to check the type of a value such as a = [1, 2, 3]: you would generally call type(a) and get the type list. Now consider a type alias instead. If you set tp = List[str], then type(tp) returns typing._GenericAlias rather than list. To recover list you have to use tp.__origin__, and to recover its parameters you use tp.__args__; the former is the origin of the type and the latter is the tuple of its type arguments, which in this case is (str,).

The get_origin function does exactly this under the hood to get the origin of the type. So get_origin(List[str]) is list and get_args(List[str]) is (str,).

# The alias object itself is a typing._GenericAlias; its dunder attributes
# expose the runtime origin (list) and the type arguments ((str,)).
type(List[str]), List[str].__origin__, List[str].__args__
(typing._GenericAlias, list, (str,))
# Same origin and arguments as above, obtained without touching dunder attributes.
get_origin(List[str]), get_args(List[str])
(list, (str,))
# Only the outermost layer is unwrapped: the nested Dict[str, Any] is kept
# intact as the single type argument.
type(List[Dict[str, Any]]), List[Dict[str, Any]].__origin__, List[Dict[str, Any]].__args__
(typing._GenericAlias, list, (typing.Dict[str, typing.Any],))
# get_origin/get_args also unwrap just one level of a nested generic.
get_origin(List[Dict[str, Any]]), get_args(List[Dict[str, Any]])
(list, (typing.Dict[str, typing.Any],))

Note that isinstance(get_origin(List[str]), list) is False because,

  1. get_origin(List[str]) returns the origin of the List type, which is list.

  2. isinstance(list, list) is actually False because list is a type, not an instance of list.

# The origin is the class `list` itself, not a list instance, so both
# isinstance(..., list) checks evaluate to False.
get_origin(List[Dict[str, Any]]), type(List[Dict[str, Any]]), isinstance(get_origin(List[Dict[str, Any]]), list), isinstance(list, list)
(list, typing._GenericAlias, False, False)
# A string type argument is stored as a ForwardRef instead of a resolved type.
get_origin(List["object"]), get_args(List["object"])
(list, (ForwardRef('object'),))

From Yaml#

Currently we define our schema in Python. This is very convenient because we can use real types such as List[str] and Dict[str, int], but in a YAML-based config these become plain strings that have to be converted back into types.

from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import yaml


def _split_top_level_args(args: str) -> List[str]:
    """Split *args* on commas that are not nested inside brackets or quotes.

    ``"str, Dict[str, int]"`` yields ``["str", "Dict[str, int]"]``. Escaped
    quote characters inside string literals are not supported.
    """
    parts: List[str] = []
    depth = 0  # current nesting level of square brackets
    quote = ""  # active quote character while inside a string literal
    start = 0
    for index, char in enumerate(args):
        if quote:
            if char == quote:
                quote = ""
        elif char in "'\"":
            quote = char
        elif char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
        elif char == "," and depth == 0:
            parts.append(args[start:index])
            start = index + 1
    parts.append(args[start:])
    return [part.strip() for part in parts]


def str_to_type(type_str: str) -> Any:
    """Convert a textual type annotation into the corresponding typing object.

    Supported spellings are the basic names ``str``, ``int``, ``float``,
    ``bool``, ``Any``, ``None``/``null`` and ``object`` (mapped to ``dict``),
    plus ``List[...]``, ``Dict[..., ...]``, ``Tuple[...]``, ``Union[...]``,
    ``Optional[...]`` and ``Literal[...]``, nested to any depth.

    Args:
        type_str: The annotation text, e.g. ``"Dict[str, List[int]]"``.
            Surrounding whitespace is ignored.

    Returns:
        The matching type or typing construct.

    Raises:
        ValueError: If the text is not a supported, well-formed annotation.
    """
    # Local import keeps this function self-contained; ``ast`` is only needed
    # for parsing Literal values.
    import ast

    type_str = type_str.strip()

    basic_types = {
        "str": str,
        "int": int,
        "float": float,
        "bool": bool,
        "Any": Any,
        "None": type(None),
        "null": type(None),
        "object": dict,
    }
    if type_str in basic_types:
        return basic_types[type_str]

    # Everything else must look like ``Name[inner]`` with a closing bracket;
    # otherwise slicing off the last character would silently corrupt input.
    type_name, bracket, remainder = type_str.partition("[")
    if not bracket or not remainder.endswith("]"):
        raise ValueError(f"Unknown type: {type_str}")
    inner_type = remainder[:-1].strip()

    if type_name == "List":
        return List[str_to_type(inner_type)]  # type: ignore[misc]
    if type_name == "Optional":
        return Optional[str_to_type(inner_type)]
    if type_name == "Literal":
        # The text may come from a config file, so parse it as literal values
        # only instead of evaluating arbitrary expressions with eval().
        try:
            literal_values = ast.literal_eval(inner_type)
        except (ValueError, SyntaxError) as err:
            raise ValueError(f"Unknown type: {type_str}") from err
        return Literal[literal_values]  # type: ignore[valid-type]
    if type_name in ("Dict", "Tuple", "Union"):
        # Split only on top-level commas so nested generics such as
        # Dict[str, Dict[str, int]] stay in one piece.
        type_args = tuple(
            str_to_type(arg) for arg in _split_top_level_args(inner_type)
        )
        if type_name == "Dict":
            if len(type_args) != 2:
                raise ValueError(
                    f"Dict expects exactly two type arguments: {type_str}"
                )
            return Dict[type_args[0], type_args[1]]  # type: ignore[misc]
        if type_name == "Tuple":
            return Tuple[type_args]  # type: ignore[misc]
        return Union[type_args]  # type: ignore[misc]

    raise ValueError(f"Unknown type: {type_str}")


def process_schema(schema: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *schema* with textual ``"type"`` entries converted.

    Each field specification (a dict value of *schema*) has its ``"type"``
    string replaced by the object returned from ``str_to_type``. Nested
    ``"properties"`` mappings, and the ``"properties"`` of a mapping-valued
    ``"items"`` entry, are processed recursively.

    The input is not modified: every field specification that is touched is
    copied first. A ``"type"`` that is not a string (for example one that was
    already converted) is left unchanged, so the function can safely be
    applied to its own output.

    Args:
        schema: Mapping of field name to field specification.

    Returns:
        A new mapping with the same keys and converted specifications.

    Raises:
        ValueError: Propagated from ``str_to_type`` for an unsupported type
            string.
    """
    processed_schema: Dict[str, Any] = {}
    for key, value in schema.items():
        if isinstance(value, dict):
            # Work on a shallow copy so the caller's schema stays untouched.
            value = dict(value)
            if isinstance(value.get("type"), str):
                value["type"] = str_to_type(value["type"])
            if "properties" in value:
                value["properties"] = process_schema(value["properties"])
            items = value.get("items")
            if isinstance(items, dict) and "properties" in items:
                # Rebuild "items" rather than editing the shared nested dict.
                value["items"] = {
                    **items,
                    "properties": process_schema(items["properties"]),
                }
        processed_schema[key] = value
    return processed_schema

# Load the YAML schema; at this point every "type" is still a plain string
# (see the first printed dict below).
with open("./assets/schema.yaml", "r") as file:
    yaml_schema = yaml.safe_load(file)

pprint(yaml_schema)
# Convert the type strings into real Python/typing objects.
processed_schema = process_schema(yaml_schema)
pprint(processed_schema)

# Build the model from the converted schema and instantiate it.
User = generate_dynamic_model(processed_schema, "User")
user = User(
    name="John Doe",
    age=30,
    address={"street": "123 Main St", "city": "New York", "postal_code": 10001},
    phone_numbers=["123-456-7890", "098-765-4321"],
    tags=[{"name": "tag1", "value": 1}, {"name": "tag2", "value": 2}],
    # "species" is not declared in the pet schema; it does not appear in the
    # printed Pet instance below.
    pet={"name": "Buddy", "age": 5, "species": "Dog"},
)
pprint(user)
{
'name': {'type': 'str', 'description': "The user's full name"},
'age': {'type': 'int', 'optional': False, 'default': None, 'description': "The user's age in years"},
'address': {
│   │   'type': 'object',
│   │   'properties': {
│   │   │   'street': {'type': 'str', 'description': 'Street name and number'},
│   │   │   'city': {'type': 'str', 'description': 'City name'},
│   │   │   'postal_code': {'type': 'int', 'description': 'Postal code or ZIP code'}
│   │   },
│   │   'description': "The user's mailing address",
│   │   'optional': True
},
'phone_numbers': {'type': 'List[str]', 'description': "List of user's phone numbers"},
'tags': {
│   │   'type': 'List[object]',
│   │   'items': {
│   │   │   'properties': {
│   │   │   │   'name': {'type': 'str', 'description': 'Tag name'},
│   │   │   │   'value': {'type': 'int', 'description': 'Tag value'}
│   │   │   }
│   │   },
│   │   'description': 'List of user tags'
},
'pet': {
│   │   'type': 'object',
│   │   'properties': {
│   │   │   'name': {'type': 'str', 'description': 'Pet name'},
│   │   │   'age': {'type': 'int', 'description': 'Pet age in years'}
│   │   },
│   │   'description': "User's pet"
}
}
{
'name': {'type': <class 'str'>, 'description': "The user's full name"},
'age': {'type': <class 'int'>, 'optional': False, 'default': None, 'description': "The user's age in years"},
'address': {
│   │   'type': <class 'dict'>,
│   │   'properties': {
│   │   │   'street': {'type': <class 'str'>, 'description': 'Street name and number'},
│   │   │   'city': {'type': <class 'str'>, 'description': 'City name'},
│   │   │   'postal_code': {'type': <class 'int'>, 'description': 'Postal code or ZIP code'}
│   │   },
│   │   'description': "The user's mailing address",
│   │   'optional': True
},
'phone_numbers': {'type': typing.List[str], 'description': "List of user's phone numbers"},
'tags': {
│   │   'type': typing.List[dict],
│   │   'items': {
│   │   │   'properties': {
│   │   │   │   'name': {'type': <class 'str'>, 'description': 'Tag name'},
│   │   │   │   'value': {'type': <class 'int'>, 'description': 'Tag value'}
│   │   │   }
│   │   },
│   │   'description': 'List of user tags'
},
'pet': {
│   │   'type': <class 'dict'>,
│   │   'properties': {
│   │   │   'name': {'type': <class 'str'>, 'description': 'Pet name'},
│   │   │   'age': {'type': <class 'int'>, 'description': 'Pet age in years'}
│   │   },
│   │   'description': "User's pet"
}
}
User(
name='John Doe',
age=30,
address=Address(street='123 Main St', city='New York', postal_code=10001),
phone_numbers=['123-456-7890', '098-765-4321'],
tags=[Tags(name='tag1', value=1), Tags(name='tag2', value=2)],
pet=Pet(name='Buddy', age=5)
)