Generate PySpark Schema dynamically in Python from JSON Sample

If you need to generate a PySpark schema from JSON, you can always use my tool here

but if you need to do it in Python, then here is the code snippet for it. It takes a Python dictionary as input and generates the PySpark schema.

import json
from pyspark.sql.types import *

def GeneratePySparkSchema(json):
    """Build a PySpark ``StructType`` describing a sample dictionary.

    Each key of ``json`` becomes one nullable ``StructField``:

    * a nested dict becomes a nested ``StructType`` (built recursively);
    * a list becomes an ``ArrayType`` whose element type is inferred from
      the first element only; an empty list falls back to an array of
      strings because there is no element to inspect;
    * any other value is mapped by ``GetSparkDataType``.

    Args:
        json: A dictionary, e.g. the result of ``json.loads`` on a JSON
            object. (The parameter name shadows the ``json`` module inside
            this function; it is kept for backward compatibility.)

    Returns:
        A ``StructType`` with one field per key, in iteration order.
    """
    fields = []
    for key, value in json.items():
        if isinstance(value, dict):
            data_type = GeneratePySparkSchema(value)
        elif isinstance(value, list):
            if not value:
                # Nothing to infer from, so default to an array of strings.
                data_type = ArrayType(StringType())
            elif isinstance(value[0], dict):
                data_type = ArrayType(GeneratePySparkSchema(value[0]), True)
            else:
                data_type = ArrayType(GetSparkDataType(value[0]), True)
        else:
            data_type = GetSparkDataType(value)
        # Every field is declared nullable, since one sample cannot prove
        # that a value is always present.
        fields.append(StructField(key, data_type, True))
    return StructType(fields)

def GetSparkDataType(value):
    """Map a scalar Python value to a PySpark data type instance.

    Args:
        value: The sample value whose type should be inferred.

    Returns:
        ``StringType`` for ``str``, ``BooleanType`` for ``bool``,
        ``IntegerType`` for ``int``, ``DoubleType`` for ``float``, and
        ``StringType`` as the fallback for anything else (e.g. ``None``).
    """
    if isinstance(value, str):
        return StringType()
    # bool must be tested before int: bool is a subclass of int in Python.
    elif isinstance(value, bool):
        return BooleanType()
    elif isinstance(value, int):
        return IntegerType()
    elif isinstance(value, float):
        return DoubleType()
    else:
        # Unknown or null values are treated as strings.
        return StringType()

# Example: a sample JSON document with a nested object.
json_string = """
{
    "id": 21,
    "name": "PREETish",
    "project": "PySpark Schema Generator",
    "details": {
        "link": "",
        "developer": "Pritish Ranjan"
    }
}
"""

# Parse the sample into a dictionary and derive the schema from it.
json_object = json.loads(json_string)
schema = GeneratePySparkSchema(json_object)


Thanks for reading!!


