Generate PySpark Schema dynamically in Python from JSON Sample

 Hi Folks,

If you need to generate a PySpark schema from JSON, you can always use my tool here,

but if you need to do it in Python, then here is the code snippet for it. It takes a Python dictionary as input and generates the PySpark schema.

import json
from pyspark.sql.types import *

def GeneratePySparkSchema(json):
    """Build a PySpark ``StructType`` describing a sample JSON object.

    Each key of the dictionary becomes a nullable ``StructField``:

    * a nested dictionary becomes a nested ``StructType`` (recursively);
    * a list becomes an ``ArrayType`` whose element type is inferred from
      the first element only (an empty list falls back to strings);
    * anything else is mapped by ``GetSparkDataType``.

    Args:
        json: A dictionary as produced by ``json.loads`` for a JSON object.
            The parameter name shadows the ``json`` module inside this
            function; the module is not used here.

    Returns:
        A ``StructType`` with one field per key, in the dictionary's
        iteration order.
    """
    fields = []
    for key, value in json.items():
        if isinstance(value, dict):
            data_type = GeneratePySparkSchema(value)
        elif isinstance(value, list):
            if not value:
                # Nothing to infer the element type from; default to strings.
                data_type = ArrayType(StringType())
            elif isinstance(value[0], dict):
                data_type = ArrayType(GeneratePySparkSchema(value[0]), True)
            else:
                data_type = ArrayType(GetSparkDataType(value[0]), True)
        else:
            data_type = GetSparkDataType(value)
        # Every field is declared nullable, since a single sample cannot
        # prove that a value is always present.
        fields.append(StructField(key, data_type, True))
    return StructType(fields)

def GetSparkDataType(value):
    """Map a scalar Python value to the corresponding PySpark data type.

    Args:
        value: A value taken from a decoded JSON document.

    Returns:
        ``StringType`` for ``str``, ``BooleanType`` for ``bool``,
        ``IntegerType`` for ``int``, ``DoubleType`` for ``float``, and
        ``StringType`` as the fallback for anything else (for example
        ``None``).
    """
    if isinstance(value, str):
        return StringType()
    # bool must be tested before int: bool is a subclass of int in Python,
    # so True/False would otherwise be reported as integers.
    if isinstance(value, bool):
        return BooleanType()
    if isinstance(value, int):
        return IntegerType()
    if isinstance(value, float):
        return DoubleType()
    # Fallback so the caller always receives a valid Spark data type.
    return StringType()

# Sample JSON document used to demonstrate the schema generator.
json_string = """
{
    "id": 21,
    "name": "PREETish",
    "project": "PySpark Schema Generator",
    "details": {
        "link": "",
        "developer": "Pritish Ranjan"
    }
}
"""

# Decode the sample into a dictionary and derive its PySpark schema.
json_object = json.loads(json_string)
schema = GeneratePySparkSchema(json_object)


Thanks for reading!!


Popular posts from this blog

Use SCSS with ASP.NET Core 5.x or 3.X

Building a Login Flow with .NET MAUI

PySpark Schema Generator - A simple tool to generate PySpark schema from JSON data