Generate PySpark Schema dynamically in Python from JSON Sample
Hi Folks,
If you need to generate a PySpark schema from JSON, you can always use my tool here: https://preetranjan.github.io/pyspark-schema-generator/
But if you need to do it in Python, here is a code snippet for it. It takes a Python dictionary as input and generates the PySpark schema.
import json
from pyspark.sql.types import *
def _infer_spark_type(value):
    """Return the PySpark data type describing a single parsed JSON value.

    Dictionaries become nested structs, lists become arrays whose element
    type is inferred from the first element only, and everything else is
    delegated to ``GetSparkDataType``.
    """
    if isinstance(value, dict):
        return GeneratePySparkSchema(value)
    if isinstance(value, list):
        if not value:
            # Nothing to sample the element type from; default to strings.
            return ArrayType(StringType())
        # Recurse so that nested lists become ArrayType(ArrayType(...))
        # instead of falling through to the scalar string fallback.
        return ArrayType(_infer_spark_type(value[0]), True)
    return GetSparkDataType(value)


def GeneratePySparkSchema(json):
    """Build a PySpark ``StructType`` from a sample dictionary.

    Args:
        json: A dictionary (for example the result of ``json.loads``) whose
            keys become field names and whose values are used as samples
            to infer each field's type. Note that this parameter name
            shadows the ``json`` module inside this function; it is kept
            for backward compatibility with keyword callers.

    Returns:
        A ``StructType`` with one nullable ``StructField`` per key, in the
        dictionary's iteration order.

    Array element types are inferred from the first element only, and an
    empty list is typed as an array of strings.
    """
    return StructType(
        [
            StructField(key, _infer_spark_type(value), True)
            for key, value in json.items()
        ]
    )
def GetSparkDataType(value):
    """Map a scalar Python value to the matching PySpark data type.

    Args:
        value: A scalar sample value (str, bool, int, float or anything
            else, including ``None``).

    Returns:
        ``StringType`` for strings, ``BooleanType`` for booleans,
        ``IntegerType`` for integers that fit in a signed 32-bit range,
        ``LongType`` for larger integers, ``DoubleType`` for floats, and
        ``StringType`` as the fallback for any other value.
    """
    if isinstance(value, str):
        return StringType()
    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return BooleanType()
    if isinstance(value, int):
        # IntegerType is a 4-byte signed integer; values outside that range
        # (e.g. 10-digit phone numbers) need the 8-byte LongType.
        if -2147483648 <= value <= 2147483647:
            return IntegerType()
        return LongType()
    if isinstance(value, float):
        return DoubleType()
    # Unknown or missing values (e.g. None) default to strings.
    return StringType()
# Sample JSON document used to demonstrate the schema generator. It covers
# scalar fields, a nested object ("details") and an array ("mobiles").
# Note: the "mobiles" values are larger than a signed 32-bit integer.
json_string = """
{
"id": 21,
"name": "PREETish",
"project": "PySpark Schema Generator",
"details": {
"link": "https://preetranjan.github.io/pyspark-schema-generator/",
"developer": "Pritish Ranjan"
},
"mobiles":[7878787878,5656565656]
}
"""
# Parse the JSON text into a Python dictionary.
json_object = json.loads(json_string)
# Infer the PySpark schema from the parsed sample and print it.
schema = GeneratePySparkSchema(json_object)
print(schema)
Thanks for reading!!
Comments
Post a Comment