# Build a small in-memory DataFrame with one row per (color, fruit, v1, v2).
df = spark.createDataFrame(
    [
        ['red', 'banana', 1, 10],
        ['blue', 'banana', 2, 20],
        ['red', 'carrot', 3, 30],
        ['blue', 'grape', 4, 40],
        ['red', 'carrot', 5, 50],
        ['black', 'carrot', 6, 60],
        ['red', 'banana', 7, 70],
        ['red', 'grape', 8, 80],
    ],
    schema=['color', 'fruit', 'v1', 'v2'],
)
df.show()

# Aggregate while `df` is still the color/fruit frame built above. Called
# with no column names, avg() averages every numeric column (here v1 and v2)
# within each color group.
df.groupby('color').avg().show()

# Rebind `df` to the zipcodes data only AFTER the aggregation. Previously this
# read came first, so groupby('color') ran against the zipcodes frame instead
# of the frame that is known to carry a `color` column.
# NOTE: spark.read.json expects JSON Lines by default (one complete JSON
# object per line); a single pretty-printed JSON document needs
# .option("multiLine", True).
df = spark.read.json("resources/zipcodes.json")