# Source: Sudheer Keshav Bhat · October 17, 2021
# --- Reading data ---------------------------------------------------------
# Load a JSON file into a Spark DataFrame (assumes an active SparkSession
# bound to `spark`, as in spark-shell / Databricks notebooks).
j_df = spark.read.json("path_to/params.json")

# Load a single parquet file.
p_df = spark.read.parquet("path_to/data-0000.parquet")

# Load several parquet paths at once (globs allowed). "basePath" tells Spark
# where partition discovery starts so partition columns are still inferred
# when the given paths point below the base directory.
# Fix: the original called `spar.read` — `spar` is undefined; must be `spark`.
p_df = spark.read.option("basePath", "~/files").parquet(
    "~/files/folder1/*.parquet",
    "~/files/folder2/file1.parquet",
)

# Collect the (small) JSON DataFrame to the driver as a pandas DataFrame.
# NOTE(review): only safe when the data fits in driver memory.
pd_df = j_df.toPandas()
# --- Transformations ------------------------------------------------------
# Keep only rows for the HR department (SQL-expression form of filter).
df = p_df.where("department=='HR'")

# Further restrict to specific names; Column.isin accepts a list of values.
df = df.filter(df.name.isin(["Sudheer", "Sundeep"]))

# Spark DataFrames are immutable: drop()/withColumn() return NEW DataFrames.
# Fix: the originals discarded the return values, so these lines were no-ops.
# (Also removed a stray trailing comma in the drop() call.)
p_df = p_df.drop("temp_timestamp")

# Derive a tenure column from the two timestamp columns.
df = df.withColumn("User Since", df.updated - df.created)

# Row counts per department, ordered for a stable display.
# Fix: bound to a name — unassigned, the lazy expression did nothing at all.
dept_counts = p_df.groupby("department").count().orderBy("department")
# --- Writing data ---------------------------------------------------------
# Write the DataFrame as parquet: one output file per partition of p_df.
p_df.write.parquet("path_to_save_folder/")

# with coalesce: cap the output at 100 files by shrinking the partition
# count first (coalesce avoids a full shuffle, unlike repartition).
# Fix: dropped the bogus header="true" option — "header" is a CSV writer
# option and is silently ignored by the parquet format.
p_df.coalesce(100).write.format("parquet").save("path_to_save_folder/")

# append, not overwrite: add new files alongside any existing output
# instead of failing (the default mode errors if the path exists).
p_df.write.mode("append").parquet("save_path")

# save across partitions: one sub-directory per distinct department value
# (save_path/department=HR/..., save_path/department=IT/..., etc.).
p_df.write.partitionBy("department").parquet("save_path")