How to convert String to JSON in Spark SQL?

I was working with the Delta logs of a Delta table whose data is stored in Azure Blob Storage.
I used the query below to fetch the JSON data of the Delta log:
SELECT * FROM json.`/mnt/blob/deltaTables/employees/_delta_log/00000000000000000000.json`
I was able to fetch the JSON data as a table in a Databricks notebook (screenshot below):
Another screenshot, for add.stats:
Now I can traverse through the JSON data in the query itself:
SELECT add.stats FROM json.`/mnt/blob/deltaTables/employees/_delta_log/00000000000000000000.json`
But the value of stats is itself a JSON-encoded string, so I am not able to traverse any further into it.
I am attaching the JSON file here for reference:
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"d21b496a-7282-49c9-a71c-3013d780fbeb","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"Id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"Name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"city\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"age\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"department\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"degree\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"workingLocation\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"maxEducation\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"experience\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"jobRole\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1664788829119}}
{"add":{"path":"part-00000-91e882d3-f9bc-481f-ba50-a7d061040401-c000.snappy.parquet","partitionValues":{},"size":6337133,"modificationTime":1664788872000,"dataChange":true,"stats":"{\"numRecords\":136251,\"minValues\":{\"Id\":1,\"Name\":\"Robin\",\"city\":\"Jhunjhunu\",\"age\":22,\"department\":\"Data Integration\",\"degree\":\"bsc\",\"workingLocation\":\"Jaipur\",\"maxEducation\":\"Graduation\",\"experience\":2,\"jobRole\":\"Data Engineer\"},\"maxValues\":{\"Id\":136251,\"Name\":\"Robin99999\",\"city\":\"Jhunjhunu99999\",\"age\":136272,\"department\":\"Data Integration99999\",\"degree\":\"bsc99999\",\"workingLocation\":\"Jaipur99999\",\"maxEducation\":\"Graduation99999\",\"experience\":136252,\"jobRole\":\"Data Engineer99999\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}","tags":{"INSERTION_TIME":"1664788862000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"add":{"path":"part-00001-25af3704-bc8d-4201-bbe8-6e6b07864e40-c000.snappy.parquet","partitionValues":{},"size":5969233,"modificationTime":1664788870000,"dataChange":true,"stats":"{\"numRecords\":127898,\"minValues\":{\"Id\":136252,\"Name\":\"Robin136251\",\"city\":\"Jhunjhunu136251\",\"age\":136273,\"department\":\"Data Integration136251\",\"degree\":\"bsc136251\",\"workingLocation\":\"Jaipur136251\",\"maxEducation\":\"Graduation136251\",\"experience\":136253,\"jobRole\":\"Data Engineer136251\"},\"maxValues\":{\"Id\":264149,\"Name\":\"Robin264148\",\"city\":\"Jhunjhunu264148\",\"age\":264170,\"department\":\"Data Integration264148\",\"degree\":\"bsc264148\",\"workingLocation\":\"Jaipur264148\",\"maxEducation\":\"Graduation264148\",\"experience\":264150,\"jobRole\":\"Data Engineer264148\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}","tags":{"INSERTION_TIME":"1664788862000001","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"add":{"path":"part-00002-0fbaae5e-371e-45ea-b2ce-b959248ba88a-c000.snappy.parquet","partitionValues":{},"size":5961349,"modificationTime":1664788871000,"dataChange":true,"stats":"{\"numRecords\":127898,\"minValues\":{\"Id\":264150,\"Name\":\"Robin264149\",\"city\":\"Jhunjhunu264149\",\"age\":264171,\"department\":\"Data Integration264149\",\"degree\":\"bsc264149\",\"workingLocation\":\"Jaipur264149\",\"maxEducation\":\"Graduation264149\",\"experience\":264151,\"jobRole\":\"Data Engineer264149\"},\"maxValues\":{\"Id\":392047,\"Name\":\"Robin392046\",\"city\":\"Jhunjhunu392046\",\"age\":392068,\"department\":\"Data Integration392046\",\"degree\":\"bsc392046\",\"workingLocation\":\"Jaipur392046\",\"maxEducation\":\"Graduation392046\",\"experience\":392048,\"jobRole\":\"Data Engineer392046\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}","tags":{"INSERTION_TIME":"1664788862000002","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"add":{"path":"part-00003-1c8ac662-7680-42b8-8ffb-eef96ad37085-c000.snappy.parquet","partitionValues":{},"size":5961263,"modificationTime":1664788862000,"dataChange":true,"stats":"{\"numRecords\":127898,\"minValues\":{\"Id\":392048,\"Name\":\"Robin392047\",\"city\":\"Jhunjhunu392047\",\"age\":392069,\"department\":\"Data Integration392047\",\"degree\":\"bsc392047\",\"workingLocation\":\"Jaipur392047\",\"maxEducation\":\"Graduation392047\",\"experience\":392049,\"jobRole\":\"Data Engineer392047\"},\"maxValues\":{\"Id\":519945,\"Name\":\"Robin519944\",\"city\":\"Jhunjhunu519944\",\"age\":519966,\"department\":\"Data Integration519944\",\"degree\":\"bsc519944\",\"workingLocation\":\"Jaipur519944\",\"maxEducation\":\"Graduation519944\",\"experience\":519946,\"jobRole\":\"Data Engineer519944\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}","tags":{"INSERTION_TIME":"1664788862000003","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"add":{"path":"part-00004-5c355575-1751-464f-93a5-672ae1f60c29-c000.snappy.parquet","partitionValues":{},"size":5990168,"modificationTime":1664788867000,"dataChange":true,"stats":"{\"numRecords\":127898,\"minValues\":{\"Id\":519946,\"Name\":\"Robin519945\",\"city\":\"Jhunjhunu519945\",\"age\":519967,\"department\":\"Data Integration519945\",\"degree\":\"bsc519945\",\"workingLocation\":\"Jaipur519945\",\"maxEducation\":\"Graduation519945\",\"experience\":519947,\"jobRole\":\"Data Engineer519945\"},\"maxValues\":{\"Id\":647843,\"Name\":\"Robin647842\",\"city\":\"Jhunjhunu647842\",\"age\":647864,\"department\":\"Data Integration647842\",\"degree\":\"bsc647842\",\"workingLocation\":\"Jaipur647842\",\"maxEducation\":\"Graduation647842\",\"experience\":647844,\"jobRole\":\"Data Engineer647842\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}","tags":{"INSERTION_TIME":"1664788862000004","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"add":{"path":"part-00005-fac91d9b-9528-4381-8983-419261e7b6ba-c000.snappy.parquet","partitionValues":{},"size":5962518,"modificationTime":1664788865000,"dataChange":true,"stats":"{\"numRecords\":127898,\"minValues\":{\"Id\":647844,\"Name\":\"Robin647843\",\"city\":\"Jhunjhunu647843\",\"age\":647865,\"department\":\"Data Integration647843\",\"degree\":\"bsc647843\",\"workingLocation\":\"Jaipur647843\",\"maxEducation\":\"Graduation647843\",\"experience\":647845,\"jobRole\":\"Data Engineer647843\"},\"maxValues\":{\"Id\":775741,\"Name\":\"Robin775740\",\"city\":\"Jhunjhunu775740\",\"age\":775762,\"department\":\"Data Integration775740\",\"degree\":\"bsc775740\",\"workingLocation\":\"Jaipur775740\",\"maxEducation\":\"Graduation775740\",\"experience\":775742,\"jobRole\":\"Data Engineer775740\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}","tags":{"INSERTION_TIME":"1664788862000005","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"add":{"path":"part-00006-41366707-d8cc-1 QQQQQQQAQQQQQQQQQQ46d2-b781-4d9bcc5b5210-c000.snappy.parquet","partitionValues":{},"size":5967367,"modificationTime":1664788866000,"dataChange":true,"stats":"{\"numRecords\":127898,\"minValues\":{\"Id\":775742,\"Name\":\"Robin775741\",\"city\":\"Jhunjhunu775741\",\"age\":775763,\"department\":\"Data Integration775741\",\"degree\":\"bsc775741\",\"workingLocation\":\"Jaipur775741\",\"maxEducation\":\"Graduation775741\",\"experience\":775743,\"jobRole\":\"Data Engineer775741\"},\"maxValues\":{\"Id\":903639,\"Name\":\"Robin903638\",\"city\":\"Jhunjhunu903638\",\"age\":903660,\"department\":\"Data Integration903638\",\"degree\":\"bsc903638\",\"workingLocation\":\"Jaipur903638\",\"maxEducation\":\"Graduation903638\",\"experience\":903640,\"jobRole\":\"Data Engineer903638\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}","tags":{"INSERTION_TIME":"1664788862000006","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"add":{"path":"part-00007-8c0a3670-cb24-4cdc-ac37-a9f173c7fe71-c000.snappy.parquet","partitionValues":{},"size":4498552,"modificationTime":1664788864000,"dataChange":true,"stats":"{\"numRecords\":96361,\"minValues\":{\"Id\":903640,\"Name\":\"Robin903639\",\"city\":\"Jhunjhunu903639\",\"age\":903661,\"department\":\"Data Integration903639\",\"degree\":\"bsc903639\",\"workingLocation\":\"Jaipur903639\",\"maxEducation\":\"Graduation903639\",\"experience\":903641,\"jobRole\":\"Data Engineer903639\"},\"maxValues\":{\"Id\":1000000,\"Name\":\"Robin999999\",\"city\":\"Jhunjhunu999999\",\"age\":1000021,\"department\":\"Data Integration999999\",\"degree\":\"bsc999999\",\"workingLocation\":\"Jaipur999999\",\"maxEducation\":\"Graduation999999\",\"experience\":1000001,\"jobRole\":\"Data Engineer999999\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}","tags":{"INSERTION_TIME":"1664788862000007","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"commitInfo":{"timestamp":1664788877620,"userId":"2102279527814428","userName":"robin30121999#gmail.com","operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"notebook":{"notebookId":"2204946593481188"},"clusterId":"1003-090126-rztrsnm4","isolationLevel":"WriteSerializable","isBlindAppend":false,"operationMetrics":{"numFiles":"8","numOutputRows":"1000000","numOutputBytes":"46647583"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"17e89e4f-380f-4888-9aaf-2ada42e13b8c"}}
Ultimately, I want to fetch the maximum value of experience using SQL.

Starting with an example "stats" column of type string like the one you have:
from pyspark.sql import functions as F
df = spark.createDataFrame(
[("{\"numRecords\":136251,\"minValues\":{\"Id\":1,\"Name\":\"Robin\",\"city\":\"Jhunjhunu\",\"age\":22,\"department\":\"Data Integration\",\"degree\":\"bsc\",\"workingLocation\":\"Jaipur\",\"maxEducation\":\"Graduation\",\"experience\":2,\"jobRole\":\"Data Engineer\"},\"maxValues\":{\"Id\":136251,\"Name\":\"Robin99999\",\"city\":\"Jhunjhunu99999\",\"age\":136272,\"department\":\"Data Integration99999\",\"degree\":\"bsc99999\",\"workingLocation\":\"Jaipur99999\",\"maxEducation\":\"Graduation99999\",\"experience\":136252,\"jobRole\":\"Data Engineer99999\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}",),
("{\"numRecords\":127898,\"minValues\":{\"Id\":136252,\"Name\":\"Robin136251\",\"city\":\"Jhunjhunu136251\",\"age\":136273,\"department\":\"Data Integration136251\",\"degree\":\"bsc136251\",\"workingLocation\":\"Jaipur136251\",\"maxEducation\":\"Graduation136251\",\"experience\":136253,\"jobRole\":\"Data Engineer136251\"},\"maxValues\":{\"Id\":264149,\"Name\":\"Robin264148\",\"city\":\"Jhunjhunu264148\",\"age\":264170,\"department\":\"Data Integration264148\",\"degree\":\"bsc264148\",\"workingLocation\":\"Jaipur264148\",\"maxEducation\":\"Graduation264148\",\"experience\":264150,\"jobRole\":\"Data Engineer264148\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}",),
("{\"numRecords\":127898,\"minValues\":{\"Id\":264150,\"Name\":\"Robin264149\",\"city\":\"Jhunjhunu264149\",\"age\":264171,\"department\":\"Data Integration264149\",\"degree\":\"bsc264149\",\"workingLocation\":\"Jaipur264149\",\"maxEducation\":\"Graduation264149\",\"experience\":264151,\"jobRole\":\"Data Engineer264149\"},\"maxValues\":{\"Id\":392047,\"Name\":\"Robin392046\",\"city\":\"Jhunjhunu392046\",\"age\":392068,\"department\":\"Data Integration392046\",\"degree\":\"bsc392046\",\"workingLocation\":\"Jaipur392046\",\"maxEducation\":\"Graduation392046\",\"experience\":392048,\"jobRole\":\"Data Engineer392046\"},\"nullCount\":{\"Id\":0,\"Name\":0,\"city\":0,\"age\":0,\"department\":0,\"degree\":0,\"workingLocation\":0,\"maxEducation\":0,\"experience\":0,\"jobRole\":0}}",)],
["stats"])
You can use from_json, providing a schema that describes the path to the object you need ("experience"), to extract that object together with the structure leading to it. Then you can strip the surrounding structure by selecting the full struct path (F.col("exp.maxValues.experience")).
df = df.withColumn("exp", F.from_json("stats", "maxValues struct<experience:long>"))
df = df.withColumn("exp", F.col("exp.maxValues.experience"))
df.show()
# +--------------------+------+
# | stats| exp|
# +--------------------+------+
# |{"numRecords":136...|136252|
# |{"numRecords":127...|264150|
# |{"numRecords":127...|392048|
# +--------------------+------+
In SQL you can do it like this; register the DataFrame as a temporary view first so it is visible to SQL:
df.createOrReplaceTempView("df")
spark.sql("""
SELECT from_json(stats, 'maxValues struct<experience:long>').maxValues.experience AS exp
FROM df
""").show()
# +------+
# | exp|
# +------+
# |136252|
# |264150|
# |392048|
# +------+
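If the end goal is just the single maximum value of experience across all files, you can aggregate directly over the Delta log path from the question. A minimal sketch, reusing the mounted path and the stats schema shown above (rows without an add action are filtered out; MAX would ignore their NULLs anyway):
spark.sql("""
SELECT MAX(from_json(add.stats, 'maxValues struct<experience:long>').maxValues.experience) AS max_experience
FROM json.`/mnt/blob/deltaTables/employees/_delta_log/00000000000000000000.json`
WHERE add IS NOT NULL
""").show()
# Given the stats in the log above, this should return 1000001.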

Related

How do I replace column after encrypting it by using Spark (PySpark)?

I have a question about replacing personal information with encrypted data using Spark.
Let's say, for example, I have a table like:
std_name | phone_number
---------+--------------
John     | 585-1243-2156
Susan    | 585-4567-2156
I want to change phone_number to an encrypted form like:
std_name | phone_number
---------+--------------
John     | avawehna'vqqa
Susan    | vabdsvwegq'qb
I have tried using withColumn with udf, but it does not work well.
Can someone help me out?
You haven't provided your encryption function, but I will assume something simple went wrong. If you create a UDF, it is run separately for every row, so you can use plain Python inside it.
from pyspark.sql import functions as F
df = spark.createDataFrame(
    [('John', '585-1243-2156'),
     ('Susan', '585-4567-2156')],
    ['std_name', 'phone_number']
)

@F.udf
def encrypting(data):
    # Encrypting logic (placeholder):
    encrypted_data = 'xyz' + data[::-1].replace('-', 'w')
    return encrypted_data

df = df.withColumn('phone_number', encrypting('phone_number'))
df.show()
# +--------+----------------+
# |std_name| phone_number|
# +--------+----------------+
# | John|xyz6512w3421w585|
# | Susan|xyz6512w7654w585|
# +--------+----------------+
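If you need real, reversible encryption rather than the placeholder logic above, the same UDF pattern works with an actual cipher. A minimal sketch, assuming the third-party cryptography package is installed on the cluster (the Fernet key handling and column name here are illustrative only):
from pyspark.sql import functions as F
from cryptography.fernet import Fernet

# Generate a key once on the driver; in practice, load it from a secret store.
key = Fernet.generate_key()

@F.udf
def encrypt_phone(data):
    # Build the cipher inside the UDF so only the key bytes are shipped to executors.
    return Fernet(key).encrypt(data.encode()).decode()

df = df.withColumn('phone_number', encrypt_phone('phone_number'))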

PySpark: Transform values of given column in the DataFrame

I am new to PySpark and Spark in general.
I would like to apply a transformation to a given column in the DataFrame, essentially calling a function for each value in that specific column.
I have my DataFrame df that looks like this:
df.show()
+------------+--------------------+
|version | body |
+------------+--------------------+
| 1|9gIAAAASAQAEAAAAA...|
| 2|2gIAAAASAQAEAAAAA...|
| 3|3gIAAAASAQAEAAAAA...|
| 1|7gIAKAASAQAEAAAAA...|
+------------+--------------------+
I need to read the value of the body column for each row where the version is 1 and then decrypt it (I have my own logic/function which takes a string and returns a decrypted string). Finally, I need to write the decrypted values in CSV format to an S3 bucket.
def decrypt(encrypted_string: str):
    # code that returns the decrypted string
So, when I do the following, I get the filtered values to which I need to apply my decrypt function.
df.where(col('version') =='1')\
.select(col('body')).show()
+--------------------+
| body|
+--------------------+
|9gIAAAASAQAEAAAAA...|
|7gIAKAASAQAEAAAAA...|
+--------------------+
However, I am not clear on how to do that. I tried using collect(), but that defeats the purpose of using Spark.
I also tried using .rdd.map as follows, but that did not work:
df.where(col('version') =='1')\
.select(col('body'))\
.rdd.map(lambda x: decrypt).toDF().show()
OR
.rdd.map(decrypt).toDF().show()
Could someone please help with this?
Please try:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

decrypt_udf = udf(decrypt, StringType())
df.where(col('version') == '1').withColumn('body', decrypt_udf('body'))
Got some clue from this post: Pyspark DataFrame UDF on Text Column.
Looks like I can simply get it with the following. I was doing it without a udf earlier, which is why it wasn't working.
dummy_function_udf = udf(decrypt, StringType())
df.where(col('version') == '1')\
.select(col('body')) \
.withColumn('decryptedBody', dummy_function_udf('body')) \
.show()
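To cover the last part of the question (writing the decrypted values to S3 as CSV), a minimal sketch that builds on the code above; the bucket path is hypothetical, and you may need the s3a:// scheme depending on your setup:
df.where(col('version') == '1') \
    .select(col('body')) \
    .withColumn('decryptedBody', dummy_function_udf('body')) \
    .select('decryptedBody') \
    .write.mode('overwrite') \
    .csv('s3://my-bucket/decrypted-bodies/')  # hypothetical bucket/prefix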

How to convert pyspark dataframe to JSON?

I have a PySpark dataframe and I want to convert it into a list containing JSON objects.
For that I have done the following:
df.toJSON().collect()
But this operation sends the data to the driver, which is costly and takes too much time, and my dataframe contains millions of records. So is there another way to do it, without the collect() operation, that is better optimized than collect()?
Below is my dataframe df:
product | cost
--------+-----
pen     | 10
book    | 40
bottle  | 80
glass   | 55
and the output should look like below:
df2 = [{product:'pen',cost:10},{product:'book',cost:40},{product:'bottle',cost:80},{product:'glass',cost:55}]
When I print the datatype of df2 it should be list.
If you want to create a JSON object inside the dataframe, use the collect_list + create_map + to_json functions.
Or, to write the data out to a file as a JSON document, don't use to_json; use .write.json() instead.
Create JSON object:
from pyspark.sql.functions import collect_list, create_map, lit
df.agg(collect_list(create_map(lit("product"),"product",lit("cost"),"cost")).alias("stru")).\
selectExpr("to_json(stru) as json").\
show(10,False)
#+-------------------------------------------------------------------------------------------------------------------------------+
#|json |
#+-------------------------------------------------------------------------------------------------------------------------------+
#|[{"product":"pen","cost":"10"},{"product":"book","cost":"40"},{"product":"bottle","cost":"80"},{"product":"glass","cost":"55"}]|
#+-------------------------------------------------------------------------------------------------------------------------------+
# To write to HDFS, use .saveAsTextFile:
df.agg(collect_list(create_map(lit("product"),"product",lit("cost"),"cost")).alias("stru")).selectExpr("to_json(stru) as json").rdd.map(lambda x:x['json']).saveAsTextFile("<path>")
#cat part-00000
#[{"product":"pen","cost":"10"},{"product":"book","cost":"40"},{"product":"bottle","cost":"80"},{"product":"glass","cost":"55"}]
Create JSON file:
df.agg(collect_list(create_map(lit("product"),"product",lit("cost"),"cost")).alias("stru")).write.mode("overwrite").json("<path>")
#cat part-00000-3a19165e-219e-4485-adb8-ef91589d6e31-c000.json
#{"stru":[{"product":"pen","cost":"10"},{"product":"book","cost":"40"},{"product":"bottle","cost":"80"},{"product":"glass","cost":"55"}]}

Writing a dataframe to CSV where lists are converted to JSON arrays

How would a list of strings be outputted as a JSON array when writing a dataframe to CSV?
E.g. ['foo', 'bar'] should be ["foo", "bar"]
Background
I'm copying data from a PostgreSQL DB on AWS RDS to AWS Redshift. As an intermediate step, the data has to be uploaded to AWS S3 in CSV files.
But Redshift doesn't support arrays as a datatype. Arrays need to be converted to a varchar representing a JSON array.
For example, a column of type character varying(255)[] on RDS would need a column of type such as character varying(MAX) on Redshift, and use JSON functions to interact with the data.
If the data isn't loaded into Redshift as a JSON array, it won't be valid.
json_arrays | is_valid_json_array
------------------------------+---------------------
[] | T
["a","b"] | T
["a",["b",1,["c",2,3,null]]] | T
{"a":1} | F
a | F
{foo, bar} | F
{"one", "two"} | F
[x,y,z] | F
[1,2,] | F
['x','y','z'] | F
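For reference, the validity column above comes from Redshift's IS_VALID_JSON_ARRAY function; a hedged example, assuming the candidate strings live in a varchar column json_arrays of a table json_test:
SELECT json_arrays, is_valid_json_array(json_arrays) AS is_valid_json_array
FROM json_test;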
The tricky part is that the repr of a Python string uses single quotes, so when a list of strings is written to a CSV, the cell uses single quotes, which is not a valid JSON array.
An unsuccessful approach was to transform the array when reading the CSV:
import json
import pandas

def convert_pg_array_to_json_array(a):
    """
    Converts a PG array such as '{foo,bar}' to '["foo", "bar"]'
    """
    return json.dumps(a[1:-1].split(','))

# The arrays to convert are in column 20
df = pandas.read_csv(path, converters={20: convert_pg_array_to_json_array})

# Array gets output as "[""foo"", ""bar""]" which is not a valid JSON array
# Desired output is ["foo", "bar"]
df.to_csv(path)
Combine the approach in the question with these changes when writing the CSV:
Configure an escape character (typically backslash \) and disable double quoting.
df.to_csv(path, escapechar="\\", doublequote=False)
The row in the CSV will look like [\"foo\", \"bar\"], which is valid JSON as long as the loader un-escapes the backslashes. For a Redshift COPY you'll need to add the ESCAPE option to the command.
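Putting the two halves together (the converter from the question on read, the escaping on write), a rough end-to-end sketch; the file paths and column index are placeholders, and index=False keeps pandas from adding an index column that would shift the column mapping for the Redshift COPY:
import json
import pandas

def convert_pg_array_to_json_array(a):
    # '{foo,bar}' -> '["foo", "bar"]'
    return json.dumps(a[1:-1].split(','))

df = pandas.read_csv("input.csv", converters={20: convert_pg_array_to_json_array})
df.to_csv("output.csv", escapechar="\\", doublequote=False, index=False)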

SQL on Spark: How do I get all values of DISTINCT?

So, assume I have the following table:
Name | Color
------------------------------
John | Blue
Greg | Red
John | Yellow
Greg | Red
Greg | Blue
I would like to get a table of the distinct colors for each name - how many and their values. Meaning, something like this:
Name | Distinct | Values
--------------------------------------
John | 2 | Blue, Yellow
Greg | 2 | Red, Blue
Any ideas how to do so?
collect_list will give you a list without removing duplicates; collect_set will automatically remove duplicates.
So just:
select
    Name,
    count(distinct Color) as Distinct, -- not a very good name
    collect_set(Color) as Values
from TblName
group by Name
This feature has been available since Spark 1.6.0; check it out:
https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
/**
 * Aggregate function: returns a set of objects with duplicate elements eliminated.
 *
 * For now this is an alias for the collect_set Hive UDAF.
 *
 * @group agg_funcs
 * @since 1.6.0
 */
def collect_set(columnName: String): Column = collect_set(Column(columnName))
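For completeness, the same aggregation can be expressed with the PySpark DataFrame API; a minimal sketch, assuming the data is already in a DataFrame df with Name and Color columns:
from pyspark.sql import functions as F

df.groupBy("Name") \
    .agg(F.countDistinct("Color").alias("Distinct"),
         F.collect_set("Color").alias("Values")) \
    .show(truncate=False)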
For PySpark: I come from an R/pandas background, so I'm actually finding Spark DataFrames a little easier to work with.
To do this:
Set up a Spark SQL context
Read your file into a dataframe
Register your dataframe as a temp table
Query it directly using SQL syntax
Save results as objects, output to files... do your thing
Here's a class I created to do this:
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

class SQLspark():
    def __init__(self, local_dir='./', hdfs_dir='/users/', master='local', appname='spark_app', spark_mem=2):
        self.local_dir = local_dir
        self.hdfs_dir = hdfs_dir
        self.master = master
        self.appname = appname
        self.spark_mem = int(spark_mem)
        self.conf = (SparkConf()
                     .setMaster(self.master)
                     .setAppName(self.appname)
                     .set("spark.executor.memory", self.spark_mem))
        self.sc = SparkContext(conf=self.conf)
        self.sqlContext = SQLContext(self.sc)

    def file_to_df(self, input_file):
        # import the file as a dataframe; inferSchema attempts to detect column types
        df = (self.sqlContext.read.format("com.databricks.spark.csv")
              .option("header", "true")
              .option("delimiter", "\t")
              .option("inferSchema", "true")
              .load(input_file))
        # cache df object to avoid rebuilding each time
        df.cache()
        # register as temp table for querying, use 'spark_df' as table name
        df.registerTempTable("spark_df")
        return df

    # you can also cast a spark dataframe as a pandas df
    def sparkDf_to_pandasDf(self, input_df):
        pandas_df = input_df.toPandas()
        return pandas_df

    def find_distinct(self, col_name):
        my_query = self.sqlContext.sql("""SELECT distinct {} FROM spark_df""".format(col_name))
        # now do your thing with the results etc
        my_query.show()
        my_query.count()
        my_query.collect()


###############
if __name__ == '__main__':
    # instantiate class
    # see __init__ for variables to input
    spark = SQLspark(os.getcwd(), 'hdfs_loc', "local", "etl_test", 10)
    # specify input file to process
    tsv_infile = 'path/to/file'