How to load a CSV file in Spark the right way? - apache-spark-sql
I am loading a CSV file in Spark, but it contains null values, and because of that an ArrayIndexOutOfBoundsException occurs. How should I load it?
val schema = StructType (Array(StructField("ROW_ID",StringType,true) ,StructField("CREATED",TimestampType,true) ,StructField("CREATED_BY", StringType, true) ,StructField("LAST_UPD",TimestampType,true) ,StructField("LAST_UPD_BY", StringType, true) ,StructField("MODIFICATION_NUM", StringType, true) ,StructField("CONFLICT_ID",StringType, true) ,StructField("ACTIVE_FLG", StringType, true) ,StructField("ALW_PART_SHIP_FLG", StringType, true) ,StructField("APPROVED_FLG", StringType, true) ,StructField("AUTO_RECV_FLG", StringType, true) ,StructField("BILLABLE_FLG", StringType, true) ,StructField("BU_ID", StringType, true) ,StructField("CRDT_CHK_PASSD_FLG", StringType, true) ,StructField("DISPLAY_LINE_FLG", StringType, true) ,StructField("DOCNUM_GNRTD_FLG", StringType, true) ,StructField("EXMPT_APP_FLG", StringType, true) ,StructField("EXP_TO_ICTXN_FLG", StringType, true) ,StructField("FREEZE_FLG", StringType, true) ,StructField("FULFIL_LOCKED_FLG", StringType, true) ,StructField("HOLD_FLG", StringType, true) ,StructField("MANUAL_FLG", StringType, true) ,StructField("NEED_NETCHANGE_FLG", StringType, true) ,StructField("ORDER_CAT_CD", StringType, true) ,StructField("ORDER_NUM", StringType, true) ,StructField("ORDER_TYPE_ID", StringType, true) ,StructField("PR_REP_DNRM_FLG", StringType, true) ,StructField("PR_REP_MANL_FLG", StringType, true) ,StructField("PR_REP_SYS_FLG", StringType, true) ,StructField("REV_NUM", StringType, true) ,StructField("STATUS_CHG_FLG", StringType, true) ,StructField("TAX_EXEMPT_FLG", StringType, true) ,StructField("TEST_ORDER_FLG", StringType, true) ,StructField("TRACK_REV_FLG", StringType, true) ,StructField("ASGN_TS", StringType, true) ,StructField("CHRG_CREATED_FLG", StringType, true) ,StructField("CMPNS_STATUS_DT", StringType, true) ,StructField("CRCHK_ANN_INCOME", StringType, true) ,StructField("CRCHK_CRDT_SCORE", StringType, true) ,StructField("CRCHK_DATE", StringType, true) ,StructField("CRCHK_PSTPAID_SVC", StringType, true) ,StructField("CRDT_ASGN_TS", StringType, true) ,StructField("CRDT_CRD_EXP_DT", StringType, true) ,StructField("CRDT_CRD_TXN_AMT", StringType, true) ,StructField("CRDT_CRD_TXN_DT", StringType, true) ,StructField("DB_LAST_UPD", TimestampType, true) ,StructField("DISCNT_AMT",StringType, true) ,StructField("DISCNT_PERCENT", StringType, true) ,StructField("DISCNT_RC_AMT", StringType, true) ,StructField("DISCNT_RC_PCT", StringType, true) ,StructField("EAI_SYNC_DT", StringType, true) ,StructField("FRGHT_AMT", StringType, true) ,StructField("FRGHT_AMT_DT", StringType, true) ,StructField("INIT_APRV_AMT", StringType, true) ,StructField("LAST_CANCEL_DT", StringType, true) ,StructField("LOAD_NUM",StringType, true ) ,StructField("ORDER_DT", TimestampType, true) ,StructField("ORDER_EXCH_DT", TimestampType, true) ,StructField("PRICING_DT", TimestampType, true) ,StructField("PURCH_ORD_EXP_TS", StringType, true) ,StructField("REQ_SHIP_DT", TimestampType, true) ,StructField("REVISION_DT", StringType, true) ,StructField("STATUS_DT",TimestampType, true) ,StructField("TAX_AMT", StringType, true) ,StructField("TAX_AMT_EXCH_DT", StringType, true) ,StructField("TAX_PERCENT", StringType, true) ,StructField("TOTAL_AMT", StringType, true) ,StructField("TOT_EXTND_PRICE", StringType, true) ,StructField("TOT_EXTND_TAX", StringType, true) ,StructField("TOT_QTY_BONUS", StringType, true) ,StructField("TOT_QTY_SHIP", StringType, true) ,StructField("ACCNT_ADDR_ID", StringType, true) ,StructField("ACCNT_ID", StringType, true) ,StructField("ACCNT_ORDER_NUM", StringType, true) 
,StructField("AGREE_BNFT_ID", StringType, true) ,StructField("AGREE_ID", StringType, true) ,StructField("APPR_BY_EMP_ID", StringType, true) ,StructField("APPR_BY_POSTN_ID", StringType, true) ,StructField("BILL_ACCNT_ID", StringType, true) ,StructField("BILL_PROFILE_ID", StringType, true) ,StructField("BLOCK_BL_CD", StringType, true) ,StructField("BLOCK_DLVRY_CD", StringType, true) ,StructField("BL_ADDR_ID", StringType, true) ,StructField("BL_CON_ID", StringType, true) ,StructField("BL_OU_ID", StringType, true) ,StructField("BL_PER_ADDR_ID", StringType, true) ,StructField("CAMP_CON_ID", StringType, true) ,StructField("CARRIER_CD", StringType, true) ,StructField("CARRIER_PRIO_CD", StringType, true) ,StructField("CCNUM_ENCRPKEY_REF", StringType, true) ,StructField("CCVNUM_ENCRPKY_REF", StringType, true) ,StructField("CCV_NUMBER", StringType, true) ,StructField("CC_NUMBER", StringType, true) ,StructField("CC_TXNPROC_AC_NUM",StringType, true) ,StructField("CC_TXNPROC_VNDR_ID",StringType, true) ,StructField("CC_TXN_RTRN_MSG_CD",StringType, true) ,StructField("CMPND_PROD_NUM",StringType, true) ,StructField("CMPNS_STATUS_CD",StringType, true) ,StructField("COMMIT_TYPE_CD", StringType, true) ,StructField("COMPOUND_PROD_NUM", StringType, true) ,StructField("CONTACT_ID", StringType, true) ,StructField("CO_BUS_AREA_ID", StringType, true) ,StructField("CRCHK_CRDT_AGENCY", StringType, true) ,StructField("CRCHK_DECISION_CD", StringType, true) ,StructField("CRCHK_IDENTIFIER", StringType, true) ,StructField("CRCHK_ID_TYPE_CD", StringType, true) ,StructField("CRCHK_UPD_BY", StringType, true) ,StructField("CRDHOLDER_NAME", StringType, true) ,StructField("CRDTCD_TXN_STAT_CD", StringType, true) ,StructField("CRDT_CRD_APPR_CD", StringType, true) ,StructField("CRDT_CRD_EXP_MO_CD", StringType, true) ,StructField("CRDT_CRD_EXP_YR_CD", StringType, true) ,StructField("CRDT_CRD_NAME", StringType, true) ,StructField("CRDT_STATUS_CD", StringType, true) ,StructField("CURCY_CD", StringType, true) ,StructField("CUSTOMER_ID", StringType, true) ,StructField("DB_LAST_UPD_SRC", StringType, true) ,StructField("DCP_ID", StringType, true) ,StructField("DEST_INVLOC_ID", StringType, true) ,StructField("DLVRY_PERIOD_ID", StringType, true) ,StructField("DLVRY_STATUS_CD", StringType, true) ,StructField("EAI_EXPRT_STAT_CD", StringType, true) ,StructField("EAI_ORDER_NUM", StringType, true) ,StructField("ENTLMNT_ID", StringType, true) ,StructField("EVT_SRC_ID", StringType, true) ,StructField("FRGHT_AMT_CURCY_CD", StringType, true) ,StructField("FRGHT_TERMS_CD", StringType, true) ,StructField("FRGHT_TERMS_INFO", StringType, true) ,StructField("INTEGRATION_ID", StringType, true) ,StructField("LOY_MEMBER_ID", StringType, true) ,StructField("LOY_PROMO_ID", StringType, true) ,StructField("OPTY_ID", StringType, true) ,StructField("PAR_ORDER_ID", StringType, true) ,StructField("PAYMENT_TERM_ID", StringType, true) ,StructField("PAYMENT_TYPE_CD", StringType, true) ,StructField("PAYTO_ADDR_ID", StringType, true) ,StructField("PAYTO_CON_ID", StringType, true) ,StructField("PAYTO_OU_ID", StringType, true) ,StructField("PAY_AUTH_NUM", StringType, true) ,StructField("PAY_OU_ID", StringType, true) ,StructField("PEC_WF_PROC_NAME", StringType, true) ,StructField("PRIO_CD", StringType, true) ,StructField("PRI_LST_ID", StringType, true) ,StructField("PRI_WF_PROC_NAME", StringType, true) ,StructField("PROJ_ID", StringType, true) ,StructField("PROMO_DCP_ID", StringType, true) ,StructField("PROMO_ID", StringType, true) ,StructField("PRSP_CONTACT_ID", 
StringType, true) ,StructField("PR_PAYMENT_ID", StringType, true) ,StructField("PR_POSTN_ID", StringType, true) ,StructField("PR_SHIPMENT_ID", StringType, true) ,StructField("QUOTE_ID", StringType, true) ,StructField("RTRN_ADDR_ID", StringType, true) ,StructField("RTRN_CON_ID", StringType, true) ,StructField("RTRN_OU_ID", StringType, true) ,StructField("RTRN_REASON_CD", StringType, true) ,StructField("SERV_ACCNT_ID", StringType, true) ,StructField("SHIP_ADDR_ID", StringType, true) ,StructField("SHIP_CON_ID", StringType, true) ,StructField("SHIP_METH_CD", StringType, true) ,StructField("SHIP_OU_ID", StringType, true) ,StructField("SHIP_PER_ADDR_ID", StringType, true) ,StructField("SLS_HIER_VER_ID", StringType, true) ,StructField("SRC_INVLOC_ID", StringType, true) ,StructField("SRV_PROV_OU_ID", StringType, true) ,StructField("SR_ID", StringType, true) ,StructField("STATUS_CD", StringType, true) ,StructField("TAX_AMT_CURCY_CD", StringType, true) ,StructField("TAX_EXEMPT_NUM", StringType, true) ,StructField("TAX_EXEMPT_REASON", StringType, true) ,StructField("TAX_LIST_ID", StringType, true) ,StructField("VALIDATION_RULE_ID", StringType, true) ,StructField("X_IPTV_FLG", StringType, true) ,StructField("X_TRIAL_FLG", StringType, true) ,StructField("X_ACC_CAT", StringType, true) ,StructField("X_HSBA_BTU_FLG", StringType, true) ,StructField("X_NT_ORDR", StringType, true) ,StructField("X_COMMISSION_TO_ID", StringType, true) ,StructField("X_COMMISSION_TO_NAME", StringType, true) ,StructField("X_AE_ID", StringType, true) ,StructField("X_CUST_ABBR", StringType, true) ,StructField("X_URGENT_FLG", StringType, true) ,StructField("DOC_NUM", StringType, true) ,StructField("CRDT_COMMENTS", StringType, true) ,StructField("COMMENTS", StringType, true) ,StructField("DESC_TEXT", StringType, true) ,StructField("DISCNT_REASON", StringType, true) ,StructField("SHIP_INSTRUCTIONS", StringType, true) ,StructField("HOLD_REASON", StringType, true) ,StructField("EAI_ERROR_TEXT", StringType, true)))
val rowRDD = order.map(_.split("#CO1D#", -1)).map(p => Row(p(0).trim, p(1).trim,p(2).trim,p(3).trim,p(4).trim,p(5).trim,p(6).trim,p(7).trim,p(8).trim,p(9).trim,p(10).trim,p(11).trim,p(12).trim,p(13).trim,p(14).trim,p(15).trim,p(16).trim,p(17).trim,p(18).trim,p(19).trim,p(20).trim,p(21).trim,p(22).trim,p(23).trim,p(24).trim,p(25).trim,p(26).trim,p(27).trim,p(28).trim,p(29).trim,p(30).trim,p(31).trim,p(32).trim,p(33).trim,p(34).trim,p(35).trim,p(36).trim,p(37).trim,p(38).trim,p(39).trim,p(40).trim,p(41).trim,p(42).trim,p(43).trim,p(44).trim,p(45).trim,p(46).trim,p(47).trim,p(48).trim,p(49).trim,p(50).trim,p(51).trim,p(52).trim,p(53).trim,p(54).trim,p(55).trim,p(56).trim,p(57).trim,p(58),p(59).trim,p(60).trim,p(61).trim,p(62).trim,p(63).trim,p(64).trim,p(65).trim,p(66).trim,p(67).trim,p(68).trim,p(69).trim,p(70).trim,p(71).trim,p(72).trim,p(73).trim,p(74).trim,p(75).trim,p(76).trim,p(77).trim,p(78).trim,p(79).trim,p(80).trim,p(81).trim,p(82).trim,p(83).trim,p(84).trim,p(85).trim,p(86).trim,p(87).trim,p(88).trim,p(89).trim,p(90).trim,p(91).trim,p(92).trim,p(93).trim,p(94).trim,p(95).trim,p(96).trim,p(97).trim,p(98).trim,p(99).trim,p(100).trim,p(101).trim,p(102).trim,p(103).trim,p(104).trim,p(105).trim,p(106).trim,p(107).trim,p(108).trim,p(109).trim,p(110).trim,p(111).trim,p(112).trim,p(113).trim,p(114).trim,p(115).trim,p(116).trim,p(117).trim,p(118).trim,p(119).trim,p(120).trim,p(121).trim,p(122).trim,p(123).trim,p(124).trim,p(125).trim,p(126).trim,p(127).trim,p(128).trim,p(129).trim,p(130).trim,p(131).trim,p(132).trim,p(133).trim,p(134).trim,p(135).trim,p(136).trim,p(137).trim,p(138).trim,p(139).trim,p(140).trim,p(141).trim,p(142).trim,p(143).trim,p(144).trim,p(145).trim,p(146).trim,p(147).trim,p(148).trim,p(149).trim,p(150).trim,p(151).trim,p(152).trim,p(153).trim,p(154).trim,p(155).trim,p(156).trim,p(157).trim,p(158).trim,p(159).trim,p(160).trim,p(161).trim,p(162).trim,p(163).trim,p(164).trim,p(165).trim,p(166).trim,p(167).trim,p(168).trim,p(169).trim,p(170).trim,p(171).trim,p(172).trim,p(173).trim,p(174).trim,p(175).trim,p(176).trim,p(177).trim,p(178).trim,p(179).trim,p(180).trim,p(181).trim,p(182).trim,p(183).trim,p(184).trim,p(185).trim,p(186).trim,p(187).trim,p(188).trim,p(189).trim))
val s_order = sqlContext.createDataFrame(rowRDD,schema)
s_order.registerTempTable("s_order")
Error: java.lang.ArrayIndexOutOfBoundsException
I think you can really simplify your "rowRDD" creation. The index out of bounds exception is probably because you have empty values at the end of your lines, so you can just pad the array to a certain size after splitting.
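To see why fields can go missing at the end: Scala strings use Java's String.split, which discards trailing empty strings unless you pass a negative limit. A quick illustration (the values here are made up):

// A line whose last two fields are empty:
val line = "a#CO1D#b#CO1D##CO1D#"
line.split("#CO1D#").length                  // 2 -- trailing empty fields dropped
line.split("#CO1D#", -1).length              // 4 -- a negative limit keeps them
line.split("#CO1D#").padTo(4, null).length   // 4 -- padTo restores a fixed width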
Apart from that, since it seems like you are taking all of the fields after splitting, you can create your rowRDD like this (change the 190 to the correct length):
val rowRDD = order
  .map { line =>
    line.split("#CO1D#").map(_.trim).padTo(190, null)
  }
  .map(Row(_: _*))
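As a side note, not part of the original answer: with a single-character delimiter you could let a CSV reader apply the schema and handle empty values for you instead of splitting by hand. A minimal, hypothetical sketch using the spark-csv package (the option names below are from that package; a multi-character separator like "#CO1D#" is only accepted by the built-in CSV reader from Spark 3.0 on):

// Hypothetical: assumes com.databricks:spark-csv is on the classpath
// and the file uses a single-character delimiter such as '|'.
val df = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("delimiter", "|")   // placeholder delimiter
  .option("nullValue", "")    // read empty fields as null
  .schema(schema)
  .load("path/to/orders.csv") // placeholder path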
Hope this helps.
EDIT
// Example index sets for the columns to convert; an index that appears in
// both sets is treated as Int, since that branch is checked first.
val intColumns = Set(1, 10, 15)
val tsColumns = Set(5, 15)

val rowRDD = order
  .map { line =>
    line.split("#CO1D#").map(_.trim).padTo(190, null)
  }
  .map { fields =>
    val fieldsWithTypes = fields.zipWithIndex.map { case (s, i) =>
      if (intColumns.contains(i)) {
        s.toInt
      } else if (tsColumns.contains(i)) {
        s.toLong
      } else {
        s
      }
    }
    Row(fieldsWithTypes: _*)
  }
The zipWithIndex method is not the most efficient way to iterate over the array with its index, but it looks more convenient to me because of readability.
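If that ever matters, here is a sketch of one alternative (not from the original answer): index the array directly with Array.tabulate, which avoids building the intermediate (value, index) pairs. Like the original, it assumes the typed columns are never null or empty:

val fieldsWithTypes: Array[Any] = Array.tabulate(fields.length) { i =>
  if (intColumns.contains(i)) fields(i).toInt      // integer columns
  else if (tsColumns.contains(i)) fields(i).toLong // timestamp columns as epoch longs
  else fields(i)                                   // everything else stays a String
}
Row(fieldsWithTypes: _*)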
Related
I want to take user input in a Spark dataframe but it gives the error ['list' object is not callable]
I cannot take any input. I have a schema:

schm = StructType([
    StructField("ID", IntegerType(), True),
    StructField("fixed_credit_limit", IntegerType(), True),
    StructField("credit_limit", IntegerType(), True),
    StructField("due_amount", IntegerType(), True),
    StructField("created_date", StringType(), True),
    StructField("updated_date", StringType(), True),
    StructField("agent_name_id", IntegerType(), True),
    StructField("rfid_id", StringType(), True),
])

input = [(13158, 100, 100, 0, '05/29/2021 11:01:31', '05/29/2021 11:01:31', 5, '862b4497-577f-47f9-8725-dd6c397ce408')]
df1 = spark.createDataFrame(input, schema)

I want to take the user input of agent_name_id, but it gives the error ['list' object is not callable]. How can I take the user input of agent_name_id?
Spark: How to define a nested schema?
I am new to Apache Spark, so forgive me if this is a noob question. I am trying to define a particular schema before reading in the dataset in order to speed up processing. There are a few data types that I am not sure how to define (ArrayType and StructType). Here is a screenshot of the schema I am working with (not reproduced here). Here is what I have so far:

jsonSchema = StructType([
    StructField("attribution", ArrayType(), True),
    StructField("averagingPeriod", StructType(), True),
    StructField("city", StringType(), True),
    StructField("coordinates", StructType(), True),
    StructField("country", StringType(), True),
    StructField("date", StructType(), True),
    StructField("location", StringType(), True),
    StructField("mobile", BooleanType(), True),
    StructField("parameter", StringType(), True),
    StructField("sourceName", StringType(), True),
    StructField("sourceType", StringType(), True),
    StructField("unit", StringType(), True),
    StructField("value", DoubleType(), True)
])

My question is: how do I account for the name and url under the attribution column, the unit and value under the averagingPeriod column, etc.? For reference, here is the dataset I am using: https://registry.opendata.aws/openaq/.
Here's an example of array type and struct type. I think it should be straightforward to do this for all other columns.

from pyspark.sql.types import *

jsonSchema = StructType([
    StructField("attribution", ArrayType(StructType([
        StructField("name", StringType()),
        StructField("url", StringType())
    ])), True),
    StructField("averagingPeriod", StructType([
        StructField("unit", StringType()),
        StructField("value", DoubleType())
    ]), True),
    # ... etc.
])
PySpark: create a dataframe from pandas with a column of lists of tuples
I am trying to create a PySpark dataframe from a pandas dataframe.

import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType

a_dict = {0: [(0, 9.821), (1, 82.185)]}
a_pd = pd.DataFrame.from_dict(a_dict.items())
a_pd.columns = ["row_num", "val"]

a_str = StructType([StructField("id", IntegerType(), True), StructField("prob", DoubleType(), True)])
my_schema = StructType([StructField("row_num", LongType(), True), StructField("val", list(a_str), True)])  # error
a_df = spark.createDataFrame(a_pd, schema=my_schema)

error: AssertionError: dataType [StructField(id,IntegerType,true), StructField(prob,DoubleType,true)] should be an instance of <class 'pyspark.sql.types.DataType'>

How do I define a valid schema for a list of tuples of (int, double) so that it can be understood by PySpark? Thanks.
For a list of values, you have to use ArrayType. Below is your code reproduced with a few more example rows.

import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType, ArrayType

a_dict = {0: [(0, 9.821), (1, 82.185)],
          1: [(0, 9.821), (1, 8.10), (3, 2.385)],
          2: [(0, 9.821), (1, 1.4485), (4, 5.15), (5, 6.104)]}
a_pd = pd.DataFrame.from_dict(a_dict.items())
a_pd.columns = ["row_num", "val"]
print(a_pd.head())

a_str = StructType([StructField("id", IntegerType(), True), StructField("prob", DoubleType(), True)])
my_schema = StructType([StructField("row_num", LongType(), True), StructField("val", ArrayType(a_str), True)])

a_df = sqlContext.createDataFrame(a_pd, schema=my_schema)
a_df.show(truncate=False)
a_df.printSchema()

Output:

+-------+------------------------------------------------+
|row_num|val                                             |
+-------+------------------------------------------------+
|0      |[[0, 9.821], [1, 82.185]]                       |
|1      |[[0, 9.821], [1, 8.1], [3, 2.385]]              |
|2      |[[0, 9.821], [1, 1.4485], [4, 5.15], [5, 6.104]]|
+-------+------------------------------------------------+

root
 |-- row_num: long (nullable = true)
 |-- val: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- prob: double (nullable = true)
How to change a PySpark dataframe column data type?
I'm looking for a method to change a PySpark dataframe column type from the schema shown by df.printSchema() to a different one. (The before and after schemas were given as screenshots, not reproduced here.) Thank you for your help in advance.
You have to replace the column with the new schema. ArrayType takes two parameters, elementType and containsNull.

from pyspark.sql.types import *
from pyspark.sql.functions import udf

x = [("a", ["b", "c", "d", "e"]), ("g", ["h", "h", "d", "e"])]
schema = StructType([StructField("key", StringType(), nullable=True),
                     StructField("values", ArrayType(StringType(), containsNull=False))])
df = spark.createDataFrame(x, schema=schema)
df.printSchema()

new_schema = ArrayType(StringType(), containsNull=True)
udf_foo = udf(lambda x: x, new_schema)
df.withColumn("values", udf_foo("values")).printSchema()

root
 |-- key: string (nullable = true)
 |-- values: array (nullable = true)
 |    |-- element: string (containsNull = false)

root
 |-- key: string (nullable = true)
 |-- values: array (nullable = true)
 |    |-- element: string (containsNull = true)
Here is a useful example where you can change the schema for every column, assuming you want the same type:

from pyspark.sql.types import Row
from pyspark.sql.functions import *

df = sc.parallelize([
    Row(isbn=1, count=1, average=10.6666666),
    Row(isbn=2, count=1, average=11.1111111)
]).toDF()
df.printSchema()

df = df.select(*[col(x).cast('float') for x in df.columns])
df.printSchema()

Outputs:

root
 |-- average: double (nullable = true)
 |-- count: long (nullable = true)
 |-- isbn: long (nullable = true)

root
 |-- average: float (nullable = true)
 |-- count: float (nullable = true)
 |-- isbn: float (nullable = true)
Spark SQL Schema
I have this RDD in PySpark and I want to make the schema. Example of one row of the RDD, collected:

(('16/12/2006', '17:24:00', 4.216, 0.418, 234.84, 18.4, 0.0, 1.0, 17.0), 0)

customSchema = StructType([
    StructField("Date", StringType(), True),
    StructField("Hour", StringType(), True),
    StructField("ActivePower", FloatType(), True),
    StructField("ReactivePower", FloatType(), True),
    StructField("Voltage", FloatType(), True),
    StructField("Instensity", FloatType(), True),
    StructField("Sub1", FloatType(), True),
    StructField("Sub2", FloatType(), True),
    StructField("Sub3", FloatType(), True),
    StructField("ID", IntegerType(), True)])

The problem is that the index (the last zero) is outside the tuple of data, and I don't know how to make the schema correctly. Thank you in advance.
You're almost there. You just need another StructField:

data = [
    (('16/12/2006', '17:24:00', 4.216, 0.418, 234.84, 18.4, 0.0, 1.0, 17.0), 0)
]

schema = StructType([
    StructField("values", StructType([
        StructField("Date", StringType(), True),
        StructField("Hour", StringType(), True),
        StructField("ActivePower", FloatType(), True),
        StructField("ReactivePower", FloatType(), True),
        StructField("Voltage", FloatType(), True),
        StructField("Instensity", FloatType(), True),
        StructField("Sub1", FloatType(), True),
        StructField("Sub2", FloatType(), True),
        StructField("Sub3", FloatType(), True),
    ])),
    StructField("ID", IntegerType(), True)
])

df = spark.createDataFrame(data, schema)
df.printSchema()

root
 |-- values: struct (nullable = true)
 |    |-- Date: string (nullable = true)
 |    |-- Hour: string (nullable = true)
 |    |-- ActivePower: float (nullable = true)
 |    |-- ReactivePower: float (nullable = true)
 |    |-- Voltage: float (nullable = true)
 |    |-- Instensity: float (nullable = true)
 |    |-- Sub1: float (nullable = true)
 |    |-- Sub2: float (nullable = true)
 |    |-- Sub3: float (nullable = true)
 |-- ID: integer (nullable = true)

df.show(1, False)

+----------------------------------------------------------+---+
|values                                                    |ID |
+----------------------------------------------------------+---+
|[16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0]|0  |
+----------------------------------------------------------+---+