How to put hive table on complex nested XML file

How to put hive table on complex nested XML file - hive

I am trying to create a hive table on top of a complex XML file using hivexmlserde-1.0.5.3.jar. I can get it to work for XML in the examples on the internet, but I can't get it to work for my XML file which seems to be more complex than the examples I have found online.
Here is my XML file:
<?xml version="1.0"?>
<SSNExportDocument xmlns="urn:com:ssn:schema:export:SSNExportFormat.xsd" Version="0.1" DocumentID="eef9c8c5-0fc5-485b-bf05-7324917a7f5e-2" ExportID="eef9c8c5-0fc5-485b-bf05-7324917a7f5e" JobID="164771" RunID="3456662" CreationTime="2019-07-29T13:15:09.584-05:00" StartTime="2019-07-29T09:15:00.000-05:00" EndTime="2019-07-29T13:15:00.000-05:00">
<MeterData MeterName="50000010" UtilDeviceID="50000010" MacID="a0:06:5f:00:00:00:00:0a">
<IntervalReadData IntervalLength="15" StartTime="2019-07-29T08:00:00.000-05:00" EndTime="2019-07-29T12:00:00.000-05:00" NumberIntervals="16">
<Interval EndTime="2019-07-29T08:15:00.000-05:00" GatewayCollectedTime="2019-07-29T12:06:46.302-05:00" BlockSequenceNumber="181" IntervalSequenceNumber="29">
<Reading Channel="20" RawValue="5625.0" Value="3.3750" UOM="kWh(del)" BlockEndValue="0"/>
<Reading Channel="30" RawValue="5625.0" Value="3375.0000" UOM="kWh(rec)" BlockEndValue="0"/>
<Reading Channel="101" RawValue="5625.0" Value="399.9999" UOM="V Ph(A-N)" BlockEndValue="0"/>
<Reading Channel="102" RawValue="5625.0" Value="399.9999" UOM="V Ph(B-N)" BlockEndValue="0"/>
<Reading Channel="103" RawValue="5625.0" Value="399.9999" UOM="I A max" BlockEndValue="0"/>
<Reading Channel="104" RawValue="5625.0" Value="399.9999" UOM="I B max" BlockEndValue="0"/>
<Reading Channel="50" RawValue="5625.0" Value="3.3750" UOM="kVARh(del)" BlockEndValue="0"/>
</Interval>
<Interval EndTime="2019-07-29T08:30:00.000-05:00" GatewayCollectedTime="2019-07-29T12:06:46.302-05:00" BlockSequenceNumber="181" IntervalSequenceNumber="30">
<Reading Channel="20" RawValue="5625.0" Value="3.3750" UOM="kWh(del)" BlockEndValue="0"/>
<Reading Channel="30" RawValue="5625.0" Value="3375.0000" UOM="kWh(rec)" BlockEndValue="0"/>
<Reading Channel="101" RawValue="5625.0" Value="399.9999" UOM="V Ph(A-N)" BlockEndValue="0"/>
<Reading Channel="102" RawValue="5625.0" Value="399.9999" UOM="V Ph(B-N)" BlockEndValue="0"/>
<Reading Channel="103" RawValue="5625.0" Value="399.9999" UOM="I A max" BlockEndValue="0"/>
<Reading Channel="104" RawValue="5625.0" Value="399.9999" UOM="I B max" BlockEndValue="0"/>
<Reading Channel="50" RawValue="5625.0" Value="3.3750" UOM="kVARh(del)" BlockEndValue="0"/>
</Interval>
</IntervalReadData>
</MeterData>
<MeterData MeterName="50000022" UtilDeviceID="50000022" MacID="a0:06:5f:00:00:00:00:16">
<IntervalReadData IntervalLength="15" StartTime="2019-07-29T08:00:00.000-05:00" EndTime="2019-07-29T12:00:00.000-05:00" NumberIntervals="16">
<Interval EndTime="2019-07-29T08:15:00.000-05:00" GatewayCollectedTime="2019-07-29T12:06:49.324-05:00" BlockSequenceNumber="181" IntervalSequenceNumber="29">
<Reading Channel="20" RawValue="5625.0" Value="3.3750" UOM="kWh(del)" BlockEndValue="0"/>
<Reading Channel="30" RawValue="5625.0" Value="3375.0000" UOM="kWh(rec)" BlockEndValue="0"/>
<Reading Channel="101" RawValue="5625.0" Value="399.9999" UOM="V Ph(A-N)" BlockEndValue="0"/>
<Reading Channel="102" RawValue="5625.0" Value="399.9999" UOM="V Ph(B-N)" BlockEndValue="0"/>
<Reading Channel="103" RawValue="5625.0" Value="399.9999" UOM="I A max" BlockEndValue="0"/>
<Reading Channel="104" RawValue="5625.0" Value="399.9999" UOM="I B max" BlockEndValue="0"/>
<Reading Channel="50" RawValue="5625.0" Value="3.3750" UOM="kVARh(del)" BlockEndValue="0"/>
</Interval>
<Interval EndTime="2019-07-29T08:30:00.000-05:00" GatewayCollectedTime="2019-07-29T12:06:49.324-05:00" BlockSequenceNumber="181" IntervalSequenceNumber="30">
<Reading Channel="20" RawValue="5625.0" Value="3.3750" UOM="kWh(del)" BlockEndValue="0"/>
<Reading Channel="30" RawValue="5625.0" Value="3375.0000" UOM="kWh(rec)" BlockEndValue="0"/>
<Reading Channel="101" RawValue="5625.0" Value="399.9999" UOM="V Ph(A-N)" BlockEndValue="0"/>
<Reading Channel="102" RawValue="5625.0" Value="399.9999" UOM="V Ph(B-N)" BlockEndValue="0"/>
<Reading Channel="103" RawValue="5625.0" Value="399.9999" UOM="I A max" BlockEndValue="0"/>
<Reading Channel="104" RawValue="5625.0" Value="399.9999" UOM="I B max" BlockEndValue="0"/>
<Reading Channel="50" RawValue="5625.0" Value="3.3750" UOM="kVARh(del)" BlockEndValue="0"/>
</Interval>
</IntervalReadData>
</MeterData>
</SSNExportDocument>
I first found example code on http://dennysjymbo.blogspot.com/2018/05/using-xml-serde-in-hive-for-exploding.html which works fine, so I know that the serde is being found in the class path.
I've tried countless ways to define the table.
This is my latest attempt:
-- Drop any previous definition so the CREATE below always starts clean.
DROP TABLE IF EXISTS default.xmltest;

-- External table over the XML files stored under /user/cyelve1/xmltest.
-- The single column MeterData is filled from the XPath
-- /SSNExportDocument/MeterData; each input record is the text between the
-- "xmlinput.start" and "xmlinput.end" markers.
CREATE EXTERNAL TABLE default.xmltest (
    MeterData ARRAY<
        STRUCT<MeterData:ARRAY<
            STRUCT<
                MeterName:STRING,
                UtilDeviceID:STRING,
                MacID:STRING,
                IntervalReadData:STRUCT<
                    IntervalLength:STRING,
                    StartTime:STRING,
                    EndTime:STRING,
                    NumberIntervals:STRING,
                    Interval:ARRAY<
                        STRUCT<
                            EndTime:STRING,
                            GatewayCollectedTime:STRING,
                            BlockSequenceNumber:STRING,
                            IntervalSequenceNumber:STRING,
                            Reading:ARRAY<
                                STRUCT<
                                    Channel:STRING,
                                    RawValue:STRING,
                                    Value:STRING,
                                    UOM:STRING,
                                    BlockEndValue:STRING
                                >
                            >
                        >
                    >
                >
            >
        >>
    >
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    "column.xpath.MeterData" = "/SSNExportDocument/MeterData"
)
STORED AS
    INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION "/user/cyelve1/xmltest"
TBLPROPERTIES (
    "xmlinput.start" = "<SSNExportDocument>",
    "xmlinput.end" = "</SSNExportDocument>"
);
NOTE: If I try to use the start tag that has attributes like I have seen in other examples like so:
tblproperties ( "xmlinput.start" = "<SSNExportDocument " ,"xmlinput.end" = "</SSNExportDocument>" )
I get a NULL returned. So, just for testing I changed the start tag
FROM
<SSNExportDocument xmlns="urn:com:ssn:schema:export:SSNExportFormat.xsd" Version="0.1" DocumentID="eef9c8c5-0fc5-485b-bf05-7324917a7f5e-2" ExportID="eef9c8c5-0fc5-485b-bf05-7324917a7f5e" JobID="164771" RunID="3456662" CreationTime="2019-07-29T13:15:09.584-05:00" StartTime="2019-07-29T09:15:00.000-05:00" EndTime="2019-07-29T13:15:00.000-05:00">
TO
<SSNExportDocument>
I am expecting a result like the one that worked in the example I used from the http://dennysjymbo.blogspot.com/2018/05/using-xml-serde-in-hive-for-exploding.html site:
[{"customerleveldata":{"survey_id":144434840,"client_id":6780,"service":"HH","recdate":"2018-04-02","disdate":"2018-01-01","analysis":[{"response":{"varname":"B2PR","value":"5"}},{"response":{"varname":"PI2PR","value":"5"}}],"demographics":[{"response":{"varname":"AGE","value":"90"}},{"response":{"varname":"CMSH_1","value":"Yes"}}],"hcahps":[{"response":{"varname":"CMSH_10","value":"Yes"}},{"response":{"varname":"CMSH_12","value":"Yes"}}]}}]
but this is my result:
[{"meterdata":[{"metername":null,"utildeviceid":null,"macid":null,"intervalreaddata":{"intervallength":null,"starttime":null,"endtime":"<string>2019-07-29T08:15:00.000-05:002019-07-29T08:30:00.000-05:00</string>","numberintervals":null,"interval":[{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]}]}}]},{"meterdata":[{"metername":null,"utildeviceid":null,"macid":null,"intervalreaddata":{"intervallength":null,"starttime":null,"endtime":"<s
tring>2019-07-29T08:15:00.000-05:002019-07-29T08:30:00.000-05:00</string>","numberintervals":null,"interval":[{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]}]}}]}]

Related

XML parsing using SQL

How can I parse the following XML to get output like the one below? name = Business, Value = XYZ; name = Product, Value = STANDARD; name = Trend, Value = Active,Active New,Sparse etc. The Business, Product and Trend data can be dynamic: some XML documents contain them and some do not, and a value should be shown only if it exists.
Can you please help me with this.
<Filter>
<Expression>
<Expression Name="Business">
<Path>
<RolePathItem>
<RoleID>Gcea34afc</RoleID>
</RolePathItem>
</Path>
<AttributeRef>
<AttributeID>3d0534a20d19</AttributeID>
</AttributeRef>
</Expression>
<Expression>
<Literal>
<DataType>String</DataType>
<Value>XYZ</Value>
</Literal>
</Expression>
</Expression>
<Expression>
<Function>
<FunctionName>Equals</FunctionName>
<Arguments>
<Expression Name="Product">
<Path>
<RolePathItem>
<RoleID>6a99c8cd92fc</RoleID>
</RolePathItem>
<RolePathItem>
<RoleID>011e01b51ba0</RoleID>
</RolePathItem>
</Path>
</Expression>
<Expression>
<Literal>
<DataType>String</DataType>
<Value>STANDARD</Value>
</Literal>
</Expression>
</Arguments>
</Function>
</Expression>
<Expression>
<Function>
<FunctionName>In</FunctionName>
<Arguments>
<Expression Name="Trend">
<Path>
<RolePathItem>
<RoleID>6a99c8cd92fc</RoleID>
</RolePathItem>
<RolePathItem>
<RoleID>dad362a5a954</RoleID>
</RolePathItem>
</Path>
</Expression>
<Expression>
<Literal>
<DataType>String</DataType>
<Values>
<Value>Active</Value>
<Value>Active New</Value>
<Value>New</Value>
<Value>Sparse</Value>
</Values>
</Literal>
</Expression>
</Arguments>
</Function>
</Expression>
</Filter>

Because of the dynamic content, I had to use some string functions, but it appears to work.
-- Sample filter document: each top-level <Expression> carries one named
-- expression (Business / Product / Trend) and one <Literal> holding its value(s).
DECLARE @xmlData XML = '
<Filter>
<Expression>
<Expression Name="Business">
<Path>
<RolePathItem>
<RoleID>Gcea34afc</RoleID>
</RolePathItem>
</Path>
<AttributeRef>
<AttributeID>3d0534a20d19</AttributeID>
</AttributeRef>
</Expression>
<Expression>
<Literal>
<DataType>String</DataType>
<Value>XYZ</Value>
</Literal>
</Expression>
</Expression>
<Expression>
<Function>
<FunctionName>Equals</FunctionName>
<Arguments>
<Expression Name="Product">
<Path>
<RolePathItem>
<RoleID>6a99c8cd92fc</RoleID>
</RolePathItem>
<RolePathItem>
<RoleID>011e01b51ba0</RoleID>
</RolePathItem>
</Path>
</Expression>
<Expression>
<Literal>
<DataType>String</DataType>
<Value>STANDARD</Value>
</Literal>
</Expression>
</Arguments>
</Function>
</Expression>
<Expression>
<Function>
<FunctionName>In</FunctionName>
<Arguments>
<Expression Name="Trend">
<Path>
<RolePathItem>
<RoleID>6a99c8cd92fc</RoleID>
</RolePathItem>
<RolePathItem>
<RoleID>dad362a5a954</RoleID>
</RolePathItem>
</Path>
</Expression>
<Expression>
<Literal>
<DataType>String</DataType>
<Values>
<Value>Active</Value>
<Value>Active New</Value>
<Value>New</Value>
<Value>Sparse</Value>
</Values>
</Literal>
</Expression>
</Arguments>
</Function>
</Expression>
</Filter>'
;WITH XmlRows AS (
    -- One row per top-level <Expression>, with its child nodes serialized to text.
    SELECT
        CONVERT(VARCHAR(MAX), ref.query('*')) AS XMLString
    FROM @xmlData.nodes('/Filter/Expression') AS x(ref)
)
, NameAndValueXml AS (
    SELECT
        -- Fragment running from the first '<Expression Name=' up to and
        -- including the first '</Expression>' found in the string.
        CONVERT(XML,
            SUBSTRING(XMLString,
                CHARINDEX('<Expression Name=', XMLString),
                ( CHARINDEX('</Expression>', XMLString) - CHARINDEX('<Expression Name=', XMLString) + LEN('</Expression>') ) )
        ) AS NameXml
        -- Fragment running from the first '<Literal>' up to and including
        -- the first '</Literal>'.
        , CONVERT(XML,
            SUBSTRING(XMLString,
                CHARINDEX('<Literal>', XMLString),
                ( CHARINDEX('</Literal>', XMLString) - CHARINDEX('<Literal>', XMLString) + LEN('</Literal>') ) )
        ) AS ValueXml
    FROM XmlRows
)
SELECT
    -- The Name attribute must be addressed with '@' in XPath.
    NameXml.value('(./Expression/@Name)[1]', 'nvarchar(255)') AS Name,
    CASE
        -- Multi-valued literal: strip the <Value> tags and join the values with commas.
        WHEN ValueXml.exist('(/Literal/Values/*)') = 1 THEN
            REPLACE(REPLACE(REPLACE(CONVERT(VARCHAR(max), ValueXml.query('(/Literal/Values/*)')), '</Value><Value>', ','), '</Value>', ''), '<Value>', '')
        -- Single-valued literal.
        ELSE
            ValueXml.value('(./Literal/Value)[1]', 'nvarchar(max)')
    END AS Value
FROM NameAndValueXml
result:
Name Value
-------- ----------------------------
Business XYZ
Product STANDARD
Trend Active,Active New,New,Sparse

How can I merge several XML-documents into one without namespace accumulation?

I have several SVG-segments in a table stored as xml-document.
Now I need to select all elements from that table, and merge them into one XML-document.
This is the T-SQL code I have:
-- Table variable holding one SVG fragment per row.
declare @xml table (xmldocument xml)
insert @xml select '
<svg xmlns="http://www.w3.org/2000/svg" otherattrib="x">
<path id="789" data-objid="0000X2"></path>
</svg>'
insert @xml select '
<svg xmlns="http://www.w3.org/2000/svg" otherattrib="x">
<admin>
<g>
<path></path>
<path data-objid="0000X1"></path>
<path id="123" data-objid="0000X2"></path>
<path id="456" data-objid="0000X3"></path>
</g>
</admin>
<g>
<path></path>
<path data-objid="0000X1"></path>
<path id="789" data-objid="0000X2"></path>
<path id="abc" data-objid="0000X3"></path>
</g>
</svg>'
insert @xml select '
<svg xmlns="http://www.w3.org/2000/svg" otherattrib="x">
<path></path>
</svg>'
insert @xml select '
<svg xmlns="http://www.w3.org/2000/svg" otherattrib="x">
<path id="abc" data-objid="0000X3"></path>
</svg>'
-- Merge attempt: select every descendant element of each <svg> root
-- ('/svg//*') and wrap all of them in a single <svg> root element.
--;WITH XMLNAMESPACES ('http://www.w3.org/2000/svg' AS svg)
;WITH XMLNAMESPACES (default 'http://www.w3.org/2000/svg')
--SELECT
--(
SELECT
--xmldocument
--,c.p.value('.', 'nvarchar(MAX)')
c.p.query('declare default element namespace "http://www.w3.org/2000/svg";.')
FROM @xml AS t
OUTER APPLY t.xmldocument.nodes('/svg//*') AS c(p)
FOR XML PATH(''), root('svg')
-- ) AS merged
But this produces
<svg xmlns="http://www.w3.org/2000/svg">
<path xmlns="http://www.w3.org/2000/svg" id="789" data-objid="0000X2" />
<admin xmlns="http://www.w3.org/2000/svg">
<g>
<path />
<path data-objid="0000X1" />
<path id="123" data-objid="0000X2" />
<path id="456" data-objid="0000X3" />
</g>
</admin>
<g xmlns="http://www.w3.org/2000/svg">
<path />
<path data-objid="0000X1" />
<path id="123" data-objid="0000X2" />
<path id="456" data-objid="0000X3" />
</g>
<path xmlns="http://www.w3.org/2000/svg" />
<path xmlns="http://www.w3.org/2000/svg" data-objid="0000X1" />
<path xmlns="http://www.w3.org/2000/svg" id="123" data-objid="0000X2" />
<path xmlns="http://www.w3.org/2000/svg" id="456" data-objid="0000X3" />
<g xmlns="http://www.w3.org/2000/svg">
<path />
<path data-objid="0000X1" />
<path id="789" data-objid="0000X2" />
<path id="abc" data-objid="0000X3" />
</g>
<path xmlns="http://www.w3.org/2000/svg" />
<path xmlns="http://www.w3.org/2000/svg" data-objid="0000X1" />
<path xmlns="http://www.w3.org/2000/svg" id="789" data-objid="0000X2" />
<path xmlns="http://www.w3.org/2000/svg" id="abc" data-objid="0000X3" />
<path xmlns="http://www.w3.org/2000/svg" />
<path xmlns="http://www.w3.org/2000/svg" id="abc" data-objid="0000X3" />
</svg>
instead of
<svg xmlns="http://www.w3.org/2000/svg">
<path id="789" data-objid="0000X2" />
<admin>
<g>
<path />
<path data-objid="0000X1" />
<path id="123" data-objid="0000X2" />
<path id="456" data-objid="0000X3" />
</g>
</admin>
<g>
<path />
<path data-objid="0000X1" />
<path id="123" data-objid="0000X2" />
<path id="456" data-objid="0000X3" />
</g>
<path />
<path data-objid="0000X1" />
<path id="123" data-objid="0000X2" />
<path id="456" data-objid="0000X3" />
<g>
<path />
<path data-objid="0000X1" />
<path id="789" data-objid="0000X2" />
<path id="abc" data-objid="0000X3" />
</g>
<path />
<path data-objid="0000X1" />
<path id="789" data-objid="0000X2" />
<path id="abc" data-objid="0000X3" />
<path />
<path id="abc" data-objid="0000X3" />
</svg>
What am I missing? What am I doing wrong?
How to correct this without having to cast to varchar and then do a search-and-replace on "xmlns=..." ?

First of all: The repeated namespaces are not wrong in any way, just annoying and bloating the size of your result...
Unfortunately, there is no clean way to get rid of them. Your idea of casting to text and doing this at the string level is not that bad, if you really need it (but be aware that the cast must go to NVARCHAR, and that the re-cast to XML can change your XML structurally: attribute order, CDATA sections, ...). Try this:
Short approach
...if you really do not need anything else, than the namespace in the first root <svg>...
-- Concatenate the children of every <svg> root into one string.
-- Reads from the table variable @xml (column xmldocument) that holds the SVG fragments.
DECLARE @NewXML NVARCHAR(MAX)=
(
SELECT t.xmldocument.query('declare default element namespace "http://www.w3.org/2000/svg";svg/*')
FROM @xml AS t
FOR XML PATH('')
);
-- Strip the repeated namespace declarations at string level, then wrap the
-- result in a single <svg> root that declares the namespace once.
SELECT CAST(N'<svg xmlns="http://www.w3.org/2000/svg">'
+ REPLACE(@NewXML,' xmlns="http://www.w3.org/2000/svg"','')
+ N'</svg>' AS XML);
More flexible approach
You might try this:
-- Collect the children of every <svg> root as typed XML.
-- Reads from the table variable @xml (column xmldocument) that holds the SVG fragments.
DECLARE @NewXML XML=
(
SELECT t.xmldocument.query('declare default element namespace "http://www.w3.org/2000/svg";svg/*')
FROM @xml AS t
FOR XML PATH(''),TYPE
);
-- First pass: remove the repeated namespace declarations at string level.
SET @NewXML =CAST(REPLACE(CAST(@NewXML AS NVARCHAR(MAX)),' xmlns="http://www.w3.org/2000/svg"','') AS XML);
--You need to repeat the CAST and REPLACE, otherwise you'd get a lot of xmlns="" defining an empty default namespace for the inner nodes, which was wrong...
-- Second pass: wrap everything in <svg> with the default namespace and drop
-- the xmlns="" declarations that the wrapping introduces.
WITH XMLNAMESPACES (DEFAULT 'http://www.w3.org/2000/svg')
SELECT @NewXML=CAST(REPLACE(CAST((SELECT @NewXML FOR XML PATH('svg'),TYPE) AS NVARCHAR(MAX)),' xmlns=""','') AS XML);
SELECT @NewXML;
You might use STUFF() to introduce the namespace on string level:
(Don't use WITH XMLNAMESPACES here; the namespace declaration is inserted as plain text instead.)
-- Wrap the content in <svg>, then insert the namespace declaration as text
-- right after the element name (position 5 is just past '<svg').
SELECT @NewXML=CAST(STUFF(CAST((SELECT @NewXML FOR XML PATH('svg'),TYPE) AS NVARCHAR(MAX)),5,0,' xmlns="http://www.w3.org/2000/svg" ') AS XML);
SELECT @NewXML;
The result in all cases
<svg xmlns="http://www.w3.org/2000/svg">
<path id="789" data-objid="0000X2" />
<admin>
<g>
<path />
<path data-objid="0000X1" />
<path id="123" data-objid="0000X2" />
<path id="456" data-objid="0000X3" />
</g>
</admin>
<g>
<path />
<path data-objid="0000X1" />
<path id="789" data-objid="0000X2" />
<path id="abc" data-objid="0000X3" />
</g>
<path />
<path id="abc" data-objid="0000X3" />
</svg>

XSLT 1.0 Sort node sets using alphanumeric value of descendant

Hello and thank you in advance.
I'm trying to transform a long and complicated xml document using xslt 1.0. A portion of the source document is below:
<c id="hou01965c00171" level="series">
<did>
<unittitle>Index </unittitle>
</did>
<c id="hou01965c00172" level="subseries">
<did>
<unittitle>A</unittitle>
</did>
<c id="hou01965c00179">
<did>
<unitid>(MS Sparks 132) </unitid>
<unittitle>
<persname>Abbot, Benjamin. </persname>
<geogname>Exeter, N. H. </geogname>-Teaching certificate for <persname>Jared
Sparks.</persname> Dec. 18, <unitdate calendar="gregorian"
datechar="single" endYear="1810" era="ce" startYear="1810"
>1810.</unitdate>
</unittitle>
</did>
<note>
<p>Misc. Paps. I, (14)</p>
<p>ORIGINAL</p>
</note>
</c>
<c id="hou01965c00173">
<did>
<unitid>(MS Sparks 70) </unitid>
<unittitle>A.B. [Haldimand's secret agent] To E. Apr. <unitdate
calendar="gregorian" datechar="single" endYear="1782" era="ce"
startYear="1782">1782.</unitdate>
</unittitle>
</did>
<note>
<p>Vermont Paps., <date calendar="gregorian" endYear="1782" era="ce"
startYear="1780">1780-1782, </date>89-90.</p>
</note>
</c>
<c id="hou01965c00174">
<did>
<unitid>(MS Sparks 85) </unitid>
<unittitle>
<persname>D'Abancourt </persname>to <persname>Lafayette.</persname>
</unittitle>
</did>
<note>
<p>
<list>
<item>June 26, <date calendar="gregorian" endYear="1792" era="ce"
startYear="1792">1792: </date>178-80. </item>
<item>July 26, <date calendar="gregorian" endYear="1792" era="ce"
startYear="1792">1792: </date>222. </item>
<item>July 30, <date calendar="gregorian" endYear="1792" era="ce"
startYear="1792">1792: </date>222-1.</item>
</list>
</p>
<p>Lafayette Letters, <date calendar="gregorian" endYear="1792" era="ce"
startYear="1777">1777-1792 </date>(2).</p>
</note>
</c>
<c id="hou01965c00175">
<did>
<unitid>(MS Sparks 153) </unitid>
<unittitle>
<persname>Abbey, W. M. </persname>To <persname>Jared Sparks.</persname> Feb.
19, <unitdate calendar="gregorian" datechar="single" endYear="1857" era="ce"
startYear="1857">1857, </unitdate>
<geogname>Philadelphia.</geogname>
</unittitle>
</did>
<note>
<p>Letters to Sparks.</p>
<p>ORIGINAL</p>
</note>
</c>
<c id="hou01965c00176">
<did>
<unitid>(MS Sparks 153) </unitid>
<unittitle>
<persname>Abbot, Anne W. </persname>To <persname>Mrs. Sparks.</persname> ---
<unitdate calendar="gregorian" datechar="single" endYear="1853" era="ce"
startYear="1853">1853.</unitdate>
</unittitle>
</did>
<note>
<p>Letters to Sparks.</p>
<p>ORIGINAL</p>
</note>
</c>
</c>
</c>
I would like to return the c/c/c nodes as is, including element tags but I want them to be sorted based on the value of the <unitid>. The values of the <unitid>'s are alphanumeric, e.g. (MS Sparks 70), (MS Sparks 85).
This is my xslt
<!-- Identity template: copies every attribute and node unchanged. -->
<xsl:template match="@* | node()">
  <xsl:copy>
    <xsl:apply-templates select="@* | node()"/>
  </xsl:copy>
</xsl:template>
<!-- NOTE(review): this template sorts the child nodes of each matched <did>,
     using each child's own unitid/text() as the key. It does not reorder the
     enclosing <c> elements. -->
<xsl:template match="/ead/archdesc[1]/dsc[1]/c[4]/c/c/did">
  <xsl:copy><xsl:apply-templates><xsl:sort select="unitid/text()"></xsl:sort></xsl:apply-templates></xsl:copy>
</xsl:template>
Disregard the predicates in the template match pattern; they refer to a much longer document.
I'm not getting anything different in my result document. It basically replicates the source document.
This is what I would like the output to be:
<c id="hou01965c00173">
<did>
<unitid>(MS Sparks 70) </unitid>
<unittitle>A.B. [Haldimand's secret agent] To E. Apr. <unitdate
calendar="gregorian" datechar="single" endYear="1782" era="ce"
startYear="1782">1782.</unitdate>
</unittitle>
</did>
<note>
<p>Vermont Paps., <date calendar="gregorian" endYear="1782" era="ce"
startYear="1780">1780-1782, </date>89-90.</p>
</note>
</c>
<c id="hou01965c00174">
<did>
<unitid>(MS Sparks 85) </unitid>
<unittitle>
<persname>D'Abancourt </persname>to <persname>Lafayette.</persname>
</unittitle>
</did>
<note>
<p>
<list>
<item>June 26, <date calendar="gregorian" endYear="1792" era="ce"
startYear="1792">1792: </date>178-80. </item>
<item>July 26, <date calendar="gregorian" endYear="1792" era="ce"
startYear="1792">1792: </date>222. </item>
<item>July 30, <date calendar="gregorian" endYear="1792" era="ce"
startYear="1792">1792: </date>222-1.</item>
</list>
</p>
<p>Lafayette Letters, <date calendar="gregorian" endYear="1792" era="ce"
startYear="1777">1777-1792 </date>(2).</p>
</note>
</c>
<c id="hou01965c00179">
<did>
<unitid>(MS Sparks 132) </unitid>
<unittitle>
<persname>Abbot, Benjamin. </persname>
<geogname>Exeter, N. H. </geogname>-Teaching certificate for <persname>Jared
Sparks.</persname> Dec. 18, <unitdate calendar="gregorian"
datechar="single" endYear="1810" era="ce" startYear="1810"
>1810.</unitdate>
</unittitle>
</did>
<note>
<p>Misc. Paps. I, (14)</p>
<p>ORIGINAL</p>
</note>
</c>
<c id="hou01965c00175">
<did>
<unitid>(MS Sparks 153) </unitid>
<unittitle>
<persname>Abbey, W. M. </persname>To <persname>Jared Sparks.</persname> Feb.
19, <unitdate calendar="gregorian" datechar="single" endYear="1857" era="ce"
startYear="1857">1857, </unitdate>
<geogname>Philadelphia.</geogname>
</unittitle>
</did>
<note>
<p>Letters to Sparks.</p>
<p>ORIGINAL</p>
</note>
</c>
<c id="hou01965c00176">
<did>
<unitid>(MS Sparks 153) </unitid>
<unittitle>
<persname>Abbot, Anne W. </persname>To <persname>Mrs. Sparks.</persname> ---
<unitdate calendar="gregorian" datechar="single" endYear="1853" era="ce"
startYear="1853">1853.</unitdate>
</unittitle>
</did>
<note>
<p>Letters to Sparks.</p>
<p>ORIGINAL</p>
</note>
</c>
</c>
</c>

read XML in SQL, no data being pulled

I am trying to retrieve data from an XML file. Below is how the XML doc looks and below that is my SQL code. It will run the code and show column headers - but will not populate with any data. What am I missing?
<profile xmlns="http://feed.elasticstats.com/schema/mma/v1/participants-profile.xsd" generated="2015-12-10T17:34:54Z">
<fighters>
<fighter id="01585452-852a-4b40-a6dc-fdd04279f02c" height="72" weight="170" reach="" stance="" first_name="Sai" nick_name="The Boss" last_name="Wang">
<record wins="6" losses="4" draws="1" no_contests="0" />
<born date="1988-01-16" country_code="UNK" country="Unknown" state="" city="" />
<out_of country_code="UNK" country="Unknown" state="" city="" />
</fighter>
<fighter id="0168dd6b-b3e1-4954-8b71-877a63772dec" height="" weight="0" reach="" stance="" first_name="Enrique" nick_name="Wasabi" last_name="Marin">
<record wins="8" losses="2" draws="0" no_contests="0" />
<born date="" country_code="UNK" country="Unknown" state="" city="" />
<out_of country_code="UNK" country="Unknown" state="" city="" />
</fighter>
-- Load the whole XML file into an XML variable.
DECLARE @x xml
SELECT @x = P
FROM OPENROWSET (BULK 'C:\Python27\outputMMA.xml', SINGLE_BLOB) AS FIGHTERS(P)
-- Parse the document and obtain a handle for OPENXML.
DECLARE @hdoc int
EXEC sp_xml_preparedocument @hdoc OUTPUT, @x
SELECT *
FROM OPENXML (@hdoc, '/fighters/fighter', 1) -- flag 1 = attribute-centric mapping, 2 = element-centric
WITH (
id varchar(100),
height varchar(10),
last_name varchar(100)
) -- list here the fields you want returned
-- Release the parsed document.
EXEC sp_xml_removedocument @hdoc

FROM OPENXML is not the best approach any more. Try it like this:
Just copy this into an empty query window and execute:
-- Self-contained sample: the profile document with its default namespace.
DECLARE @xml XML=
'<profile xmlns="http://feed.elasticstats.com/schema/mma/v1/participants-profile.xsd" generated="2015-12-10T17:34:54Z">
<fighters>
<fighter id="01585452-852a-4b40-a6dc-fdd04279f02c" height="72" weight="170" reach="" stance="" first_name="Sai" nick_name="The Boss" last_name="Wang">
<record wins="6" losses="4" draws="1" no_contests="0" />
<born date="1988-01-16" country_code="UNK" country="Unknown" state="" city="" />
<out_of country_code="UNK" country="Unknown" state="" city="" />
</fighter>
<fighter id="0168dd6b-b3e1-4954-8b71-877a63772dec" height="" weight="0" reach="" stance="" first_name="Enrique" nick_name="Wasabi" last_name="Marin">
<record wins="8" losses="2" draws="0" no_contests="0" />
<born date="" country_code="UNK" country="Unknown" state="" city="" />
<out_of country_code="UNK" country="Unknown" state="" city="" />
</fighter>
</fighters>
</profile>';
-- The default namespace must be declared so the unprefixed element names in
-- the XPath expressions resolve against the document's namespace.
-- One output row per <fighter>; attributes are addressed with '@'.
WITH XMLNAMESPACES(DEFAULT 'http://feed.elasticstats.com/schema/mma/v1/participants-profile.xsd')
SELECT One.fighter.value('@id','uniqueidentifier') AS Fighter_ID
,One.fighter.value('@height','int') AS Fighter_Height
,One.fighter.value('@weight','int') AS Fighter_Weight
,One.fighter.value('@reach','varchar(100)') AS Fighter_Reach
,One.fighter.value('@stance','varchar(100)') AS Fighter_Stance
,One.fighter.value('@first_name','varchar(100)') AS Fighter_FirstName
,One.fighter.value('@nick_name','varchar(100)') AS Fighter_NickName
,One.fighter.value('@last_name','varchar(100)') AS Fighter_LastName
,One.fighter.value('record[1]/@wins','int') AS FighterRecord_Wins
,One.fighter.value('record[1]/@draws','int') AS FighterRecord_Draws
,One.fighter.value('record[1]/@no_contests','int') AS FighterRecord_NoContest
,One.fighter.value('born[1]/@date','date') AS FighterBorn_Date
,One.fighter.value('born[1]/@country_code','varchar(10)') AS FighterBorn_CountryCode
,One.fighter.value('born[1]/@country','varchar(100)') AS FighterBorn_Country
,One.fighter.value('born[1]/@state','varchar(100)') AS FighterBorn_State
,One.fighter.value('born[1]/@city','varchar(100)') AS FighterBorn_City
,One.fighter.value('out_of[1]/@country_code','varchar(10)') AS FighterOutOf_CountryCode
,One.fighter.value('out_of[1]/@country','varchar(100)') AS FighterOutOf_Country
,One.fighter.value('out_of[1]/@state','varchar(100)') AS FighterOutOf_State
,One.fighter.value('out_of[1]/@city','varchar(100)') AS FighterOutOf_City
FROM @xml.nodes('/profile/fighters/fighter') AS One(fighter)

First, repair the data (add the :xs prefix to the namespace declaration, and add the missing </fighters> and </profile> closing tags):
<profile xmlns:xs="http://feed.elasticstats.com/schema/mma/v1/participants-profile.xsd" generated="2015-12-10T17:34:54Z">
<fighters>
<fighter id="01585452-852a-4b40-a6dc-fdd04279f02c" height="72" weight="170" reach="" stance="" first_name="Sai" nick_name="The Boss" last_name="Wang">
<record wins="6" losses="4" draws="1" no_contests="0" />
<born date="1988-01-16" country_code="UNK" country="Unknown" state="" city="" />
<out_of country_code="UNK" country="Unknown" state="" city="" />
</fighter>
<fighter id="0168dd6b-b3e1-4954-8b71-877a63772dec" height="" weight="0" reach="" stance="" first_name="Enrique" nick_name="Wasabi" last_name="Marin">
<record wins="8" losses="2" draws="0" no_contests="0" />
<born date="" country_code="UNK" country="Unknown" state="" city="" />
<out_of country_code="UNK" country="Unknown" state="" city="" />
</fighter>
</fighters>
</profile>
Then the code
FROM OPENXML (@docHandle, 'profile/fighters/fighter', 1)
and we are done
01585452-852a-4b40-a6dc-fdd04279f02c 72 Wang
0168dd6b-b3e1-4954-8b71-877a63772dec Marin

You have an undeclared namespace in your XML document. Consider the revision declaring the namespace and referencing it in xpath expression:
-- Load the whole XML file into an XML variable.
DECLARE @x xml;
SELECT @x = P
FROM OPENROWSET (BULK 'C:\Python27\outputMMA.xml', SINGLE_BLOB) AS FIGHTERS(P)
DECLARE @hdoc int
-- The third argument binds the prefix "doc" to the document's namespace so
-- the OPENXML row pattern below can reference the namespaced elements.
EXEC sp_xml_preparedocument @hdoc OUTPUT, @x,
'<root xmlns:doc="http://feed.elasticstats.com/schema/mma/v1/participants-profile.xsd"/>'
SELECT *
FROM OPENXML (@hdoc, '/doc:profile/doc:fighters/doc:fighter', 1)
WITH (
id varchar(100),
height varchar(10),
last_name varchar(100)
)
-- Release the parsed document.
EXEC sp_xml_removedocument @hdoc

How to parse XML encoded as UTF-8 from a NVARCHAR(MAX) attribute?

I'm facing a problem to parse an XML string stored in a field of type NVARCHAR(MAX) (I cannot change the type of this field).
Here is my table (WorkingHours) :
-- Table holding one calendar definition per row.
CREATE TABLE WorkingHours(
[ID] [int] NOT NULL PRIMARY KEY,
-- XML text stored as NVARCHAR(MAX); a sample value is shown below.
[CONTENT] [nvarchar](MAX) NOT NULL,
-- ... (remaining columns omitted)
);
Here is a sample of the [CONTENT] attribute :
<?xml version="1.0" encoding="UTF-8"?>
<calendar>
<day number="1" worked_day="no">
<interval number="1" begin_hour="08:30" end_hour="12:00"/>
<interval number="2" begin_hour="13:30" end_hour="17:00"/>
<interval number="3" begin_hour="" end_hour=""/></day>
<day number="2" worked_day="no">
<interval number="1" begin_hour="08:30" end_hour="12:00"/>
<interval number="2" begin_hour="13:30" end_hour="17:00"/>
<interval number="3" begin_hour="" end_hour=""/>
</day>
<day number="3" worked_day="no">
<interval number="1" begin_hour="08:30" end_hour="12:00"/>
<interval number="2" begin_hour="13:30" end_hour="17:00"/>
<interval number="3" begin_hour="" end_hour=""/>
</day>
<day number="4" worked_day="no">
<interval number="1" begin_hour="08:30" end_hour="12:00"/>
<interval number="2" begin_hour="13:30" end_hour="17:00"/>
<interval number="3" begin_hour="" end_hour=""/>
</day>
<day number="5" worked_day="no">
<interval number="1" begin_hour="08:30" end_hour="12:00"/>
<interval number="2" begin_hour="13:30" end_hour="17:00"/>
<interval number="3" begin_hour="" end_hour=""/>
</day>
<day number="6" worked_day="no">
<interval number="1" begin_hour="" end_hour=""/>
<interval number="2" begin_hour="" end_hour=""/>
<interval number="3" begin_hour="" end_hour=""/>
</day>
<day number="7" worked_day="no">
<interval number="1" begin_hour="" end_hour=""/>
<interval number="2" begin_hour="" end_hour=""/>
<interval number="3" begin_hour="" end_hour=""/>
</day>
</calendar>
As you can see, the data encoding is UTF-8.
Now, I would like to parse this data in order to perform some calculations:
-- Fetch the raw XML text for one row.
DECLARE @RawContent [nvarchar](MAX) = (
SELECT wh.[CONTENT]
FROM [WorkingHours] wh
WHERE wh.[ID] = 100);
-- Each of the three conversions below fails (KO).
DECLARE @XMLContent [Xml] = @RawContent; -- KO
-- DECLARE @XMLContent [Xml] = CAST(@RawContent AS XML); -- KO
-- DECLARE @XMLContent [Xml] = CONVERT(XML, @RawContent); -- KO
-- Just a test to query XML data.
SELECT
C.WD.value('@number', 'int') AS DayId
FROM @XMLContent.nodes('/calendar/day') AS C(WD);
I don't know how to cast the result (a nvarchar(max) field containing UTF-8 XML string) to a XML value.
SQL Server returns the following error :
"Unable to switch encoding"
It refers to the CAST line (when I define the #XMLContent variable).
Any idea how to solve that?

Strip out the processing directive -- it's meaningless and incorrect because the data is already encoded in UTF-16 (since it's stored as NVARCHAR). If you cannot change the data already present, you'll have to rely on (slightly brittle) string replacement:
CAST(REPLACE(wh.[CONTENT], '<?xml version="1.0" encoding="UTF-8"?>', '') AS XML)
Note that explicitly indicating the encoding is UTF-16 instead will also work -- though it adds nothing.

The other option is to convert to a VARCHAR datatype first - which is non-Unicode - and then to XML:
-- Fetch the raw XML text for one row.
DECLARE @RawContent [nvarchar](MAX) = (
SELECT wh.[CONTENT]
FROM [WorkingHours] wh
WHERE wh.[ID] = 100);
-- Go through VARCHAR (non-Unicode) first, then convert to XML.
-- NOTE(review): characters outside the VARCHAR code page may not survive this cast; confirm against the data.
DECLARE @XMLContent XML = CAST(CAST(@RawContent AS VARCHAR(MAX)) AS XML)
-- Just a test to query XML data.
SELECT
C.WD.value('@number', 'int') AS DayId
FROM @XMLContent.nodes('/calendar/day') AS C(WD);

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

How to put hive table on complex nested XML file - hive

Related

XML parsing using SQL

How can I merge several XML-documents into one without namespace accumulation?

XSLT 1.0 Sort node sets using alphanumeric value of descendant

read XML in SQL, no data being pulled

How to parse XML encoded as UTF-8 from a NVARCHAR(MAX) attribute?

Categories

Resources