XSLT Split large single parent node, group into smaller child-nodes
I recently asked this question, but realize I didn't explain it very
clearly. I have a large .csv file (8000k+ lines) composed of invoices,
with multiple lines per invoices. I am parsing that into an XML structure
as shown below (simplified).
Input 1 - $XMLInput
<?xml version="1.0" encoding="UTF-8"?>
<root>
<row>
<invoiceNumber>1</invoiceNumber>
<invoiceText>invoice 1-1</invoiceText>
<position>1<position>
...
</row>
<row>
<invoiceNumber>1</invoiceNumber>
<invoiceText>invoice 1-2</invoiceText>
<position>2<position>
...
</row>
<row>
<invoiceNumber>2</invoiceNumber>
<invoiceText>invoice 2-1</invoiceText>
<position>3<position>
...
</row>
<row>
<invoiceNumber>2</invoiceNumber>
<invoiceText>invoice 2-2</invoiceText>
<position>4<position>
...
</row>
<row>
<invoiceNumber>3</invoiceNumber>
<invoiceText>invoice 3-1</invoiceText>
<position>5<position>
...
</row>
<row>
<invoiceNumber>3</invoiceNumber>
<invoiceText>invoice 3-2</invoiceText>
<position>6<position>
...
</row>
</roow>
Input 2 - $maxBatchSize Description: Break to next batch after it gets
larger than this size (constant)
Input 3 - $countInvoices Description: Total number of unique invoice
numbers in the file (constant)
Input 4 - $listOfInvoices Description: Recurring variable of unique
invoice numbers in document. Example:
<root>
<row>
<invoiceNumber>1</invoiceNumber>
</row>
<row>
<invoiceNumber>2</invoiceNumber>
</row>
<row>
<invoiceNumber>3</invoiceNumber>
</row>
</root>
To improve performance time, I need to group these elements by
invoiceNumber, into batches no bigger than X nodes each (variable to be
imported). From there I will send each batch to a child processor in
parallel, instead of processing the entire original document at once.
E.g., in the example XML doc above, if the batch size could be no larger
than 3, I would need the following XML output:
Output 1 - $XMLOutput
<root>
<batch>
<row>
<invoiceNumber>1</invoiceNumber>
<invoiceText>invoice 1-1</invoiceText>
<position>1<position>
...
</row>
<row>
<invoiceNumber>1</invoiceNumber>
<invoiceText>invoice 1-2</invoiceText>
<position>2<position>
...
</row>
<row>
<invoiceNumber>2</invoiceNumber>
<invoiceText>invoice 2-1</invoiceText>
<position>3<position>
...
</row>
<row>
<invoiceNumber>2</invoiceNumber>
<invoiceText>invoice 2-2</invoiceText>
<position>4<position>
...
</row>
</batch>
<batch>
<row>
<invoiceNumber>3</invoiceNumber>
<invoiceText>invoice 3-1</invoiceText>
<position>5<position>
...
</row>
<row>
<invoiceNumber>3</invoiceNumber>
<invoiceText>invoice 3-2</invoiceText>
<position>6<position>
...
</row>
</batch>
</root>
It is a requirement that all the lines for an invoice are sent in the same
batch. My initial XSLT attempt is below (2.0), I tried to emulate a while
loop, keep appending groups of invoices to the current node by recursively
calling the template. When the max batch size is reached, I recursively
call the batch template to create a new batch. I'm passing the invoice and
batch counter between each recursive call.
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="2.0"
xmlns:bpws="http://schemas.xmlsoap.org/ws/2003/03/business-process/"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:fn="http://www.w3.org/2005/xpath-functions"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
<xsl:variable name="XMLInput"
select="bpws:getVariableData('XMLInputByBU')"/>
<xsl:variable name="listOfInvoices"
select="bpws:getVariableData('listOfUniqueInvoices')"/>
<xsl:variable name="defaultBatchSize"
select="bpws:getVariableData('defaultBatchSize')"/>
<xsl:variable name="countInvoices"
select="bpws:getVariableData('countUniqueInvoices')"/>
<xsl:key name="invoiceNumberKey" match="row" use="invoiceNumber" />
<xsl:template match="/">
<!-- initialize invoice loop counter -->
<xsl:variable name="invoiceCounter" select="1"/>
<xsl:element name="root">
<xsl:call-template name="batch">
<xsl:with-param name="invoiceCounter"
select="$invoiceCounter"/>
</xsl:call-template>
</xsl:element>
</xsl:template>
<xsl:template name="batch">
<!-- new batch, initialize batch loop counter-->
<xsl:param name="invoiceCounter"/>
<xsl:variable name="batchCounter" select="0"/>
<xsl:element name="root">
<xsl:call-template name="invoice">
<xsl:with-param name="invoiceCounter"
select="$invoiceCounter"/>
<xsl:with-param name="batchCounter" select="0"/>
</xsl:call-template>
</xsl:element>
<!-- while there are still invoices in file to process, create
next batch-->
<xsl:if test="$invoiceCounter < $countInvoices">
<xsl:call-template name="batch">
<xsl:with-param name="invoiceCounter"
select="$invoiceCounter"/>
</xsl:call-template>
</xsl:if>
</xsl:template>
<xsl:template name="invoice">
<!-- template grabs all records from file with invoice number-->
<xsl:param name="invoiceCounter"/>
<xsl:param name="batchCounter"/>
<xsl:variable name="invoiceNumber">
<xsl:value-of
select="$listOfInvoices/*:root/*:row[$invoiceCounter]/*:invoiceNumber4z"/>
</xsl:variable>
<!-- count records in next invoice group, increment batch counter
by that amount-->
<xsl:variable name="countBatchSize">
<xsl:value-of
select="count($XMLInput/*:root/*:row[invoiceNumber=$invoiceNumber])"/>
</xsl:variable>
<xsl:variable name="batchCounter">
<xsl:value-of select="$batchCounter + $countBatchSize"/>
</xsl:variable>
<!-- increment invoice Counter-->
<xsl:variable name="invoiceCounter">
<xsl:value-of select="$invoiceCounter + 1"/>
</xsl:variable>
<!-- grab all lines for next invoice-->
<xsl:for-each select="key('invoiceNumberKey',$invoiceNumber)">
<xsl:value-of select='*'/>
</xsl:for-each>
<!-- while the batch is less than max batch size, and there are
still invoices left, recursively call invoice template-->
<xsl:if test="$invoiceCounter < $countInvoices and
$batchCounter < $defaultBatchSize">
<xsl:call-template name="invoice">
<xsl:with-param name="batchCounter" select="$batchCounter"/>
<xsl:with-param name="invoiceCounter"
select="$invoiceCounter"/>
</xsl:call-template>
</xsl:if>
<!-- while batch is too large, but there are still invoices left,
recursively call next batch template-->
<xsl:if test="$invoiceCounter < $countInvoices and
$batchCounter > $defaultBatchSize">
<xsl:call-template name="batch">
<xsl:with-param name="invoiceCounter"
select="$invoiceCounter"/>
</xsl:call-template>
</xsl:if>
<!-- else finished-->
</xsl:template>
</xsl:stylesheet>
The problem with this is when the second time I call the batch template
recursively, I end up with a structure like below:
<root>
<batch>
<row>...</row>
<row>...</row>
<batch>
<row>...</row>
<row>...</row>
</batch>
</batch>
</root>
And I know that variables are immutable, so I can't wait for the first
batch template to finish, and know what the loop counter is to pass to the
next iteration. Any thoughts?
UPDATE: I'm trying to put a for-each-group select=row(position > current
line number and position < current line number + batch size)/invoiceNumber
so it would only select invoice numbers between those ranges. Then I
wouldn't have to push batch/invoice Counts to the next template. Still
trying to work through errors.
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="2.0"
xmlns:bpws="http://schemas.xmlsoap.org/ws/2003/03/business-process/"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:fn="http://www.w3.org/2005/xpath-functions"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
<xsl:variable name="input"
select="bpws:getVariableData('sortedInvoicesByBU')"/>
<xsl:variable name="listOfInvoices"
select="bpws:getVariableData('listOfUniqueInvoices')"/>
<xsl:variable name="defaultBatchSize"
select="bpws:getVariableData('defaultBatchSize')"/>
<xsl:variable name="countInvoices"
select="bpws:getVariableData('countUniqueInvoices')"/>
<xsl:key name="invoiceNumberKey" match="row" use="invoiceNumber" />
<xsl:template match="/">
<!-- initialize invoice loop counter -->
<xsl:variable name="invoiceCounter" select="1"/>
<xsl:variable name="lineCounter" select="1"/>
<xsl:element name="batches">
<xsl:call-template name="batch">
<xsl:with-param name="invoiceCounter"
select="$invoiceCounter"/>
</xsl:call-template>
</xsl:element>
</xsl:template>
<xsl:template name="batch">
<xsl:param name="invoiceCounter"/>
<xsl:param name="lineCounter"/>
<xsl:element name="UPSData">
<!-- For each invoice number between current line number and +
defaultBatchSize, send to invoice template -->
<xsl:for-each-group select="$input/*:UPSData/*:row[position()
> $lineCounter and position() < ($lineCounter +
$defaultBatchSize)]" group-by="invoiceNumber4z">
<xsl:call-template name="invoice">
<xsl:with-param name="invoiceNumber"
select="invoiceNumber4z"/>
</xsl:call-template>
<xsl:variable name="invoiceCounter">
<xsl:value-of select="$invoiceCounter + 1"/>
</xsl:variable>
<xsl:variable name="lineCounter">
<xsl:value-of select="$lineCounter +
count($input/*:UPSData/*:row[invoiceNumber4z=invoiceNumber4z])
+ 1"/>
</xsl:variable>
</xsl:for-each-group>
</xsl:element>
<!-- If still more invoices to process-->
<xsl:if test="$invoiceCounter < $countInvoices">
<xsl:call-template name="batch">
<xsl:with-param name="invoiceCounter"
select="$invoiceCounter"/>
<xsl:with-param name="lineCounter" select="$lineCounter"/>
</xsl:call-template>
</xsl:if>
</xsl:template>
<xsl:template name="invoice">
<xsl:param name="invoiceNumber"/>
<!-- grab all lines for invoice-->
<xsl:for-each select="key('invoiceNumberKey',$invoiceNumber)">
<xsl:value-of select='*'/>
</xsl:for-each>
</xsl:template>
</xsl:stylesheet>
No comments:
Post a Comment