Skip to content

Section 7 - Create Raw data processors using IntelliJ

Go back to Getting started guide

In this section we will:

Processing Pipeline

Capitalise LAST name raw processor

Capitalise LAST name Raw Processor

  1. Below we see how to create a Raw Processor to capitalise all characters in the value of the LAST_NAME data point.
  2. Please first complete Section 5 Data Consolidators before proceeding.
  3. It is important to understand that Raw Processors modify the raw data of each separate feed before the values sourced from multiple feeds are consolidated.
  4. As you can see below, since there are no predefined raw processors that meet our requirements, all we have to do is use a rawDPProcessor of type GEN_EXPRESS and write a Groovy script to implement our specific requirements.
  5. We start by opening, in IntelliJ, the SCHEMA_CUSTOMER.xml file we created in the previous section. Note: This file is used as the starting point for this section.
  6. If you were not able to complete the previous section you could copy the configuration below and paste it into SCHEMA_CUSTOMER.xml to continue with this section.

      1
      2
      3
      4
      5
      6
      7
      8
      9
     10
     11
     12
     13
     14
     15
     16
     17
     18
     19
     20
     21
     22
     23
     24
     25
     26
     27
     28
     29
     30
     31
     32
     33
     34
     35
     36
     37
     38
     39
     40
     41
     42
     43
     44
     45
     46
     47
     48
     49
     50
     51
     52
     53
     54
     55
     56
     57
     58
     59
     60
     61
     62
     63
     64
     65
     66
     67
     68
     69
     70
     71
     72
     73
     74
     75
     76
     77
     78
     79
     80
     81
     82
     83
     84
     85
     86
     87
     88
     89
     90
     91
     92
     93
     94
     95
     96
     97
     98
     99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
        <?xml version="1.0" encoding="UTF-8"?>
        <!-- Apiro configuration for the CUSTOMER schema: identity key, data points and their validators/processors. -->
    
        <apiroConf version="1" xmlns="http://apiro.com/apiro/v1/root">
            <groups/>
            <loadOrder>15</loadOrder>
            <schemas>
                <schema defBacked="false" historical="false" name="CUSTOMER">
                    <groupTags>
                        <groupTag>EXAMPLES</groupTag>
                    </groupTags>
                    <metaData/>
                    <!-- BAC is the identity key of a CUSTOMER record -->
                    <identityKeys>
                        <identityKey>BAC</identityKey>
                    </identityKeys>
    
                    <!-- Data Point descriptions -->
                    <dataPoints>
                        <dataPoint name="BAC"
                                   dataType="STRING"
                                   canEditValid="true"
                                   canEditViolated="true"
                                   displayName="BAC">
                            <nullable>false</nullable>
    
                            <metaData>
                                <item name="piiClassification">
                                    <simpleValues>
                                        <simpleValue>High Risk</simpleValue>
                                    </simpleValues>
                                </item>
                            </metaData>
    
                            <!-- BAC data point processors -->
                            <rawDPValidators/>
                            <rawDPProcessors/>
                            <!--consolidationAlgorithm></consolidationAlgorithm -->
                            <consDPValidators/>
                            <consDPProcessors/>
                        </dataPoint>
    
                        <dataPoint name="FIRST_NAME"
                                   dataType="STRING"
                                   displayName="First Name"
                                   canEditValid="true"
                                   canEditViolated="true">
                            <rawDPValidators>
                                <!-- IN_SET check configured with the options "Tom" and "Bob", ignoring case -->
                                <rawDPValidator name="IN_BAC_SET_CHECK" entity="IN_SET">
                                    <config>
                                        <![CDATA[
                                    {
                                        ignoreCase : true,
                                        options : [ "Tom", "Bob"]
                                    }
                                ]]>
                                    </config>
                                </rawDPValidator>
                            </rawDPValidators>
    
                            <consDPValidators>
                                <consDPValidator name="INVALID_IF_CONSOLIDATED_NULL" entity="NOT_NULL"/>
                            </consDPValidators>
                        </dataPoint>
    
                        <dataPoint name="LAST_NAME" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="LAST NAME"/>
                        <dataPoint name="ADDRESS" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="ADDRESS"/>
                        <dataPoint name="PHONE_NUMBER" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="PHONE NUMBER"/>
                        <dataPoint name="AGE" dataType="INTEGER" canEditValid="true" canEditViolated="true" displayName="Age">
                            <rawDPValidators>
                                <rawDPValidator name="INVALID_IF_NULL" entity="NOT_NULL"/> <!-- The name can be anything and it will appear in data audit/lineage -->
                                <rawDPValidator name="INVALID_IF_NEGATIVE" entity="POSITIVE">
                                    <lateBound>false</lateBound> <!-- This is the default value if one is not specified -->
                                </rawDPValidator>
                            </rawDPValidators>
                        </dataPoint>
                        <dataPoint name="YEARLY_INCOME" canEditValid="false" canEditViolated="true" dataType="DECIMAL" displayName="YEARLY INCOME"/>
                        <dataPoint name="TFN" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="TFN"/>
    
                        <dataPoint name="PORTFOLIO_VALUE"
                                   displayName="Investment Portfolio Value"
                                   dataType="DECIMAL"
                                   canEditValid="false"
                                   canEditViolated="true" >
    
                                <!-- Consolidation script: with two entries the result is 80% of the CUSTOMERS_A_XLSX value
                                     plus 20% of the CUSTOMERS_B_XLSX value; a single remaining entry is returned as is.
                                     NOTE(review): list.remove(null) removes at most one entry, so the size()==0 branch
                                     looks unreachable when both feeds are missing; confirm the intended behaviour. -->
                                <consolidationAlgorithm name="PORTF_VALUE_WEIGHTED_MEAN_01" entity="GEN_EXPRESS">
                                    <config>
                                        <![CDATA[
                                            #GRV{
                                                def list= []
    
                                                list.add(items.get("CUSTOMERS_A_XLSX"))
                                                list.add(items.get("CUSTOMERS_B_XLSX"))
                                                list.remove(null)
    
                                                if(list.size()==0)
                                                    return 0;
                                                else if (list.size() == 1)
                                                    return list[0]
                                                else {
                                                    return (list[0].asDBL()*0.8 + list[1].asDBL()*0.2)
                                                }
                                            }
                                            ]]>
                                    </config>
                                </consolidationAlgorithm>
                        </dataPoint>
    
                        <dataPoint name="COMPANY_NAME" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="COMPANY NAME"/>
                        <dataPoint name="COMPANY_ADDRESS" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="COMPANY ADDRESS"/>
                        <dataPoint name="PROFILE_IMAGE" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="PROFILE_IMAGE"/>
                        <dataPoint name="COMPANY_WEBSITE" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="COMPANY WEBSITE"/>
                        <dataPoint name="XML_ROOT_DOC"  canEditValid="false" canEditViolated="true"  displayName="XML Root Doc" dataType="XML"/>
                        <dataPoint name="JSON_ROOT_DOC"  canEditValid="false" canEditViolated="true"  displayName="JSON Root Doc" dataType="JSON"/>
                    </dataPoints>
                    <schemaAppliedProcessors>
                        <groupTags>
                            <groupTag>DEFAULT</groupTag>
                        </groupTags>
                        <metaData/>
                        <rawDPValidators/>
                        <rawDPProcessors/>
                        <consDPValidators/>
                        <consDPProcessors/>
                        <dataBlockProcessors/>
                    </schemaAppliedProcessors>
                    <alerts/>
                </schema>
            </schemas>
        </apiroConf>
    

  7. Copy the LAST_NAME data descriptor below and use it to replace the existing LAST_NAME data descriptor in the CUSTOMER schema above.

  8. You must now push your updated SCHEMA_CUSTOMER.xml file to GIT and deploy as per the instructions provided at the bottom of this page to reload the configuration.

     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    <!-- LAST_NAME data point with a raw processor that upper-cases the value sourced from each feed. -->
    <dataPoint name="LAST_NAME"
               dataType="STRING"
               displayName="LAST NAME"
               canEditValid="false"
               canEditViolated="true">
        <rawDPProcessors>
            <!-- GEN_EXPRESS processor driven by the Groovy (#GRV) script below: it reads CTX['.'],
                 upper-cases it and writes it back. CTX['.'] is presumably the current raw value of
                 this data point (confirm against the Apiro documentation). -->
            <rawDPProcessor name="CAPITALISE_LAST_NAME_RAW_PROC" entity="GEN_EXPRESS">
                <config>
                    <![CDATA[
                        #GRV{
                            CTX['.'] = CTX['.'].toUpperCase()
                        }
                    ]]>
                </config>
            </rawDPProcessor>
        </rawDPProcessors>
    </dataPoint>
    
  9. We can see from the screenshot below that the sourced Last Names are not capitalised.

    first_name_raw

  10. After the processor is deployed and the feeds are triggered we can see the Last Names capitalised.

    first_name_capitilised

  11. Next, double-click on any last name (e.g. JONES) in the aggregated data table.

    show_data_audit

  12. Finally, on the data audit page for the specific data point LAST_NAME, we can see which processor capitalised all the letters.

    data_audit

Configuration files

Completed configuration files
  • This is the completed CUSTOMER schema configuration file that adds the raw data processor discussed above.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Completed Apiro configuration for the CUSTOMER schema, including the LAST_NAME raw processor. -->

    <apiroConf version="1" xmlns="http://apiro.com/apiro/v1/root">
        <groups/>
        <loadOrder>15</loadOrder>
        <schemas>
            <schema defBacked="false" historical="false" name="CUSTOMER">
                <groupTags>
                    <groupTag>EXAMPLES</groupTag>
                </groupTags>
                <metaData/>
                <!-- BAC is the identity key of a CUSTOMER record -->
                <identityKeys>
                    <identityKey>BAC</identityKey>
                </identityKeys>

                <!-- Data Point descriptions -->
                <dataPoints>
                    <dataPoint name="BAC"
                               dataType="STRING"
                               canEditValid="true"
                               canEditViolated="true"
                               displayName="BAC">
                        <nullable>false</nullable>

                        <metaData>
                            <item name="piiClassification">
                                <simpleValues>
                                    <simpleValue>High Risk</simpleValue>
                                </simpleValues>
                            </item>
                        </metaData>

                        <!-- BAC data point processors -->
                        <rawDPValidators/>
                        <rawDPProcessors/>
                        <!--consolidationAlgorithm></consolidationAlgorithm -->
                        <consDPValidators/>
                        <consDPProcessors/>
                    </dataPoint>

                    <dataPoint name="FIRST_NAME"
                               dataType="STRING"
                               displayName="First Name"
                               canEditValid="true"
                               canEditViolated="true">
                        <rawDPValidators>
                            <!-- IN_SET check configured with the options "Tom" and "Bob", ignoring case -->
                            <rawDPValidator name="IN_BAC_SET_CHECK" entity="IN_SET">
                                <config>
                                    <![CDATA[
                                {
                                    ignoreCase : true,
                                    options : [ "Tom", "Bob"]
                                }
                            ]]>
                                </config>
                            </rawDPValidator>
                        </rawDPValidators>

                        <consDPValidators>
                            <consDPValidator name="INVALID_IF_CONSOLIDATED_NULL" entity="NOT_NULL"/>
                        </consDPValidators>
                    </dataPoint>

                    <dataPoint name="LAST_NAME" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="LAST NAME">
                        <rawDPProcessors>
                            <!-- Raw processor added in this section: upper-cases the LAST_NAME value of each feed
                                 before consolidation. CTX['.'] is presumably the current raw value of this data
                                 point (confirm against the Apiro documentation). -->
                            <rawDPProcessor name="CAPITALISE_LAST_NAME_RAW_PROC" entity="GEN_EXPRESS">
                                <config>
                                    <![CDATA[
                                        #GRV{
                                            CTX['.'] = CTX['.'].toUpperCase()
                                        }
                                    ]]>
                                </config>

                            </rawDPProcessor>
                        </rawDPProcessors>
                    </dataPoint>

                    <dataPoint name="ADDRESS" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="ADDRESS"/>
                    <dataPoint name="PHONE_NUMBER" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="PHONE NUMBER"/>
                    <dataPoint name="AGE" dataType="INTEGER" canEditValid="true" canEditViolated="true" displayName="Age">
                        <rawDPValidators>
                            <rawDPValidator name="INVALID_IF_NULL" entity="NOT_NULL"/> <!-- The name can be anything and it will appear in data audit/lineage -->
                            <rawDPValidator name="INVALID_IF_NEGATIVE" entity="POSITIVE">
                                <lateBound>false</lateBound> <!-- This is the default value if one is not specified -->
                            </rawDPValidator>
                        </rawDPValidators>
                    </dataPoint>
                    <dataPoint name="YEARLY_INCOME" canEditValid="false" canEditViolated="true" dataType="DECIMAL" displayName="YEARLY INCOME"/>
                    <dataPoint name="TFN" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="TFN"/>

                    <dataPoint name="PORTFOLIO_VALUE"
                               displayName="Investment Portfolio Value"
                               dataType="DECIMAL"
                               canEditValid="false"
                               canEditViolated="true" >

                            <!-- Consolidation script: with two entries the result is 80% of the CUSTOMERS_A_XLSX value
                                 plus 20% of the CUSTOMERS_B_XLSX value; a single remaining entry is returned as is.
                                 NOTE(review): list.remove(null) removes at most one entry, so the size()==0 branch
                                 looks unreachable when both feeds are missing; confirm the intended behaviour. -->
                            <consolidationAlgorithm name="PORTF_VALUE_WEIGHTED_MEAN_01" entity="GEN_EXPRESS">
                                <config>
                                    <![CDATA[
                                        #GRV{
                                            def list= []

                                            list.add(items.get("CUSTOMERS_A_XLSX"))
                                            list.add(items.get("CUSTOMERS_B_XLSX"))
                                            list.remove(null)

                                            if(list.size()==0)
                                                return 0;
                                            else if (list.size() == 1)
                                                return list[0]
                                            else {
                                                return (list[0].asDBL()*0.8 + list[1].asDBL()*0.2)
                                            }
                                        }
                                        ]]>
                                </config>
                            </consolidationAlgorithm>
                    </dataPoint>

                    <dataPoint name="COMPANY_NAME" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="COMPANY NAME"/>
                    <dataPoint name="COMPANY_ADDRESS" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="COMPANY ADDRESS"/>
                    <dataPoint name="PROFILE_IMAGE" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="PROFILE_IMAGE"/>
                    <dataPoint name="COMPANY_WEBSITE" canEditValid="false" canEditViolated="true" dataType="STRING" displayName="COMPANY WEBSITE"/>
                    <dataPoint name="XML_ROOT_DOC"  canEditValid="false" canEditViolated="true"  displayName="XML Root Doc" dataType="XML"/>
                    <dataPoint name="JSON_ROOT_DOC"  canEditValid="false" canEditViolated="true"  displayName="JSON Root Doc" dataType="JSON"/>
                </dataPoints>
                <schemaAppliedProcessors>
                    <groupTags>
                        <groupTag>DEFAULT</groupTag>
                    </groupTags>
                    <metaData/>
                    <rawDPValidators/>
                    <rawDPProcessors/>
                    <consDPValidators/>
                    <consDPProcessors/>
                    <dataBlockProcessors/>
                </schemaAppliedProcessors>
                <alerts/>
            </schema>
        </schemas>
    </apiroConf>

Deploy config files
  • Follow these steps Config Deployment to deploy and start using your configuration files.