Skip to content

Section 6 - Create Data Cleansers using IntelliJ

Go back to Getting started guide

In this section we will:

Processing Pipeline

Data Cleanser
  1. We will now focus on the data YEARLY_INCOME data point for Tom Jones.
  2. Note: Before proceeding, you must ensure that the YEARLY_INCOME dataType is DECIMAL and not a STRING.
  3. Note: For simplicity, the table below does not show all the available columns.

    SOURCE BAC FIRST_NAME LAST_NAME YEARLY_INCOME
    customers_a.xlsx BAC111111 Tom Jones AUD 100,000
    customers_b.xlsx BAC111111 Tom Jones 87000
  4. Please note the AUD in the first sourced value.

  5. Notice that the specified data type for the data point YEARLY_INCOME is DECIMAL.
    1
    2
    3
    4
    5
    6
        <dataPoint name="YEARLY_INCOME"
                               displayName="Yearly Income"
                               dataType="DECIMAL"
                               canEditValid="false"
                               canEditViolated="true" >
        </dataPoint>
    
  6. Notice the sourced value AUD91000 is causing a Non convertible data type error because the provided value is a STRING not a DECIMAL. incovertible_data_type

  7. So how do we overcome this problem? The sourced value is correct but requires cleansing before it can proceed with the pipeline processors.

  8. We need to implement a cleanser that cleanses values before entering the processing pipeline.

  9. Data cleansers are used to cleanup data points before they enter the processing.

  10. They are defined in the config feeds, in this case we will implement one the feed CUSTOMERS_A_XLSX.
  11. We start by opening FEED_CUSTOMERS_A_XLSX.xml completed in previous sections and also provided below.

      1
      2
      3
      4
      5
      6
      7
      8
      9
     10
     11
     12
     13
     14
     15
     16
     17
     18
     19
     20
     21
     22
     23
     24
     25
     26
     27
     28
     29
     30
     31
     32
     33
     34
     35
     36
     37
     38
     39
     40
     41
     42
     43
     44
     45
     46
     47
     48
     49
     50
     51
     52
     53
     54
     55
     56
     57
     58
     59
     60
     61
     62
     63
     64
     65
     66
     67
     68
     69
     70
     71
     72
     73
     74
     75
     76
     77
     78
     79
     80
     81
     82
     83
     84
     85
     86
     87
     88
     89
     90
     91
     92
     93
     94
     95
     96
     97
     98
     99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
          <?xml version="1.0" encoding="UTF-8"?>
    
            <apiroConf version="1" xmlns="http://apiro.com/apiro/v1/root">
              <groups/>  
              <loadOrder>30</loadOrder>
              <dataFeeds> 
                <dataFeed definition="EXPR_EXCEL_FEED2" name="CUSTOMERS_A_XLSX">
                  <groupTags> 
                    <groupTag>DEFAULT</groupTag> 
                  </groupTags>  
                  <metaData/>  
                  <abstract>false</abstract>  
                  <inheritable>true</inheritable>  
                  <description/>  
                  <execPriority>50</execPriority>  
                  <execPredicate>#GRV{true}</execPredicate>  
                    <config>
                        <![CDATA[{
                            "dataSource": {
                              "config": {
                                "username": "${SYS:APIRO_GITHUB_USERNAME}",
                                "password": "${SYS:APIRO_GITHUB_PW}",
                                "gitURL": "https://github.com/redapiro/apiro_examples.git",
                                "branch": "main",
                                "pathPrefix": "/artifacts/source_files/customers_a.xlsx"
                              },
                              "entity": "GIT"
                            },
                            "dataAtLine": 2,
                            "sheet": "data",
                            "itemLimit": 20,
                            "explicitMappings": [
                              {
                                "dictionary": "BAC",
                                "value": "#GRV{PAYLOAD.resolve('A')}"
                              },
                              {
                                "dictionary": "FIRST_NAME",
                                "value": "#GRV{PAYLOAD.resolve('B')}"
                              },
                              {
                                "dictionary": "LAST_NAME",
                                "value": "#GRV{PAYLOAD.resolve('C')}"
                              },
                              {
                                "dictionary": "ADDRESS",
                                "value": "#GRV{PAYLOAD.resolve('D')}"
                              },
                              {
                                "dictionary": "PHONE_NUMBER",
                                "value": "#GRV{PAYLOAD.resolve('E')}"
                              },
                              {
                                "dictionary": "AGE",
                                "value": "#GRV{PAYLOAD.resolve('F')}"
                              },
                              {
                                "dictionary": "YEARLY_INCOME",
                                "value": "#GRV{PAYLOAD.resolve('G')}"
                              },
                              {
                                "dictionary": "TFN",
                                "value": "#GRV{PAYLOAD.resolve('H')}"
                              },
                              {
                                "dictionary": "PORTFOLIO_VALUE",
                                "value": "#GRV{PAYLOAD.resolve('I')}"
                              },
                              {
                                "dictionary": "COMPANY_NAME",
                                "value": "#GRV{PAYLOAD.resolve('J')}"
                              },
                              {
                                "dictionary": "COMPANY_ADDRESS",
                                "value": "#GRV{PAYLOAD.resolve('K')}"
                              },
                              {
                                "dictionary": "COMPANY_WEBSITE",
                                "value": "#GRV{PAYLOAD.resolve('L')}"
                              },
                              {
                                "dictionary": "PROFILE_IMAGE",
                                "value": "#GRV{PAYLOAD.resolve('M')}"
                              },
                              {
                                "dictionary": "XML_ROOT_DOC",
                                "value": "#GRV{PAYLOAD.resolve('N')}"
                              },
                              {
                                "dictionary": "JSON_ROOT_DOC",
                                "value": "#GRV{PAYLOAD.resolve('O')}"
                              }
                            ]
                          }
                        ]]>
                      </config>
                  <longLived>true</longLived>  
                  <enabled>false</enabled>  
                  <schema>CUSTOMERS</schema>
                  <push>true</push>  
                  <pull>true</pull>  
                  <cronTriggers> 
                    <cronTrigger> 
                      <description>Every day at 6pm (18:00)</description>  
                      <cron>0 0 18 ? * * *</cron> 
                    </cronTrigger> 
                  </cronTriggers> 
                </dataFeed> 
              </dataFeeds>  
              <dataSinks/> 
            </apiroConf>
    

  12. We need to use a GEN_EXPRESS Data Clenaser. The GEN_EXPRESS cleanser enables us to use a Groovy script to implement the required cleansing algorithm.

  13. Data cleansers need to go into the feed config files and not the schema config files.
  14. Replace existing YEARLY_INCOME data point mapping
    1
    2
    3
    4
        {
            "dictionary": "YEARLY_INCOME",
            "value": "#GRV{PAYLOAD.resolve('G')}",
        },
    
  15. with this one which includes a cleanser.
  16. This REMOVE_CURRENCY_FROM_YEARLY_INCOME will remove the first 3 characters of the provided string before converting it to a DECIMAL.

     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
        {
            "dictionary": "YEARLY_INCOME",
            "value": "#GRV{PAYLOAD.resolve('G')}",
            "cleansers" :
            [
                {
                    "name":"REMOVE_CURRENCY_FROM_YEARLY_INCOME",
                    "entity":"GEN_EXPRESS",
                    "config":
                    {
                        "root":"#GRV{ return VALUE.replaceFirst(/^[a-zA-Z]{3}/, '')}"
                    }
                }
            ]
        },
    
  17. Push your configuration change to GIT and deploy as per the instructions provided at the bottom of this page to reload the configuration.

  18. We can see from the screenshot below that the the value AUD91000 was cleansed to 91000

Default cleansed_value Algorithm

  • Apiro provides the option to use AI instead of using traditional programming languages like groovy.
  • The example below will use GPT to achieve the same result but it will cover more cases.
  • eg. Both AUD91000 and 91AUD000 will produce 91000
  • GPT data transfomers are already avaialble in Apiro but GPT and Llama and other Generative AI processors will be available in the next Apiro release release.

Configuration files

Completed configuration files
  • This is the completed FEED_CUSTOMERS_A_XLSX.xml we enhanced in this section which now includes a YEARLY_INCOME cleanser.
      1
      2
      3
      4
      5
      6
      7
      8
      9
     10
     11
     12
     13
     14
     15
     16
     17
     18
     19
     20
     21
     22
     23
     24
     25
     26
     27
     28
     29
     30
     31
     32
     33
     34
     35
     36
     37
     38
     39
     40
     41
     42
     43
     44
     45
     46
     47
     48
     49
     50
     51
     52
     53
     54
     55
     56
     57
     58
     59
     60
     61
     62
     63
     64
     65
     66
     67
     68
     69
     70
     71
     72
     73
     74
     75
     76
     77
     78
     79
     80
     81
     82
     83
     84
     85
     86
     87
     88
     89
     90
     91
     92
     93
     94
     95
     96
     97
     98
     99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
        <?xml version="1.0" encoding="UTF-8"?>
    
        <apiroConf version="1" xmlns="http://apiro.com/apiro/v1/root">
          <groups/>  
          <loadOrder>30</loadOrder>
          <dataFeeds> 
            <dataFeed definition="EXPR_EXCEL_FEED2" name="CUSTOMERS_A_XLSX">
              <groupTags> 
                <groupTag>DEFAULT</groupTag> 
              </groupTags>  
              <metaData/>  
              <abstract>false</abstract>  
              <inheritable>true</inheritable>  
              <description/>  
              <execPriority>50</execPriority>  
              <execPredicate>#GRV{true}</execPredicate>  
                <config>
                    <![CDATA[{
                        "dataSource": {
                          "config": {
                            "username": "${SYS:APIRO_GITHUB_USERNAME}",
                            "password": "${SYS:APIRO_GITHUB_PW}",
                            "gitURL": "https://github.com/redapiro/apiro_examples.git",
                            "branch": "main",
                            "pathPrefix": "/artifacts/source_files/customers_a.xlsx"
                          },
                          "entity": "GIT"
                        },
                        "dataAtLine": 2,
                        "sheet": "data",
                        "itemLimit": 20,
                        "explicitMappings": [
                          {
                            "dictionary": "BAC",
                            "value": "#GRV{PAYLOAD.resolve('A')}"
                          },
                          {
                            "dictionary": "FIRST_NAME",
                            "value": "#GRV{PAYLOAD.resolve('B')}"
                          },
                          {
                            "dictionary": "LAST_NAME",
                            "value": "#GRV{PAYLOAD.resolve('C')}"
                          },
                          {
                            "dictionary": "ADDRESS",
                            "value": "#GRV{PAYLOAD.resolve('D')}"
                          },
                          {
                            "dictionary": "PHONE_NUMBER",
                            "value": "#GRV{PAYLOAD.resolve('E')}"
                          },
                          {
                            "dictionary": "AGE",
                            "value": "#GRV{PAYLOAD.resolve('F')}"
                          },
                          {
                            "dictionary": "YEARLY_INCOME",
                            "value": "#GRV{PAYLOAD.resolve('G')}",
                            "cleansers" :
                            [
                                {
                                    "name":"REMOVE_CURRENCY_FROM_YEARLY_INCOME",
                                    "entity":"GEN_EXPRESS",
                                    "config":
                                    {
                                        "root":"#GRV{ return VALUE.replaceFirst(/^[a-zA-Z]{3}/, '')}"
                                    }
                                }
                            ]
                          },
                          {
                            "dictionary": "TFN",
                            "value": "#GRV{PAYLOAD.resolve('H')}"
                          },
                          {
                              "dictionary" : "PORTFOLIO_VALUE",
                              "value" : "#GRV{PAYLOAD.resolve('I')}",
                          },
                          {
                            "dictionary": "COMPANY_NAME",
                            "value": "#GRV{PAYLOAD.resolve('J')}"
                          },
                          {
                            "dictionary": "COMPANY_ADDRESS",
                            "value": "#GRV{PAYLOAD.resolve('K')}"
                          },
                          {
                            "dictionary": "COMPANY_WEBSITE",
                            "value": "#GRV{PAYLOAD.resolve('L')}"
                          },
                          {
                            "dictionary": "PROFILE_IMAGE",
                            "value": "#GRV{PAYLOAD.resolve('M')}"
                          }
                        ]
                      }
                    ]]>
                  </config>
              <longLived>true</longLived>  
              <enabled>false</enabled>  
              <schema>CUSTOMERS</schema>
              <push>true</push>  
              <pull>true</pull>  
              <cronTriggers> 
                <cronTrigger> 
                  <description>Every day at 6pm (18:00)</description>  
                  <cron>0 0 18 ? * * *</cron> 
                </cronTrigger> 
              </cronTriggers> 
            </dataFeed> 
          </dataFeeds>  
          <dataSinks/> 
        </apiroConf>
    
Deploy config files
  • Follow these steps Config Deployment to deploy and start using your configuration files.