Skip to content

Transformer - PDF Text Extract

Description

The PDF Text Extract transformer is designed to extract textual data from PDF documents, converting unstructured PDF content into structured JSON format. This transformer is particularly useful for applications that require automated data extraction from PDF documents, such as invoices, contracts, or any other form of structured document. It leverages advanced text extraction techniques to accurately identify and extract relevant information, making it easier to process and analyze the data.


Config

Parameters

| Parameter | Type | Default | Description |
|---|---|---|---|
| elements | Array | N/A | An array of objects defining the structure of the extracted data. |
| dataSource | Object | N/A | Configuration for the data source, including entity type and specific settings. |
| positionalSort | Boolean | false | Determines whether the extracted data should be sorted based on its position in the document. |

Example

There are multiple ways to set up a PDF TEXT EXTRACT transformer. Using the elements JsonArray gives you much more control over the output structure of the transformer: it allows you to define custom objects containing selected fields read from the PDF document. An example is given below:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
<apiroConf version="1" xmlns="http://apiro.com/apiro/v1/root">
    <!--
        Example data feed INVOICE_PDF (definition EXPR_PDF_FEED, schema INVOICE).

        dataSource       : reads /rudtest/invoice.pdf from the "rudtest" branch of a
                           Git repository. The password is supplied through a SYS
                           placeholder instead of a literal token; never commit real
                           credentials to a configuration file.
                           NOTE(review): how the placeholder is resolved at runtime is
                           not shown here; confirm against the engine documentation.
        elements         : six entries. "biller" and "billee" are OBJECT elements with
                           five fields each, "inv_number", "due_date" and "total" are
                           FIELD elements, and "line_items" is an ARRAY element with
                           six fields.
                           NOTE(review): "line"/"column" presumably locate the value in
                           the extracted text, and "dynamicLineStart"/"dynamicLineEnd"
                           presumably anchor on the line containing the "match" string
                           ("ITEMS", "NOTES:"); confirm the exact semantics.
        explicitMappings : each entry assigns the result of a PAYLOAD expression (or a
                           constant string) to the named dictionary entry.
    -->
    <loadOrder>20</loadOrder>
    <dataFeeds>
        <dataFeed definition="EXPR_PDF_FEED" name="INVOICE_PDF">
            <execPriority>10</execPriority>
            <enabled>true</enabled>
            <push>false</push>
            <pull>true</pull>
            <schema>INVOICE</schema>
            <config><![CDATA[
{
    "dataSource": {
        "entity": "GIT",
        "config": {
            "password": "${SYS:TESTFEED_GIT_PASSWORD}",
            "gitURL": "https://github.com/redapiro/apiro_engine_test_feeds.git",
            "branch": "rudtest",
            "pathPrefix": "/rudtest/invoice.pdf",
            "username": "apirobot"
        }
    },
    "elements": [
        {
            "name": "biller",
            "fields": [
                "name",
                "address",
                "city",
                "country",
                "postcode"
            ],
            "type": "OBJECT",
            "vertical": true,
            "line": 2,
            "column": 1
        },
        {
            "name": "billee",
            "fields": [
                "name",
                "address",
                "city",
                "country",
                "postcode"
            ],
            "type": "OBJECT",
            "vertical": true,
            "line": 8,
            "column": 1
        },
        {
            "name": "inv_number",
            "type": "FIELD",
            "line": 14,
            "column": 1
        },
        {
            "name": "due_date",
            "type": "FIELD",
            "line": 18,
            "column": 1
        },
        {
            "name": "total",
            "type": "FIELD",
            "dynamicLineStart": {
                "colOrLine": 1,
                "match": "NOTES:"
            },
            "line": 2,
            "column": 1
        },
        {
            "name": "line_items",
            "type": "ARRAY",
            "vertical": false,
            "fields": [
                "code",
                "description",
                "quantity",
                "price",
                "tax",
                "amount"
            ],
            "column": 1,
            "line": 1,
            "dynamicLineStart": {
                "colOrLine": 1,
                "match": "ITEMS"
            },
            "dynamicLineEnd": {
                "colOrLine": 1,
                "match": "NOTES:"
            }
        }
    ],
    "explicitMappings": [
        {
            "dictionary": "invoice_number",
            "value": "#{PAYLOAD.resolve('$.inv_number')}"
        },
        {
            "dictionary": "extractor",
            "value": "#{ 'APIRO PDF native' }"
        },
        {
            "dictionary": "full_json",
            "value": "#{PAYLOAD.resolve('$')}"
        },
        {
            "dictionary": "receiver",
            "value": "#{PAYLOAD.resolve('$.billee.name')}"
        },
        {
            "dictionary": "total_amount",
            "value": "#{ PAYLOAD.resolve('$.total')?.replace('$','') }"
        }
    ]
}
]]>
            </config>
        </dataFeed>
    </dataFeeds>
</apiroConf>

Alternatively, a PDF TEXT EXTRACT transformer can be defined in the transformers JsonArray of the data feed config. An example is given below:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
<apiroConf version="1" xmlns="http://apiro.com/apiro/v1/root">
    <!--
        Example data feed PDF_EXTRACT (definition EXPR_JSON_FEED2, schema PDF_EXTRACT).

        The Git data source reads /rudtest/energybills/petros.pdf from the "rudtest"
        branch, taking its password from a SYS placeholder. A single transformer named
        PDF_TO_TEXT with entity PDF_TEXT_EXTRACT is listed under
        dataSource.config.transformers, with positionalSort set to false.
        explicitMappings assigns the result of each PAYLOAD expression to the named
        dictionary entry.
    -->
    <loadOrder>20</loadOrder>
    <envProperties>
    </envProperties>
    <dataFeeds>
        <dataFeed definition="EXPR_JSON_FEED2" name="PDF_EXTRACT">
            <execPriority>10</execPriority>
            <enabled>true</enabled>
            <push>false</push>
            <pull>true</pull>
            <schema>PDF_EXTRACT</schema>
            <config><![CDATA[
{
    "dataSource": {
        "entity": "GIT",
        "config": {
            "password": "${SYS:TESTFEED_GIT_PASSWORD}",
            "gitURL": "https://github.com/redapiro/apiro_engine_test_feeds.git",
            "branch": "rudtest",
            "pathPrefix": "/rudtest/energybills/petros.pdf",
            "username": "apirobot",
            "transformers": [
                {
                    "name": "PDF_TO_TEXT",
                    "entity": "PDF_TEXT_EXTRACT",
                    "config": {
                        "positionalSort": false
                    }
                }
            ]
        }
    },
    "explicitMappings": [
        {
            "dictionary": "full_json",
            "value": "#{PAYLOAD.resolve('$')}"
        },
        {
            "dictionary": "manual_review",
            "value": "#{PAYLOAD.resolve('$.manual_review')}"
        },
        {
            "dictionary": "firstname",
            "value": "#{PAYLOAD.resolve('$.firstname')}"
        },
        {
            "dictionary": "lastname",
            "value": "#{PAYLOAD.resolve('$.lastname')}"
        },
        {
            "dictionary": "service_address",
            "value": "#{PAYLOAD.resolve('$.service_address')}"
        }
    ]
}
]]>
            </config>
        </dataFeed>
    </dataFeeds>
</apiroConf>

Here is a concise portion of the above example, including only the direct structure of the transformer:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
{
    "transformers": [
        {
            "name": "PDF_TO_TEXT",
            "entity": "PDF_TEXT_EXTRACT",
            "config": {
                "positionalSort": false
            }
        }
    ]
}

Common Mistakes

  • Incorrect Data Source Configuration: Ensure that the dataSource configuration, including entity type and specific settings, is correctly set up to access your data source.
  • Misconfigured Elements: Verify that the elements array is correctly defined to match the structure of the PDF document you are processing.
  • Incorrect Mappings: Check that the explicitMappings are correctly defined to map extracted data fields to dictionary entries.
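  • Security Concerns: Do not hard-code sensitive information, such as passwords or access tokens, in your configuration files. Reference it through a placeholder instead, for example "password": "${SYS:TESTFEED_GIT_PASSWORD}", so that the secret itself is never committed or published.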
  • Incorrect Positional Sort Setting: Ensure that the positionalSort setting is correctly configured based on whether you need the extracted data to be sorted by its position in the document.