diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..2abf704
Binary files /dev/null and b/.DS_Store differ
diff --git a/API.md b/API.md
index 79e94bd..44dea85 100644
--- a/API.md
+++ b/API.md
@@ -3525,6 +3525,234 @@ public readonly configuratorFunction: IFunction;
---
+### TextractComprehendMedical
+
+This construct takes in a manifest definition or a plain JSON object with an `s3Path` attribute.
+
+example s3Path:
+{"s3Path": "s3://bucketname/prefix/image.png"}
+
+
+It then generates the `numberOfPages` and `mime` attributes on the context.
+The mime type is checked against the mime types supported by Textract; if the check fails, an exception is raised and the workflow fails.
+
+Example (Python)
+```python
+decider_task_id = tcdk.TextractPOCDecider(
+self,
+f"InsuranceDecider",
+)
+```
+
+#### Initializers
+
+```typescript
+import { TextractComprehendMedical } from 'amazon-textract-idp-cdk-constructs'
+
+new TextractComprehendMedical(parent: Construct, id: string, props: TextractComprehendMedicalProps)
+```
+
+| **Name** | **Type** | **Description** |
+| --- | --- | --- |
+| `parent` | `constructs.Construct` | *No description.* |
+| `id` | `string` | Descriptive identifier for this chainable. |
+| `props` | `TextractComprehendMedicalProps` | *No description.* |
+
+---
+
+##### `parent`Required
+
+- *Type:* constructs.Construct
+
+---
+
+##### `id`Required
+
+- *Type:* string
+
+Descriptive identifier for this chainable.
+
+---
+
+##### `props`Required
+
+- *Type:* TextractComprehendMedicalProps
+
+---
+
+#### Methods
+
+| **Name** | **Description** |
+| --- | --- |
+| `toString` | Returns a string representation of this construct. |
+| `next` | Continue normal execution with the given state. |
+| `prefixStates` | Prefix the IDs of all states in this state machine fragment. |
+| `toSingleState` | Wrap all states in this state machine fragment up into a single state. |
+
+---
+
+##### `toString`
+
+```typescript
+public toString(): string
+```
+
+Returns a string representation of this construct.
+
+##### `next`
+
+```typescript
+public next(next: IChainable): Chain
+```
+
+Continue normal execution with the given state.
+
+###### `next`Required
+
+- *Type:* aws-cdk-lib.aws_stepfunctions.IChainable
+
+---
+
+##### `prefixStates`
+
+```typescript
+public prefixStates(prefix?: string): StateMachineFragment
+```
+
+Prefix the IDs of all states in this state machine fragment.
+
+Use this to avoid multiple copies of the state machine all having the
+same state IDs.
+
+###### `prefix`Optional
+
+- *Type:* string
+
+The prefix to add.
+
+Will use construct ID by default.
+
+---
+
+##### `toSingleState`
+
+```typescript
+public toSingleState(options?: SingleStateOptions): Parallel
+```
+
+Wrap all states in this state machine fragment up into a single state.
+
+This can be used to add retry or error handling onto this state
+machine fragment.
+
+Be aware that this changes the result of the inner state machine
+to be an array with the result of the state machine in it. Adjust
+your paths accordingly. For example, change 'outputPath' to
+'$[0]'.
+
+###### `options`Optional
+
+- *Type:* aws-cdk-lib.aws_stepfunctions.SingleStateOptions
+
+---
+
+#### Static Functions
+
+| **Name** | **Description** |
+| --- | --- |
+| `isConstruct` | Checks if `x` is a construct. |
+
+---
+
+##### ~~`isConstruct`~~
+
+```typescript
+import { TextractComprehendMedical } from 'amazon-textract-idp-cdk-constructs'
+
+TextractComprehendMedical.isConstruct(x: any)
+```
+
+Checks if `x` is a construct.
+
+###### `x`Required
+
+- *Type:* any
+
+Any object.
+
+---
+
+#### Properties
+
+| **Name** | **Type** | **Description** |
+| --- | --- | --- |
+| `node` | `constructs.Node` | The tree node. |
+| `endStates` | `aws-cdk-lib.aws_stepfunctions.INextable[]` | The states to chain onto if this fragment is used. |
+| `id` | `string` | Descriptive identifier for this chainable. |
+| `startState` | `aws-cdk-lib.aws_stepfunctions.State` | The start state of this state machine fragment. |
+| `textractComprehendMedicalFunction` | `aws-cdk-lib.aws_lambda.IFunction` | *No description.* |
+
+---
+
+##### `node`Required
+
+```typescript
+public readonly node: Node;
+```
+
+- *Type:* constructs.Node
+
+The tree node.
+
+---
+
+##### `endStates`Required
+
+```typescript
+public readonly endStates: INextable[];
+```
+
+- *Type:* aws-cdk-lib.aws_stepfunctions.INextable[]
+
+The states to chain onto if this fragment is used.
+
+---
+
+##### `id`Required
+
+```typescript
+public readonly id: string;
+```
+
+- *Type:* string
+
+Descriptive identifier for this chainable.
+
+---
+
+##### `startState`Required
+
+```typescript
+public readonly startState: State;
+```
+
+- *Type:* aws-cdk-lib.aws_stepfunctions.State
+
+The start state of this state machine fragment.
+
+---
+
+##### `textractComprehendMedicalFunction`Required
+
+```typescript
+public readonly textractComprehendMedicalFunction: IFunction;
+```
+
+- *Type:* aws-cdk-lib.aws_lambda.IFunction
+
+---
+
+
### TextractGenerateCSV
Generates a output based on Textract Forms and Queries. Supported output_types: "LINES" | "CSV".
@@ -8241,6 +8469,130 @@ public readonly lambdaTimeout: number;
---
+### TextractComprehendMedicalProps
+
+#### Initializer
+
+```typescript
+import { TextractComprehendMedicalProps } from 'amazon-textract-idp-cdk-constructs'
+
+const textractComprehendMedicalProps: TextractComprehendMedicalProps = { ... }
+```
+
+#### Properties
+
+| **Name** | **Type** | **Description** |
+| --- | --- | --- |
+| `comprehendMedicalJobType` | `string` | *No description.* |
+| `comprehendMedicalRoleName` | `string` | *No description.* |
+| `inputPolicyStatements` | `aws-cdk-lib.aws_iam.PolicyStatement[]` | List of PolicyStatements to attach to the Lambda function for S3 GET and LIST. |
+| `lambdaLogLevel` | `string` | *No description.* |
+| `lambdaMemoryMB` | `number` | memory of Lambda function (may need to increase for larger documents). |
+| `lambdaTimeout` | `number` | *No description.* |
+| `s3InputBucket` | `string` | *No description.* |
+| `s3InputPrefix` | `string` | prefix for the incoming document. |
+| `textractComprehendMedicalFunction` | `aws-cdk-lib.aws_lambda.IFunction` | *No description.* |
+
+---
+
+##### `comprehendMedicalJobType`Optional
+
+```typescript
+public readonly comprehendMedicalJobType: string;
+```
+
+- *Type:* string
+
+---
+
+##### `comprehendMedicalRoleName`Optional
+
+```typescript
+public readonly comprehendMedicalRoleName: string;
+```
+
+- *Type:* string
+
+---
+
+##### `inputPolicyStatements`Optional
+
+```typescript
+public readonly inputPolicyStatements: PolicyStatement[];
+```
+
+- *Type:* aws-cdk-lib.aws_iam.PolicyStatement[]
+
+List of PolicyStatements to attach to the Lambda function for S3 GET and LIST.
+
+---
+
+##### `lambdaLogLevel`Optional
+
+```typescript
+public readonly lambdaLogLevel: string;
+```
+
+- *Type:* string
+
+---
+
+##### `lambdaMemoryMB`Optional
+
+```typescript
+public readonly lambdaMemoryMB: number;
+```
+
+- *Type:* number
+
+memory of Lambda function (may need to increase for larger documents).
+
+---
+
+##### `lambdaTimeout`Optional
+
+```typescript
+public readonly lambdaTimeout: number;
+```
+
+- *Type:* number
+
+---
+
+##### `s3InputBucket`Optional
+
+```typescript
+public readonly s3InputBucket: string;
+```
+
+- *Type:* string
+
+---
+
+##### `s3InputPrefix`Optional
+
+```typescript
+public readonly s3InputPrefix: string;
+```
+
+- *Type:* string
+
+prefix for the incoming document.
+
+Will be used to create role
+
+---
+
+##### `textractComprehendMedicalFunction`Optional
+
+```typescript
+public readonly textractComprehendMedicalFunction: IFunction;
+```
+
+- *Type:* aws-cdk-lib.aws_lambda.IFunction
+
+---
+
### TextractDPPOCDeciderProps
#### Initializer
diff --git a/lambda/.DS_Store b/lambda/.DS_Store
new file mode 100644
index 0000000..eb78fcd
Binary files /dev/null and b/lambda/.DS_Store differ
diff --git a/lambda/textract_comprehend_medical/Dockerfile b/lambda/textract_comprehend_medical/Dockerfile
new file mode 100644
index 0000000..d00d9a0
--- /dev/null
+++ b/lambda/textract_comprehend_medical/Dockerfile
@@ -0,0 +1,10 @@
+FROM public.ecr.aws/lambda/python:3.9-x86_64
+RUN /var/lang/bin/python -m pip install --upgrade pip
+COPY app/requirements.txt ${LAMBDA_TASK_ROOT}/
+RUN python -m pip install -r ${LAMBDA_TASK_ROOT}/requirements.txt --target "${LAMBDA_TASK_ROOT}"
+
+# Copy function code
+COPY app/* ${LAMBDA_TASK_ROOT}/
+
+# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
+CMD [ "main.handler" ]
diff --git a/lambda/textract_comprehend_medical/app/__init__.py b/lambda/textract_comprehend_medical/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lambda/textract_comprehend_medical/app/entry.sh b/lambda/textract_comprehend_medical/app/entry.sh
new file mode 100644
index 0000000..3f144fa
--- /dev/null
+++ b/lambda/textract_comprehend_medical/app/entry.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then
+ exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m main $1
+else
+ exec /usr/local/bin/python -m main $1
+fi
diff --git a/lambda/textract_comprehend_medical/app/main.py b/lambda/textract_comprehend_medical/app/main.py
new file mode 100644
index 0000000..66a9cd8
--- /dev/null
+++ b/lambda/textract_comprehend_medical/app/main.py
@@ -0,0 +1,73 @@
+import json
+import uuid
+import boto3
+import os
+import traceback
+import trp
+import logging
+from urllib.parse import urlparse
+
+# Module-level logger. LOG_LEVEL must be the name of a level attribute of the
+# logging module (e.g. DEBUG, INFO); any other value makes getattr() raise at import time.
+logger = logging.getLogger('SendToComprehendMedical')
+logger.addHandler(logging.StreamHandler())
+logger.setLevel(getattr(logging, os.getenv('LOG_LEVEL', 'INFO')))
+# S3 client used by the handler to read the Textract output and write the extracted text.
+client = boto3.client('s3')
+# Comprehend Medical client; the handler looks up the job-start method on it by name.
+cm_client = boto3.client('comprehendmedical')
+# Pairs of (COMPREHEND_MEDICAL_JOB_TYPE value, name of the cm_client method that starts that job).
+cm_job_types = [('ICD10', 'start_icd10_cm_inference_job'),
+ ('SNOMEDCT', 'start_snomedct_inference_job'),
+ ('RXNORM', 'start_rx_norm_inference_job'),
+ ('DETECT_ENTITIES_V2', 'start_entities_detection_v2_job'),
+ ('DETECT_PHI', 'start_phi_detection_job')]
+
+
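+# NOTE(review): the rationale below reads as if it was carried over from the
+# asynchronous Textract Lambda. This module only starts a Comprehend Medical job
+# and returns; it does not poll Textract or use SNS. Confirm and reword.
+# Broken out into a separate Lambda function in case there are slowdowns in Textract,
+# the document is very large, or in general if the time needed for Textract exceeds
+# the 15-minute limit of Lambda. Also, there is no need to leave a Lambda function running
+# while it actively polls Textract when Textract is perfectly happy notifying us on
+# SNS when it's ready for us to come back :-)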
+
+def handler(event, context):
+ """Start an Amazon Comprehend Medical job on the text of a Textract result.
+
+ The job type is read from the COMPREHEND_MEDICAL_JOB_TYPE environment variable and
+ matched against cm_job_types. If it matches none of them, a message is logged
+ and the function returns None without doing any work.
+
+ Otherwise the Textract JSON referenced by
+ event['textract_result']['TextractOutputJsonPath'] is downloaded from S3, the text of
+ its pages is concatenated and uploaded to the same bucket under
+ textract-output/text/<job_type>/<job_name>/, and the job is started with that prefix
+ as input and cm-output/json/<job_type>/<job_name> as output.
+
+ Args:
+ event: Lambda event; must contain a truthy 'textract_result' entry.
+ context: Lambda context object (not used).
+
+ Raises:
+ RuntimeError: if the event has no 'textract_result'.
+ Exception: any other error is printed with its traceback and re-raised.
+ """
+ start_job = None
+ job_type = os.getenv('COMPREHEND_MEDICAL_JOB_TYPE')
+ # Resolve the configured job type to the matching boto3 client method.
+ for job in cm_job_types:
+ if job_type == job[0]:
+ start_job = getattr(cm_client, job[1])
+ if not start_job:
+ # NOTE(review): this is logged at INFO and returns normally, so the caller sees success.
+ logger.info('There is no valid COMPREHEND_MEDICAL_JOB_TYPE set.')
+ return
+
+ try:
+ if event.get('textract_result'):
+ output_json = event['textract_result']['TextractOutputJsonPath']
+ # Split the s3://bucket/key URL into bucket and key (leading '/' stripped from the path).
+ bucket = urlparse(output_json).hostname
+ object_key = urlparse(output_json).path[1:]
+ logger.debug(f'Bucket: {bucket}')
+ logger.debug(f'Key: {object_key}')
+ resp = client.get_object(Bucket=bucket, Key=object_key)
+ blocks = json.loads(resp['Body'].read())
+ document = trp.Document(blocks)
+ logger.info(f'The document has {len(document.pages)} pages')
+ # TODO We can add Bedrock here to send multiple pages to CM based on the context
+ text_content = ""
+ # A random job name keeps the input and output prefixes unique per invocation.
+ job_name = f'job-{uuid.uuid4()}'
+ object_name = f'textract-output/text/{job_type}/{job_name}/{job_name}.txt'
+ for page in document.pages:
+ text_content += page.text
+ client.put_object(Bucket=bucket, Key=object_name, Body=str.encode(text_content))
+ # S3Key below is the prefix that contains the text object written above.
+ start_job(
+ InputDataConfig={
+ 'S3Bucket': bucket,
+ 'S3Key': f'textract-output/text/{job_type}/{job_name}'
+ },
+ OutputDataConfig={
+ 'S3Bucket': bucket,
+ 'S3Key': f'cm-output/json/{job_type}/{job_name}'
+ },
+ JobName=job_name,
+ DataAccessRoleArn=os.getenv('COMPREHEND_MEDICAL_ROLE'),
+ LanguageCode='en'
+ )
+ else:
+ raise RuntimeError('Invalid lambda event.')
+ except Exception as e:
+ traceback.print_exc()
+ raise e
\ No newline at end of file
diff --git a/lambda/textract_comprehend_medical/app/requirements.txt b/lambda/textract_comprehend_medical/app/requirements.txt
new file mode 100644
index 0000000..9128fc3
--- /dev/null
+++ b/lambda/textract_comprehend_medical/app/requirements.txt
@@ -0,0 +1,2 @@
+requests
+requests_auth_aws_sigv4
diff --git a/lambda/textract_comprehend_medical/app/trp.py b/lambda/textract_comprehend_medical/app/trp.py
new file mode 100644
index 0000000..ee7e8cc
--- /dev/null
+++ b/lambda/textract_comprehend_medical/app/trp.py
@@ -0,0 +1,700 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+class BoundingBox:
+ def __init__(self, width, height, left, top):
+ self._width = width
+ self._height = height
+ self._left = left
+ self._top = top
+
+ def __str__(self):
+ return "width: {}, height: {}, left: {}, top: {}".format(self._width, self._height, self._left, self._top)
+
+ @property
+ def width(self):
+ return self._width
+
+ @property
+ def height(self):
+ return self._height
+
+ @property
+ def left(self):
+ return self._left
+
+ @property
+ def top(self):
+ return self._top
+
+
+class Polygon:
+ def __init__(self, x, y):
+ self._x = x
+ self._y = y
+
+ def __str__(self):
+ return "x: {}, y: {}".format(self._x, self._y)
+
+ @property
+ def x(self):
+ return self._x
+
+ @property
+ def y(self):
+ return self._y
+
+
+class Geometry:
+ def __init__(self, geometry):
+ bounding_box = geometry["BoundingBox"]
+ polygon = geometry["Polygon"]
+ bb = BoundingBox(bounding_box["Width"], bounding_box["Height"], bounding_box["Left"], bounding_box["Top"])
+ pgs = []
+ for pg in polygon:
+ pgs.append(Polygon(pg["X"], pg["Y"]))
+
+ self._bounding_box = bb
+ self._polygon = pgs
+
+ def __str__(self):
+ s = "BoundingBox: {}\n".format(str(self._bounding_box))
+ return s
+
+ @property
+ def bounding_box(self):
+ return self._bounding_box
+
+ @property
+ def polygon(self):
+ return self._polygon
+
+
+class Word:
+ def __init__(self, block, block_map):
+ self._block = block
+ self._confidence = block['Confidence']
+ self._geometry = Geometry(block['Geometry'])
+ self._id = block['Id']
+ self._text = ""
+ if block['Text']:
+ self._text = block['Text']
+
+ def __str__(self):
+ return self._text
+
+ @property
+ def confidence(self):
+ return self._confidence
+
+ @property
+ def geometry(self):
+ return self._geometry
+
+ @property
+ def id(self):
+ return self._id
+
+ @property
+ def text(self):
+ return self._text
+
+ @property
+ def block(self):
+ return self._block
+
+
+class Line:
+ def __init__(self, block, block_map):
+
+ self._block = block
+ self._confidence = block['Confidence']
+ self._geometry = Geometry(block['Geometry'])
+ self._id = block['Id']
+
+ self._text = ""
+ if block['Text']:
+ self._text = block['Text']
+
+ self._words = []
+ if 'Relationships' in block and block['Relationships']:
+ for rs in block['Relationships']:
+ if rs['Type'] == 'CHILD':
+ for cid in rs['Ids']:
+ if cid not in block_map:
+ continue
+ if block_map[cid]["BlockType"] == "WORD":
+ self._words.append(Word(block_map[cid], block_map))
+
+ def __str__(self):
+ s = "Line\n==========\n"
+ s = s + self._text + "\n"
+ s = s + "Words\n----------\n"
+ for word in self._words:
+ s = s + "[{}]".format(str(word))
+ return s
+
+ @property
+ def confidence(self):
+ return self._confidence
+
+ @property
+ def geometry(self):
+ return self._geometry
+
+ @property
+ def id(self):
+ return self._id
+
+ @property
+ def words(self):
+ return self._words
+
+ @property
+ def text(self):
+ return self._text
+
+ @property
+ def block(self):
+ return self._block
+
+
+class SelectionElement:
+ def __init__(self, block, block_map):
+ self._confidence = block['Confidence']
+ self._geometry = Geometry(block['Geometry'])
+ self._id = block['Id']
+ self._selectionStatus = block['SelectionStatus']
+
+ @property
+ def confidence(self):
+ return self._confidence
+
+ @property
+ def geometry(self):
+ return self._geometry
+
+ @property
+ def id(self):
+ return self._id
+
+ @property
+ def selection_status(self):
+ return self._selectionStatus
+
+
+class FieldKey:
+ def __init__(self, block, children, block_map):
+ self._block = block
+ self._confidence = block['Confidence']
+ self._geometry = Geometry(block['Geometry'])
+ self._id = block['Id']
+ self._text = ""
+ self._content = []
+
+ t = []
+
+ for eid in children:
+ wb = block_map[eid]
+ if wb['BlockType'] == "WORD":
+ w = Word(wb, block_map)
+ self._content.append(w)
+ t.append(w.text)
+
+ if t:
+ self._text = ' '.join(t)
+
+ def __str__(self):
+ return self._text
+
+ @property
+ def confidence(self):
+ return self._confidence
+
+ @property
+ def geometry(self):
+ return self._geometry
+
+ @property
+ def id(self):
+ return self._id
+
+ @property
+ def content(self):
+ return self._content
+
+ @property
+ def text(self):
+ return self._text
+
+ @property
+ def block(self):
+ return self._block
+
+
+class FieldValue:
+ def __init__(self, block, children, block_map):
+ self._block = block
+ self._confidence = block['Confidence']
+ self._geometry = Geometry(block['Geometry'])
+ self._id = block['Id']
+ self._text = ""
+ self._content = []
+
+ t = []
+
+ for eid in children:
+ wb = block_map[eid]
+ if wb['BlockType'] == "WORD":
+ w = Word(wb, block_map)
+ self._content.append(w)
+ t.append(w.text)
+ elif wb['BlockType'] == "SELECTION_ELEMENT":
+ se = SelectionElement(wb, block_map)
+ self._content.append(se)
+ self._text = se.selection_status
+
+ if t:
+ self._text = ' '.join(t)
+
+ def __str__(self):
+ return self._text
+
+ @property
+ def confidence(self):
+ return self._confidence
+
+ @property
+ def geometry(self):
+ return self._geometry
+
+ @property
+ def id(self):
+ return self._id
+
+ @property
+ def content(self):
+ return self._content
+
+ @property
+ def text(self):
+ return self._text
+
+ @property
+ def block(self):
+ return self._block
+
+
+class Field:
+ def __init__(self, block, block_map):
+ self._key = None
+ self._value = None
+
+ for item in block['Relationships']:
+ if item["Type"] == "CHILD":
+ self._key = FieldKey(block, item['Ids'], block_map)
+ elif item["Type"] == "VALUE":
+ for eid in item['Ids']:
+ vkvs = block_map[eid]
+ if 'VALUE' in vkvs['EntityTypes']:
+ if 'Relationships' in vkvs:
+ for v_item in vkvs['Relationships']:
+ if v_item["Type"] == "CHILD":
+ self._value = FieldValue(vkvs, v_item['Ids'], block_map)
+
+ def __str__(self):
+ s = "\nField\n==========\n"
+ k = ""
+ v = ""
+ if self._key:
+ k = str(self._key)
+ if self._value:
+ v = str(self._value)
+ s = s + "Key: {}\nValue: {}".format(k, v)
+ return s
+
+ @property
+ def key(self):
+ return self._key
+
+ @property
+ def value(self):
+ return self._value
+
+
+class Form:
+ def __init__(self):
+ self._fields = []
+ self._fields_map = {}
+
+ def add_field(self, field):
+ self._fields.append(field)
+ self._fields_map[field.key.text] = field
+
+ def __str__(self):
+ s = ""
+ for field in self._fields:
+ s = s + str(field) + "\n"
+ return s
+
+ @property
+ def fields(self):
+ return self._fields
+
+ def get_field_by_key(self, key):
+ field = None
+ if key in self._fields_map:
+ field = self._fields_map[key]
+ return field
+
+ def search_fields_by_key(self, key):
+ search_key = key.lower()
+ results = []
+ for field in self._fields:
+ if field.key and search_key in field.key.text.lower():
+ results.append(field)
+ return results
+
+
+class Cell:
+
+ def __init__(self, block, block_map):
+ self._block = block
+ self._confidence = block['Confidence']
+ self._rowIndex = block['RowIndex']
+ self._columnIndex = block['ColumnIndex']
+ self._rowSpan = block['RowSpan']
+ self._columnSpan = block['ColumnSpan']
+ self._geometry = Geometry(block['Geometry'])
+ self._id = block['Id']
+ self._content = []
+ self._text = ""
+ if 'Relationships' in block and block['Relationships']:
+ for rs in block['Relationships']:
+ if rs['Type'] == 'CHILD':
+ for cid in rs['Ids']:
+ block_type = block_map[cid]["BlockType"]
+ if block_type == "WORD":
+ w = Word(block_map[cid], block_map)
+ self._content.append(w)
+ self._text = self._text + w.text + ' '
+ elif block_type == "SELECTION_ELEMENT":
+ se = SelectionElement(block_map[cid], block_map)
+ self._content.append(se)
+ self._text = self._text + se.selection_status + ', '
+
+ def __str__(self):
+ return self._text
+
+ @property
+ def confidence(self):
+ return self._confidence
+
+ @property
+ def row_index(self):
+ return self._rowIndex
+
+ @property
+ def column_index(self):
+ return self._columnIndex
+
+ @property
+ def row_span(self):
+ return self._rowSpan
+
+ @property
+ def column_span(self):
+ return self._columnSpan
+
+ @property
+ def geometry(self):
+ return self._geometry
+
+ @property
+ def id(self):
+ return self._id
+
+ @property
+ def content(self):
+ return self._content
+
+ @property
+ def text(self):
+ return self._text
+
+ @property
+ def block(self):
+ return self._block
+
+
+class Row:
+ def __init__(self):
+ self._cells = []
+
+ def __str__(self):
+ s = ""
+ for cell in self._cells:
+ s = s + "[{}]".format(str(cell))
+ return s
+
+ @property
+ def cells(self):
+ return self._cells
+
+
+class Table:
+
+ def __init__(self, block, block_map):
+
+ self._block = block
+
+ self._confidence = block['Confidence']
+ self._geometry = Geometry(block['Geometry'])
+
+ self._id = block['Id']
+ self._rows = []
+
+ ri = 1
+ row = Row()
+ if 'Relationships' in block and block['Relationships']:
+ for rs in block['Relationships']:
+ if rs['Type'] == 'CHILD':
+ for cid in rs['Ids']:
+ cell = Cell(block_map[cid], block_map)
+ if cell.row_index > ri:
+ self._rows.append(row)
+ row = Row()
+ ri = cell.row_index
+ row.cells.append(cell)
+ if row and row.cells:
+ self._rows.append(row)
+
+ def __str__(self):
+ s = "Table\n==========\n"
+ for row in self._rows:
+ s = s + "Row\n==========\n"
+ s = s + str(row) + "\n"
+ return s
+
+ @property
+ def confidence(self):
+ return self._confidence
+
+ @property
+ def geometry(self):
+ return self._geometry
+
+ @property
+ def id(self):
+ return self._id
+
+ @property
+ def rows(self):
+ return self._rows
+
+ @property
+ def block(self):
+ return self._block
+
+
+class Page:
+
+ def __init__(self, blocks, block_map):
+ self._blocks = blocks
+ self._text = ""
+ self._lines = []
+ self._form = Form()
+ self._tables = []
+ self._selection_element = []
+ self._content = []
+ self._orientation = True
+
+ self._parse(block_map)
+
+ def __str__(self):
+ s = "Page\n==========\n"
+ for item in self._content:
+ s = s + str(item) + "\n"
+ return s
+
+ def _parse(self, block_map):
+ for item in self._blocks:
+ if item["BlockType"] == "PAGE":
+ self._geometry = Geometry(item['Geometry'])
+ self._id = item['Id']
+ self._orientation = item['Orientation']
+ elif item["BlockType"] == "LINE":
+ line_item = Line(item, block_map)
+ self._lines.append(line_item)
+ self._content.append(line_item)
+ self._text = self._text + line_item.text + '\n'
+ elif item["BlockType"] == "TABLE":
+ table_item = Table(item, block_map)
+ self._tables.append(table_item)
+ self._content.append(table_item)
+ elif item["BlockType"] == "SELECTION_ELEMENT":
+ selection_item = SelectionElement(item, block_map)
+ self._selection_element.append(selection_item)
+ self._content.append(selection_item)
+ elif item["BlockType"] == "KEY_VALUE_SET":
+ if 'KEY' in item['EntityTypes']:
+ f = Field(item, block_map)
+ if f.key:
+ self._form.add_field(f)
+ self._content.append(f)
+ else:
+ print("WARNING: Detected K/V where key does not have content. Excluding key from output.")
+ print(f)
+ print(item)
+
+ def get_lines_in_reading_order(self):
+ columns = []
+ lines = []
+ for item in self._lines:
+ column_found = False
+ for index, column in enumerate(columns):
+ bbox_left = item.geometry.bounding_box.left
+ bbox_right = item.geometry.bounding_box.left + item.geometry.bounding_box.width
+ bbox_centre = item.geometry.bounding_box.left + item.geometry.bounding_box.width / 2
+ column_centre = column['left'] + column['right'] / 2
+ if column['left'] < bbox_centre < column['right'] or bbox_left < column_centre < bbox_right:
+ # Bbox appears inside the column
+ lines.append([index, item])
+ column_found = True
+ break
+ if not column_found:
+ columns.append({'left': item.geometry.bounding_box.left,
+ 'right': item.geometry.bounding_box.left + item.geometry.bounding_box.width})
+ lines.append([len(columns) - 1, item])
+
+ lines.sort(key=lambda x: x[0])
+ return lines
+
+ def get_text_in_reading_order(self):
+ lines = self.get_lines_in_reading_order()
+ text = ""
+ for line in lines:
+ text = text + line[1] + '\n'
+ return text
+
+ @property
+ def blocks(self):
+ return self._blocks
+
+ @property
+ def text(self):
+ return self._text
+
+ @property
+ def lines(self):
+ return self._lines
+
+ @property
+ def form(self):
+ return self._form
+
+ @property
+ def tables(self):
+ return self._tables
+
+ @property
+ def selections(self):
+ return self._selection_element
+
+ @property
+ def content(self):
+ return self._content
+
+ @property
+ def geometry(self):
+ return self._geometry
+
+ @property
+ def id(self):
+ return self._id
+
+ @property
+ def orientation(self):
+ return self._orientation
+
+
+class Document:
+
+ def __init__(self, response_pages):
+
+ if not isinstance(response_pages, list):
+ rps = [response_pages]
+ response_pages = rps
+
+ self._responsePages = response_pages
+ self._pages = []
+
+ self._parse()
+
+ def __str__(self):
+ s = "\nDocument\n==========\n"
+ for p in self._pages:
+ s = s + str(p) + "\n\n"
+ return s
+
+ def _parse_document_pages_and_block_map(self):
+
+ block_map = {}
+
+ document_pages = []
+ document_page = None
+ for page in self._responsePages:
+ for block in page['Blocks']:
+ if 'BlockType' in block and 'Id' in block:
+ block_map[block['Id']] = block
+
+ if block['BlockType'] == 'PAGE':
+ if block['Geometry']['Polygon'][0]['X'] > block['Geometry']['Polygon'][1]['X']:
+ block['Orientation'] = False
+ else:
+ block['Orientation'] = True
+ if document_page:
+ document_pages.append({"Blocks": document_page})
+ document_page = [block]
+ else:
+ if document_page:
+ document_page.append(block)
+ if document_page:
+ document_pages.append({"Blocks": document_page})
+ return document_pages, block_map
+
+ def _parse(self):
+
+ self._response_document_pages, self._block_map = self._parse_document_pages_and_block_map()
+ for documentPage in self._response_document_pages:
+ page = Page(documentPage["Blocks"], self._block_map)
+ self._pages.append(page)
+
+ @property
+ def blocks(self):
+ return self._responsePages
+
+ @property
+ def page_blocks(self):
+ return self._response_document_pages
+
+ @property
+ def pages(self):
+ return self._pages
+
+ def get_block_by_id(self, block_id):
+ block = None
+ if self._block_map and block_id in self._block_map:
+ block = self._block_map[block_id]
+ return block
diff --git a/lambda/textract_comprehend_medical/env.json b/lambda/textract_comprehend_medical/env.json
new file mode 100644
index 0000000..ad1bb75
--- /dev/null
+++ b/lambda/textract_comprehend_medical/env.json
@@ -0,0 +1,7 @@
+{
+ "Parameters": {
+ "SNS_ARN": "arn:aws:sns:us-east-2:123456789012:Textract",
+ "ROLE_ARN": "arn:aws:iam::123456789012:role/Role_Textract",
+ "LOG_LEVEL": "DEBUG"
+ }
+}
diff --git a/lambda/textract_comprehend_medical/events/event.json b/lambda/textract_comprehend_medical/events/event.json
new file mode 100644
index 0000000..c86f84b
--- /dev/null
+++ b/lambda/textract_comprehend_medical/events/event.json
@@ -0,0 +1,14 @@
+{
+ "manifest":
+ {
+ "s3Path": "s3://pdfmappertofhirworkflow-textractsimplesyncworkflo-13ne9s3betcpj/uploads/fax-190517121115-6784593217-11_Redacted.pdf"
+ },
+ "mime": "application/pdf",
+ "classification": null,
+ "numberOfPages": 2,
+ "textract_result":
+ {
+ "TextractTempOutputJsonPath": "s3://pdfmappertofhirworkflow-textractsimplesyncworkflo-13ne9s3betcpj/temp/a8cdb100b8414882978cef68cf49c7ed1e47a68680ffa9fd442a69ae8c05bf4c",
+ "TextractOutputJsonPath": "s3://pdfmappertofhirworkflow-textractsimplesyncworkflo-13ne9s3betcpj/textract-output/fax-190517121115-6784593217-11_Redacted2023-04-19T18:49:29.225383/fax-190517121115-6784593217-11_Redacted.json"
+ }
+}
\ No newline at end of file
diff --git a/lambda/textract_comprehend_medical/template.yaml b/lambda/textract_comprehend_medical/template.yaml
new file mode 100644
index 0000000..496fd45
--- /dev/null
+++ b/lambda/textract_comprehend_medical/template.yaml
@@ -0,0 +1,27 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Transform: AWS::Serverless-2016-10-31
+Description: >
+ python3.9
+
+ Lambda function for PDF Mapper for FHIR
+
+Globals:
+ Function:
+ Timeout: 900
+
+Resources:
+ PdfMapperForFhirFunction:
+ Type: AWS::Serverless::Function
+ Properties:
+ PackageType: Image
+ Architectures:
+ - x86_64
+ Environment:
+ Variables:
+ HEALTHLAKE_ENDPOINT: https://healthlake.us-east-2.amazonaws.com/datastore/83c10afe8566667ac2489c8d989b2c14/r4/
+ LOG_LEVEL: INFO
+ Metadata:
+ Dockerfile: Dockerfile
+ DockerContext: .
+ DockerTag: python3.9-v1
+
diff --git a/lambda/textract_comprehend_medical/test_sam_local.sh b/lambda/textract_comprehend_medical/test_sam_local.sh
new file mode 100755
index 0000000..4c4e983
--- /dev/null
+++ b/lambda/textract_comprehend_medical/test_sam_local.sh
@@ -0,0 +1,2 @@
+sam build
+sam local invoke -n env.json -e events/event.json
diff --git a/lambda/textract_comprehend_medical/tests/data/sample_manifest.json b/lambda/textract_comprehend_medical/tests/data/sample_manifest.json
new file mode 100644
index 0000000..7dc98a5
--- /dev/null
+++ b/lambda/textract_comprehend_medical/tests/data/sample_manifest.json
@@ -0,0 +1,14 @@
+{
+ "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
+ "textractFeatures": [
+ "FORMS",
+ "TABLES",
+ "QUERIES"
+ ],
+ "queriesConfig": [{
+ "text": "What is the applicant full name?",
+ "alias": "FULL_NAME",
+ "pages": "[*]"
+ }],
+ "classification": "APPLICATION"
+}
diff --git a/lambda/textract_comprehend_medical/tests/data/simple_feature_manifest.json b/lambda/textract_comprehend_medical/tests/data/simple_feature_manifest.json
new file mode 100644
index 0000000..61b5e4c
--- /dev/null
+++ b/lambda/textract_comprehend_medical/tests/data/simple_feature_manifest.json
@@ -0,0 +1,13 @@
+{
+ "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
+ "textractFeatures": [
+ "FORMS",
+ "TABLES",
+ "QUERIES"
+ ],
+ "queries_config": [{
+ "text": "What is the applicant full name?",
+ "alias": "FULL_NAME",
+ "pages": "[*]"
+ }]
+}
diff --git a/lambda/textract_comprehend_medical/tests/test_pdf_mapper_for_fhir.py b/lambda/textract_comprehend_medical/tests/test_pdf_mapper_for_fhir.py
new file mode 100644
index 0000000..1977b4f
--- /dev/null
+++ b/lambda/textract_comprehend_medical/tests/test_pdf_mapper_for_fhir.py
@@ -0,0 +1,19 @@
+import json
+import io
+import os
+import boto3
+
+current_folder = os.path.dirname(os.path.realpath(__file__))
+
+
+def test_serializer_manifest(caplog):
+ """Check that sample_manifest.json parses as JSON and carries the expected fields.
+
+ NOTE(review): the manifest is downloaded from the S3 bucket named below, so this
+ test needs AWS credentials and network access. The caplog fixture is not used
+ in the body. Confirm whether a local copy of the manifest was intended.
+ """
+ s3_bucket = 'sdx-textract-us-east-1'
+ s3_key = 'sample_manifest.json'
+ s3_client = boto3.client('s3')
+ o = s3_client.get_object(Bucket=s3_bucket, Key=s3_key)
+ file_content = o.get('Body').read().decode('utf-8')
+ json_content = json.loads(file_content)
+ assert json_content
+ assert json_content['s3Path']
+ assert json_content['textractFeatures']
+ # The sample manifest is expected to list exactly three Textract features.
+ assert len(json_content['textractFeatures']) == 3
diff --git a/src/index.ts b/src/index.ts
index c3cca88..fd4fe03 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -26,6 +26,7 @@ export { WorkmailS3IngestionPoint, WorkmailS3IngestionPointProps } from './workm
export { TextractPdfMapperForFhir, TextractPdfMapperForFhirProps } from './textractPdfMapperForFhir';
export { SearchablePDF, SearchablePDFProps } from './searchablePDF';
export { SFExecutionsStartThrottle, SFExecutionsStartThrottleProps as sfExecutionsStartThrottleProps } from './stepFunctionsExecutionsStartThrottle';
+export { TextractComprehendMedical, TextractComprehendMedicalProps } from './textractComprehendMedical';
export function validatePatternSupported(integrationPattern: sfn.IntegrationPattern, supportedPatterns: sfn.IntegrationPattern[]) {
if (! supportedPatterns.includes(integrationPattern)) {
diff --git a/src/textractComprehendMedical.ts b/src/textractComprehendMedical.ts
new file mode 100644
index 0000000..a598b9c
--- /dev/null
+++ b/src/textractComprehendMedical.ts
@@ -0,0 +1,150 @@
+import * as path from 'path';
+import { Duration } from 'aws-cdk-lib';
+import * as iam from 'aws-cdk-lib/aws-iam';
+import * as lambda from 'aws-cdk-lib/aws-lambda';
+import * as sfn from 'aws-cdk-lib/aws-stepfunctions';
+import * as tasks from 'aws-cdk-lib/aws-stepfunctions-tasks';
+import { Construct } from 'constructs';
+
+/**
+ * Properties for the TextractComprehendMedical construct. Every property is optional.
+ */
+export interface TextractComprehendMedicalProps {
+ /**
+  * Memory size of the Lambda function in MB (may need to increase for larger documents).
+  * @default 1024
+  */
+ readonly lambdaMemoryMB?: number;
+ /**
+  * Timeout of the Lambda function in seconds.
+  * @default 900
+  */
+ readonly lambdaTimeout?: number;
+ /**
+  * NOTE(review): the TextractComprehendMedical constructor does not read this property;
+  * it always creates its own Docker image function. Confirm whether it should be honoured.
+  */
+ readonly textractComprehendMedicalFunction?: lambda.IFunction;
+ /**
+  * Passed to the Lambda function as the LOG_LEVEL environment variable.
+  * @default 'INFO'
+  */
+ readonly lambdaLogLevel?: string;
+ /**
+  * Name of the S3 bucket the default S3 permissions are scoped to (together with s3InputPrefix).
+  * When omitted, the default S3 permissions are granted on all resources ('*').
+  */
+ readonly s3InputBucket?: string;
+ /**
+  * NOTE(review): the TextractComprehendMedical constructor does not read this property;
+  * the Comprehend Medical role is created without an explicit name. Confirm intent.
+  */
+ readonly comprehendMedicalRoleName?: string;
+ /**
+  * Passed to the Lambda function as the COMPREHEND_MEDICAL_JOB_TYPE environment variable.
+  * The set of accepted values is defined by the Lambda code, not by this construct.
+  * @default 'ICD10'
+  */
+ readonly comprehendMedicalJobType?: string;
+ /**
+  * Prefix for the incoming documents. Only used when s3InputBucket is set, to build the
+  * S3 resource ARNs of the default permissions.
+  * @default 'uploads'
+  */
+ readonly s3InputPrefix?: string;
+ /**
+  * List of PolicyStatements to attach to the Lambda function for S3 GET and LIST.
+  * When provided, they are attached instead of the default S3 statement.
+  */
+ readonly inputPolicyStatements?: iam.PolicyStatement[];
+}
+
+/**
+ * Step Functions fragment that invokes a container-image Lambda function built from
+ * `lambda/textract_comprehend_medical/`.
+ *
+ * The construct creates:
+ * - an IAM role assumable by `comprehendmedical.amazonaws.com` that is allowed
+ *   `s3:GetObject`, `s3:ListBucket` and `s3:PutObject`, either on all resources
+ *   (no `s3InputBucket`) or on `s3InputBucket`/`s3InputPrefix`;
+ * - the Lambda function, which receives the role ARN (`COMPREHEND_MEDICAL_ROLE`),
+ *   the job type (`COMPREHEND_MEDICAL_JOB_TYPE`) and `LOG_LEVEL` as environment variables;
+ * - a `LambdaInvoke` task with output path `$.Payload`, which is both the start state
+ *   and the only end state of the fragment.
+ *
+ * The Lambda role always receives `comprehendmedical:Start*` and an `iam:PassRole`
+ * statement restricted to the Comprehend Medical service. Its S3 permissions are either
+ * the defaults described above or the statements given in `inputPolicyStatements`.
+ *
+ * Example (Python)
+ * ```python
+ * comprehend_medical_task = tcdk.TextractComprehendMedical(
+ *     self,
+ *     "ComprehendMedical",
+ *     s3_input_bucket="my-bucket",
+ *     s3_input_prefix="uploads",
+ * )
+ * ```
+ */
+export class TextractComprehendMedical extends sfn.StateMachineFragment {
+  public readonly startState: sfn.State;
+  public readonly endStates: sfn.INextable[];
+  /** The Lambda function invoked by this fragment. */
+  public readonly textractComprehendMedicalFunction: lambda.IFunction;
+
+  /**
+   * @param parent scope in which the fragment is defined
+   * @param id descriptive identifier; also used as the id of the LambdaInvoke state
+   * @param props optional settings, see TextractComprehendMedicalProps
+   */
+  constructor(parent: Construct, id: string, props: TextractComprehendMedicalProps) {
+    super(parent, id);
+
+    const lambdaMemoryMB = props.lambdaMemoryMB === undefined ? 1024 : props.lambdaMemoryMB;
+    const lambdaTimeout = props.lambdaTimeout === undefined ? 900 : props.lambdaTimeout;
+    const lambdaLogLevel = props.lambdaLogLevel === undefined ? 'INFO' : props.lambdaLogLevel;
+    const s3InputPrefix = props.s3InputPrefix === undefined ? 'uploads' : props.s3InputPrefix;
+    const cmJobType = props.comprehendMedicalJobType === undefined ? 'ICD10' : props.comprehendMedicalJobType;
+
+    // Default S3 permissions shared by the Comprehend Medical role and the Lambda role.
+    // NOTE(review): s3:ListBucket is a bucket-level action; with the object-level ARNs
+    // below it probably has no effect when s3InputBucket is set - confirm.
+    const s3Actions = ['s3:GetObject', 's3:ListBucket', 's3:PutObject'];
+    // path.posix.join keeps '/' as separator on every platform; plain path.join would
+    // emit '\' on Windows and produce invalid ARNs at synth time.
+    const s3Resources = props.s3InputBucket === undefined
+      ? ['*']
+      : [
+        path.posix.join(`arn:aws:s3:::${props.s3InputBucket}`, s3InputPrefix, '/'),
+        path.posix.join(`arn:aws:s3:::${props.s3InputBucket}`, s3InputPrefix, '/*'),
+      ];
+
+    // Role handed to Comprehend Medical so the service can read/write the documents.
+    const comprehendMedicalRole = new iam.Role(this, 'RoleComprehendMedical', {
+      assumedBy: new iam.ServicePrincipal('comprehendmedical.amazonaws.com'),
+    });
+    comprehendMedicalRole.addToPolicy(
+      new iam.PolicyStatement({
+        actions: [...s3Actions],
+        effect: iam.Effect.ALLOW,
+        resources: [...s3Resources],
+      }),
+    );
+
+    this.textractComprehendMedicalFunction = new lambda.DockerImageFunction(
+      this,
+      'TextractComprehendMedical',
+      {
+        code: lambda.DockerImageCode.fromImageAsset(
+          path.join(__dirname, '../lambda/textract_comprehend_medical/'),
+        ),
+        architecture: lambda.Architecture.X86_64,
+        memorySize: lambdaMemoryMB,
+        timeout: Duration.seconds(lambdaTimeout),
+        environment: {
+          LOG_LEVEL: lambdaLogLevel,
+          COMPREHEND_MEDICAL_ROLE: comprehendMedicalRole.roleArn,
+          COMPREHEND_MEDICAL_JOB_TYPE: cmJobType,
+        },
+      },
+    );
+
+    // S3 access of the Lambda: caller-supplied statements replace the default one.
+    if (props.inputPolicyStatements === undefined) {
+      this.textractComprehendMedicalFunction.addToRolePolicy(
+        new iam.PolicyStatement({
+          actions: [...s3Actions],
+          resources: [...s3Resources],
+        }),
+      );
+    } else {
+      for (const policyStatement of props.inputPolicyStatements) {
+        this.textractComprehendMedicalFunction.addToRolePolicy(policyStatement);
+      }
+    }
+
+    // Needed regardless of how S3 access is configured: inputPolicyStatements only
+    // covers S3, so these must not depend on it.
+    this.textractComprehendMedicalFunction.addToRolePolicy(
+      new iam.PolicyStatement({
+        actions: ['comprehendmedical:Start*'],
+        resources: ['*'],
+      }),
+    );
+    this.textractComprehendMedicalFunction.addToRolePolicy(
+      new iam.PolicyStatement({
+        actions: ['iam:PassRole'],
+        resources: ['*'],
+        conditions: {
+          StringEquals: {
+            'iam:PassedToService': 'comprehendmedical.amazonaws.com',
+          },
+        },
+      }),
+    );
+
+    const textractComprehendMedicalLambdaInvoke = new tasks.LambdaInvoke(this, id, {
+      lambdaFunction: this.textractComprehendMedicalFunction,
+      outputPath: '$.Payload',
+    });
+    this.startState = textractComprehendMedicalLambdaInvoke;
+    this.endStates = [textractComprehendMedicalLambdaInvoke];
+  }
+}