diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..2abf704 Binary files /dev/null and b/.DS_Store differ diff --git a/API.md b/API.md index 79e94bd..44dea85 100644 --- a/API.md +++ b/API.md @@ -3525,6 +3525,234 @@ public readonly configuratorFunction: IFunction; --- +### TextractComprehendMedical + +This construct takes in a manifest definition or a plain JSON with a s3Path:. + +example s3Path: +{"s3Path": "s3://bucketname/prefix/image.png"} + + +Then it generated the numberOfPages attribute and the mime on the context. +The mime types checked against the supported mime types for Textract and if fails, will raise an Exception failing the workflow. + +Example (Python) +```python +decider_task_id = tcdk.TextractPOCDecider( +self, +f"InsuranceDecider", +) +``` + +#### Initializers + +```typescript +import { TextractComprehendMedical } from 'amazon-textract-idp-cdk-constructs' + +new TextractComprehendMedical(parent: Construct, id: string, props: TextractComprehendMedicalProps) +``` + +| **Name** | **Type** | **Description** | +| --- | --- | --- | +| parent | constructs.Construct | *No description.* | +| id | string | Descriptive identifier for this chainable. | +| props | TextractComprehendMedicalProps | *No description.* | + +--- + +##### `parent`Required + +- *Type:* constructs.Construct + +--- + +##### `id`Required + +- *Type:* string + +Descriptive identifier for this chainable. + +--- + +##### `props`Required + +- *Type:* TextractComprehendMedicalProps + +--- + +#### Methods + +| **Name** | **Description** | +| --- | --- | +| toString | Returns a string representation of this construct. | +| next | Continue normal execution with the given state. | +| prefixStates | Prefix the IDs of all states in this state machine fragment. | +| toSingleState | Wrap all states in this state machine fragment up into a single state. 
| + +--- + +##### `toString` + +```typescript +public toString(): string +``` + +Returns a string representation of this construct. + +##### `next` + +```typescript +public next(next: IChainable): Chain +``` + +Continue normal execution with the given state. + +###### `next`Required + +- *Type:* aws-cdk-lib.aws_stepfunctions.IChainable + +--- + +##### `prefixStates` + +```typescript +public prefixStates(prefix?: string): StateMachineFragment +``` + +Prefix the IDs of all states in this state machine fragment. + +Use this to avoid multiple copies of the state machine all having the +same state IDs. + +###### `prefix`Optional + +- *Type:* string + +The prefix to add. + +Will use construct ID by default. + +--- + +##### `toSingleState` + +```typescript +public toSingleState(options?: SingleStateOptions): Parallel +``` + +Wrap all states in this state machine fragment up into a single state. + +This can be used to add retry or error handling onto this state +machine fragment. + +Be aware that this changes the result of the inner state machine +to be an array with the result of the state machine in it. Adjust +your paths accordingly. For example, change 'outputPath' to +'$[0]'. + +###### `options`Optional + +- *Type:* aws-cdk-lib.aws_stepfunctions.SingleStateOptions + +--- + +#### Static Functions + +| **Name** | **Description** | +| --- | --- | +| isConstruct | Checks if `x` is a construct. | + +--- + +##### ~~`isConstruct`~~ + +```typescript +import { TextractComprehendMedical } from 'amazon-textract-idp-cdk-constructs' + +TextractComprehendMedical.isConstruct(x: any) +``` + +Checks if `x` is a construct. + +###### `x`Required + +- *Type:* any + +Any object. + +--- + +#### Properties + +| **Name** | **Type** | **Description** | +| --- | --- | --- | +| node | constructs.Node | The tree node. | +| endStates | aws-cdk-lib.aws_stepfunctions.INextable[] | The states to chain onto if this fragment is used. | +| id | string | Descriptive identifier for this chainable. 
| +| startState | aws-cdk-lib.aws_stepfunctions.State | The start state of this state machine fragment. | +| textractComprehendMedicalFunction | aws-cdk-lib.aws_lambda.IFunction | *No description.* | + +--- + +##### `node`Required + +```typescript +public readonly node: Node; +``` + +- *Type:* constructs.Node + +The tree node. + +--- + +##### `endStates`Required + +```typescript +public readonly endStates: INextable[]; +``` + +- *Type:* aws-cdk-lib.aws_stepfunctions.INextable[] + +The states to chain onto if this fragment is used. + +--- + +##### `id`Required + +```typescript +public readonly id: string; +``` + +- *Type:* string + +Descriptive identifier for this chainable. + +--- + +##### `startState`Required + +```typescript +public readonly startState: State; +``` + +- *Type:* aws-cdk-lib.aws_stepfunctions.State + +The start state of this state machine fragment. + +--- + +##### `textractComprehendMedicalFunction`Required + +```typescript +public readonly textractComprehendMedicalFunction: IFunction; +``` + +- *Type:* aws-cdk-lib.aws_lambda.IFunction + +--- + + ### TextractGenerateCSV Generates a output based on Textract Forms and Queries. Supported output_types: "LINES" | "CSV". @@ -8241,6 +8469,130 @@ public readonly lambdaTimeout: number; --- +### TextractComprehendMedicalProps + +#### Initializer + +```typescript +import { TextractComprehendMedicalProps } from 'amazon-textract-idp-cdk-constructs' + +const textractComprehendMedicalProps: TextractComprehendMedicalProps = { ... } +``` + +#### Properties + +| **Name** | **Type** | **Description** | +| --- | --- | --- | +| comprehendMedicalJobType | string | *No description.* | +| comprehendMedicalRoleName | string | *No description.* | +| inputPolicyStatements | aws-cdk-lib.aws_iam.PolicyStatement[] | List of PolicyStatements to attach to the Lambda function for S3 GET and LIST. 
| +| lambdaLogLevel | string | *No description.* | +| lambdaMemoryMB | number | memory of Lambda function (may need to increase for larger documents). | +| lambdaTimeout | number | *No description.* | +| s3InputBucket | string | *No description.* | +| s3InputPrefix | string | prefix for the incoming document. | +| textractComprehendMedicalFunction | aws-cdk-lib.aws_lambda.IFunction | *No description.* | + +--- + +##### `comprehendMedicalJobType`Optional + +```typescript +public readonly comprehendMedicalJobType: string; +``` + +- *Type:* string + +--- + +##### `comprehendMedicalRoleName`Optional + +```typescript +public readonly comprehendMedicalRoleName: string; +``` + +- *Type:* string + +--- + +##### `inputPolicyStatements`Optional + +```typescript +public readonly inputPolicyStatements: PolicyStatement[]; +``` + +- *Type:* aws-cdk-lib.aws_iam.PolicyStatement[] + +List of PolicyStatements to attach to the Lambda function for S3 GET and LIST. + +--- + +##### `lambdaLogLevel`Optional + +```typescript +public readonly lambdaLogLevel: string; +``` + +- *Type:* string + +--- + +##### `lambdaMemoryMB`Optional + +```typescript +public readonly lambdaMemoryMB: number; +``` + +- *Type:* number + +memory of Lambda function (may need to increase for larger documents). + +--- + +##### `lambdaTimeout`Optional + +```typescript +public readonly lambdaTimeout: number; +``` + +- *Type:* number + +--- + +##### `s3InputBucket`Optional + +```typescript +public readonly s3InputBucket: string; +``` + +- *Type:* string + +--- + +##### `s3InputPrefix`Optional + +```typescript +public readonly s3InputPrefix: string; +``` + +- *Type:* string + +prefix for the incoming document. 
+ +Will be used to create role + +--- + +##### `textractComprehendMedicalFunction`Optional + +```typescript +public readonly textractComprehendMedicalFunction: IFunction; +``` + +- *Type:* aws-cdk-lib.aws_lambda.IFunction + +--- + ### TextractDPPOCDeciderProps #### Initializer diff --git a/lambda/.DS_Store b/lambda/.DS_Store new file mode 100644 index 0000000..eb78fcd Binary files /dev/null and b/lambda/.DS_Store differ diff --git a/lambda/textract_comprehend_medical/Dockerfile b/lambda/textract_comprehend_medical/Dockerfile new file mode 100644 index 0000000..d00d9a0 --- /dev/null +++ b/lambda/textract_comprehend_medical/Dockerfile @@ -0,0 +1,10 @@ +FROM public.ecr.aws/lambda/python:3.9-x86_64 +RUN /var/lang/bin/python -m pip install --upgrade pip +COPY app/requirements.txt ${LAMBDA_TASK_ROOT}/ +RUN python -m pip install -r ${LAMBDA_TASK_ROOT}/requirements.txt --target "${LAMBDA_TASK_ROOT}" + +# Copy function code +COPY app/* ${LAMBDA_TASK_ROOT}/ + +# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) +CMD [ "main.handler" ] diff --git a/lambda/textract_comprehend_medical/app/__init__.py b/lambda/textract_comprehend_medical/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lambda/textract_comprehend_medical/app/entry.sh b/lambda/textract_comprehend_medical/app/entry.sh new file mode 100644 index 0000000..3f144fa --- /dev/null +++ b/lambda/textract_comprehend_medical/app/entry.sh @@ -0,0 +1,6 @@ +#!/bin/sh +if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then + exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m main $1 +else + exec /usr/local/bin/python -m main $1 +fi diff --git a/lambda/textract_comprehend_medical/app/main.py b/lambda/textract_comprehend_medical/app/main.py new file mode 100644 index 0000000..66a9cd8 --- /dev/null +++ b/lambda/textract_comprehend_medical/app/main.py @@ -0,0 +1,73 @@ +import json +import uuid +import boto3 +import os +import traceback +import trp +import logging +from 
def handler(event, context):
    """Send the text of a Textract result to an Amazon Comprehend Medical batch job.

    The handler reads the Textract JSON referenced by
    ``event['textract_result']['TextractOutputJsonPath']``, concatenates the text
    of every page, stores it as a ``.txt`` object in the same bucket and starts
    the Comprehend Medical job selected by the ``COMPREHEND_MEDICAL_JOB_TYPE``
    environment variable (looked up in the module-level ``cm_job_types`` table).

    Args:
        event: Lambda event; must contain a non-empty ``textract_result`` mapping
            with a ``TextractOutputJsonPath`` of the form ``s3://bucket/key``.
        context: Lambda context object (unused).

    Returns:
        None. When the job type is missing or unknown the handler logs an error
        and returns without doing any work.

    Raises:
        RuntimeError: if the event has no ``textract_result`` or the output path
            is not a usable ``s3://bucket/key`` URI.
        Exception: any error raised by S3, the parser or Comprehend Medical is
            logged with its traceback and re-raised unchanged.
    """
    job_type = os.getenv('COMPREHEND_MEDICAL_JOB_TYPE')
    api_name = dict(cm_job_types).get(job_type)
    if api_name is None:
        # Configuration problem: surface it at ERROR level so it is not lost
        # among routine INFO output.
        logger.error('There is no valid COMPREHEND_MEDICAL_JOB_TYPE set.')
        return
    start_job = getattr(cm_client, api_name)

    try:
        textract_result = event.get('textract_result')
        if not textract_result:
            raise RuntimeError('Invalid lambda event.')

        output_json = textract_result['TextractOutputJsonPath']
        # Split the URI by hand: urlparse() would cut the key at the first '?'
        # or '#', both of which are legal characters in an S3 object key.
        scheme, _, location = output_json.partition('://')
        bucket, _, object_key = location.partition('/')
        if scheme.lower() != 's3' or not bucket or not object_key:
            raise RuntimeError(f'Invalid S3 path: {output_json}')
        logger.debug(f'Bucket: {bucket}')
        logger.debug(f'Key: {object_key}')

        resp = client.get_object(Bucket=bucket, Key=object_key)
        blocks = json.loads(resp['Body'].read())
        document = trp.Document(blocks)
        logger.info(f'The document has {len(document.pages)} pages')
        # TODO We can add Bedrock here to send multiple pages to CM based on the context
        text_content = "".join(page.text for page in document.pages)

        job_name = f'job-{uuid.uuid4()}'
        input_prefix = f'textract-output/text/{job_type}/{job_name}'
        # The job reads every object under input_prefix, so the text file is
        # written below that prefix.
        client.put_object(Bucket=bucket,
                          Key=f'{input_prefix}/{job_name}.txt',
                          Body=text_content.encode('utf-8'))
        response = start_job(
            InputDataConfig={
                'S3Bucket': bucket,
                'S3Key': input_prefix
            },
            OutputDataConfig={
                'S3Bucket': bucket,
                'S3Key': f'cm-output/json/{job_type}/{job_name}'
            },
            JobName=job_name,
            DataAccessRoleArn=os.getenv('COMPREHEND_MEDICAL_ROLE'),
            LanguageCode='en'
        )
        logger.info(f'Started {job_type} job {job_name}: {response.get("JobId")}')
    except Exception:
        # Log with traceback through the configured logger, then re-raise the
        # original exception untouched so the invocation is reported as failed.
        logger.exception('Failed to start Comprehend Medical job.')
        raise
def _child_ids(block, rel_type="CHILD"):
    """Yield, in order, every id listed under ``rel_type`` relationships of ``block``.

    Blocks without a ``Relationships`` entry (or with an empty one) yield nothing.
    """
    for relationship in block.get("Relationships") or []:
        if relationship["Type"] == rel_type:
            yield from relationship["Ids"]


class BoundingBox:
    """Width, height, left and top values taken from a block's bounding box."""

    def __init__(self, width, height, left, top):
        self._width = width
        self._height = height
        self._left = left
        self._top = top

    def __str__(self):
        return "width: {}, height: {}, left: {}, top: {}".format(
            self._width, self._height, self._left, self._top)

    @property
    def width(self):
        return self._width

    @property
    def height(self):
        return self._height

    @property
    def left(self):
        return self._left

    @property
    def top(self):
        return self._top


class Polygon:
    """A single (x, y) point of a block's polygon."""

    def __init__(self, x, y):
        self._x = x
        self._y = y

    def __str__(self):
        return "x: {}, y: {}".format(self._x, self._y)

    @property
    def x(self):
        return self._x

    @property
    def y(self):
        return self._y


class Geometry:
    """Bounding box plus polygon points built from a block's ``Geometry`` dict."""

    def __init__(self, geometry):
        box = geometry["BoundingBox"]
        self._bounding_box = BoundingBox(box["Width"], box["Height"], box["Left"], box["Top"])
        self._polygon = [Polygon(point["X"], point["Y"]) for point in geometry["Polygon"]]

    def __str__(self):
        return "BoundingBox: {}\n".format(str(self._bounding_box))

    @property
    def bounding_box(self):
        return self._bounding_box

    @property
    def polygon(self):
        return self._polygon


class Word:
    """A WORD block; ``block_map`` is accepted for signature parity and unused."""

    def __init__(self, block, block_map):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        # A missing or empty 'Text' entry becomes the empty string.
        self._text = block.get('Text') or ""

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class Line:
    """A LINE block together with the WORD children found in ``block_map``."""

    def __init__(self, block, block_map):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._text = block.get('Text') or ""

        self._words = []
        for cid in _child_ids(block):
            child = block_map.get(cid)
            # Children that are absent from the map or are not words are skipped.
            if child is not None and child["BlockType"] == "WORD":
                self._words.append(Word(child, block_map))

    def __str__(self):
        s = "Line\n==========\n"
        s = s + self._text + "\n"
        s = s + "Words\n----------\n"
        for word in self._words:
            s = s + "[{}]".format(str(word))
        return s

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def words(self):
        return self._words

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class SelectionElement:
    """A SELECTION_ELEMENT block and its ``SelectionStatus`` value."""

    def __init__(self, block, block_map):
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._selectionStatus = block['SelectionStatus']

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def selection_status(self):
        return self._selectionStatus


class FieldKey:
    """Key side of a form field; ``text`` is the space-joined text of its words."""

    def __init__(self, block, children, block_map):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._text = ""
        self._content = []

        for eid in children:
            child = block_map.get(eid)
            # Same tolerance as Line: ids missing from the map are ignored.
            if child is not None and child['BlockType'] == "WORD":
                self._content.append(Word(child, block_map))

        if self._content:
            self._text = ' '.join(word.text for word in self._content)

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def content(self):
        return self._content

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class FieldValue:
    """Value side of a form field, made of words and/or selection elements."""

    def __init__(self, block, children, block_map):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._text = ""
        self._content = []

        words = []
        for eid in children:
            child = block_map.get(eid)
            if child is None:
                continue
            if child['BlockType'] == "WORD":
                word = Word(child, block_map)
                self._content.append(word)
                words.append(word.text)
            elif child['BlockType'] == "SELECTION_ELEMENT":
                selection = SelectionElement(child, block_map)
                self._content.append(selection)
                self._text = selection.selection_status

        # Word text, when present, takes precedence over a selection status.
        if words:
            self._text = ' '.join(words)

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def content(self):
        return self._content

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class Field:
    """A key/value pair built from a KEY block of a KEY_VALUE_SET.

    ``key`` and ``value`` stay ``None`` when the block carries no matching
    relationship.
    """

    def __init__(self, block, block_map):
        self._key = None
        self._value = None

        for item in block.get('Relationships') or []:
            if item["Type"] == "CHILD":
                self._key = FieldKey(block, item['Ids'], block_map)
            elif item["Type"] == "VALUE":
                for eid in item['Ids']:
                    value_block = block_map.get(eid)
                    if value_block is None or 'VALUE' not in value_block['EntityTypes']:
                        continue
                    for v_item in value_block.get('Relationships') or []:
                        if v_item["Type"] == "CHILD":
                            self._value = FieldValue(value_block, v_item['Ids'], block_map)

    def __str__(self):
        s = "\nField\n==========\n"
        k = ""
        v = ""
        if self._key:
            k = str(self._key)
        if self._value:
            v = str(self._value)
        s = s + "Key: {}\nValue: {}".format(k, v)
        return s

    @property
    def key(self):
        return self._key

    @property
    def value(self):
        return self._value


class Form:
    """Ordered collection of fields with lookup by exact or partial key text."""

    def __init__(self):
        self._fields = []
        self._fields_map = {}

    def add_field(self, field):
        """Append ``field``; it is indexed by key text only when it has a key."""
        self._fields.append(field)
        if field.key:
            self._fields_map[field.key.text] = field

    def __str__(self):
        s = ""
        for field in self._fields:
            s = s + str(field) + "\n"
        return s

    @property
    def fields(self):
        return self._fields

    def get_field_by_key(self, key):
        """Return the field whose key text equals ``key``, or ``None``."""
        return self._fields_map.get(key)

    def search_fields_by_key(self, key):
        """Return all fields whose key text contains ``key`` (case-insensitive)."""
        search_key = key.lower()
        return [field for field in self._fields
                if field.key and search_key in field.key.text.lower()]


class Cell:
    """A table cell; ``text`` concatenates word text and selection statuses."""

    def __init__(self, block, block_map):
        self._block = block
        self._confidence = block['Confidence']
        self._rowIndex = block['RowIndex']
        self._columnIndex = block['ColumnIndex']
        self._rowSpan = block['RowSpan']
        self._columnSpan = block['ColumnSpan']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._content = []
        self._text = ""
        for cid in _child_ids(block):
            child = block_map.get(cid)
            if child is None:
                continue
            if child["BlockType"] == "WORD":
                word = Word(child, block_map)
                self._content.append(word)
                self._text = self._text + word.text + ' '
            elif child["BlockType"] == "SELECTION_ELEMENT":
                selection = SelectionElement(child, block_map)
                self._content.append(selection)
                self._text = self._text + selection.selection_status + ', '

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        return self._confidence

    @property
    def row_index(self):
        return self._rowIndex

    @property
    def column_index(self):
        return self._columnIndex

    @property
    def row_span(self):
        return self._rowSpan

    @property
    def column_span(self):
        return self._columnSpan

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def content(self):
        return self._content

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class Row:
    """A list of cells belonging to one table row."""

    def __init__(self):
        self._cells = []

    def __str__(self):
        s = ""
        for cell in self._cells:
            s = s + "[{}]".format(str(cell))
        return s

    @property
    def cells(self):
        return self._cells


class Table:
    """A TABLE block whose child cells are grouped into rows by ``RowIndex``."""

    def __init__(self, block, block_map):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._rows = []

        current_index = 1
        row = Row()
        for cid in _child_ids(block):
            child = block_map.get(cid)
            if child is None:
                continue
            cell = Cell(child, block_map)
            # A larger row index closes the current row and opens the next one.
            if cell.row_index > current_index:
                self._rows.append(row)
                row = Row()
                current_index = cell.row_index
            row.cells.append(cell)
        if row.cells:
            self._rows.append(row)

    def __str__(self):
        s = "Table\n==========\n"
        for row in self._rows:
            s = s + "Row\n==========\n"
            s = s + str(row) + "\n"
        return s

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def rows(self):
        return self._rows

    @property
    def block(self):
        return self._block


class Page:
    """All parsed content (lines, form fields, tables, selections) of one page.

    ``geometry`` and ``id`` are ``None`` when ``blocks`` contains no PAGE block.
    """

    def __init__(self, blocks, block_map):
        self._blocks = blocks
        self._text = ""
        self._lines = []
        self._form = Form()
        self._tables = []
        self._selection_element = []
        self._content = []
        self._orientation = True
        # Defined up front so the properties never raise AttributeError.
        self._geometry = None
        self._id = None

        self._parse(block_map)

    def __str__(self):
        s = "Page\n==========\n"
        for item in self._content:
            s = s + str(item) + "\n"
        return s

    def _parse(self, block_map):
        """Dispatch every block of the page to the matching wrapper class."""
        for item in self._blocks:
            block_type = item["BlockType"]
            if block_type == "PAGE":
                self._geometry = Geometry(item['Geometry'])
                self._id = item['Id']
                # 'Orientation' is added by Document; default to True otherwise.
                self._orientation = item.get('Orientation', True)
            elif block_type == "LINE":
                line_item = Line(item, block_map)
                self._lines.append(line_item)
                self._content.append(line_item)
                self._text = self._text + line_item.text + '\n'
            elif block_type == "TABLE":
                table_item = Table(item, block_map)
                self._tables.append(table_item)
                self._content.append(table_item)
            elif block_type == "SELECTION_ELEMENT":
                selection_item = SelectionElement(item, block_map)
                self._selection_element.append(selection_item)
                self._content.append(selection_item)
            elif block_type == "KEY_VALUE_SET":
                if 'KEY' in item['EntityTypes']:
                    f = Field(item, block_map)
                    if f.key:
                        self._form.add_field(f)
                        self._content.append(f)
                    else:
                        print("WARNING: Detected K/V where key does not have content. Excluding key from output.")
                        print(f)
                        print(item)

    def get_lines_in_reading_order(self):
        """Group lines into columns and return ``[column_index, Line]`` pairs.

        A line joins the first existing column whose horizontal extent contains
        the line's centre, or whose own centre lies inside the line; otherwise it
        opens a new column. The result is sorted by column index with a stable
        sort, so lines keep their original order inside a column.
        """
        columns = []
        lines = []
        for item in self._lines:
            box = item.geometry.bounding_box
            bbox_left = box.left
            bbox_right = box.left + box.width
            bbox_centre = box.left + box.width / 2

            column_index = None
            for index, column in enumerate(columns):
                # Midpoint of the column; the parentheses matter.
                column_centre = (column['left'] + column['right']) / 2
                if column['left'] < bbox_centre < column['right'] or bbox_left < column_centre < bbox_right:
                    column_index = index
                    break
            if column_index is None:
                columns.append({'left': bbox_left, 'right': bbox_right})
                column_index = len(columns) - 1
            lines.append([column_index, item])

        lines.sort(key=lambda x: x[0])
        return lines

    def get_text_in_reading_order(self):
        """Return the text of all lines in reading order, one line per row."""
        return "".join(line.text + '\n' for _, line in self.get_lines_in_reading_order())

    @property
    def blocks(self):
        return self._blocks

    @property
    def text(self):
        return self._text

    @property
    def lines(self):
        return self._lines

    @property
    def form(self):
        return self._form

    @property
    def tables(self):
        return self._tables

    @property
    def selections(self):
        return self._selection_element

    @property
    def content(self):
        return self._content

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def orientation(self):
        return self._orientation


class Document:
    """Entry point: wraps one response dict (or a list of them) into pages."""

    def __init__(self, response_pages):
        if not isinstance(response_pages, list):
            response_pages = [response_pages]

        self._responsePages = response_pages
        self._pages = []

        self._parse()

    def __str__(self):
        s = "\nDocument\n==========\n"
        for p in self._pages:
            s = s + str(p) + "\n\n"
        return s

    def _parse_document_pages_and_block_map(self):
        """Split all blocks into per-page lists and index them by id.

        Returns a ``(document_pages, block_map)`` tuple. Blocks lacking
        ``BlockType`` or ``Id`` are ignored, as are blocks that appear before the
        first PAGE block. Every PAGE block gets an ``Orientation`` flag written
        into it (False when its first polygon point lies to the right of the
        second one).
        """
        block_map = {}
        document_pages = []
        document_page = None
        for page in self._responsePages:
            for block in page['Blocks']:
                if 'BlockType' not in block or 'Id' not in block:
                    continue
                block_map[block['Id']] = block

                if block['BlockType'] == 'PAGE':
                    polygon = block['Geometry']['Polygon']
                    block['Orientation'] = not polygon[0]['X'] > polygon[1]['X']
                    if document_page:
                        document_pages.append({"Blocks": document_page})
                    document_page = [block]
                elif document_page:
                    document_page.append(block)
        if document_page:
            document_pages.append({"Blocks": document_page})
        return document_pages, block_map

    def _parse(self):
        """Build one Page object per PAGE block found in the responses."""
        self._response_document_pages, self._block_map = self._parse_document_pages_and_block_map()
        for document_page in self._response_document_pages:
            self._pages.append(Page(document_page["Blocks"], self._block_map))

    @property
    def blocks(self):
        return self._responsePages

    @property
    def page_blocks(self):
        return self._response_document_pages

    @property
    def pages(self):
        return self._pages

    def get_block_by_id(self, block_id):
        """Return the raw block with ``block_id``, or ``None`` when unknown."""
        if self._block_map and block_id in self._block_map:
            return self._block_map[block_id]
        return None
"manifest": + { + "s3Path": "s3://pdfmappertofhirworkflow-textractsimplesyncworkflo-13ne9s3betcpj/uploads/fax-190517121115-6784593217-11_Redacted.pdf" + }, + "mime": "application/pdf", + "classification": null, + "numberOfPages": 2, + "textract_result": + { + "TextractTempOutputJsonPath": "s3://pdfmappertofhirworkflow-textractsimplesyncworkflo-13ne9s3betcpj/temp/a8cdb100b8414882978cef68cf49c7ed1e47a68680ffa9fd442a69ae8c05bf4c", + "TextractOutputJsonPath": "s3://pdfmappertofhirworkflow-textractsimplesyncworkflo-13ne9s3betcpj/textract-output/fax-190517121115-6784593217-11_Redacted2023-04-19T18:49:29.225383/fax-190517121115-6784593217-11_Redacted.json" + } +} \ No newline at end of file diff --git a/lambda/textract_comprehend_medical/template.yaml b/lambda/textract_comprehend_medical/template.yaml new file mode 100644 index 0000000..496fd45 --- /dev/null +++ b/lambda/textract_comprehend_medical/template.yaml @@ -0,0 +1,27 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: > + python3.9 + + Lambda function for PDF Mapper for FHIR + +Globals: + Function: + Timeout: 900 + +Resources: + PdfMapperForFhirFunction: + Type: AWS::Serverless::Function + Properties: + PackageType: Image + Architectures: + - x86_64 + Environment: + Variables: + HEALTHLAKE_ENDPOINT: https://healthlake.us-east-2.amazonaws.com/datastore/83c10afe8566667ac2489c8d989b2c14/r4/ + LOG_LEVEL: INFO + Metadata: + Dockerfile: Dockerfile + DockerContext: . 
def test_serializer_manifest(caplog):
    """Check that a feature manifest parses and exposes the expected keys.

    The manifest is read from the ``data`` folder next to this test file, so
    the test needs neither AWS credentials nor network access.
    ``simple_feature_manifest.json`` is used because the sibling
    ``sample_manifest.json`` ends with a trailing comma and is not valid JSON.
    """
    data_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
    manifest_path = os.path.join(data_folder, 'simple_feature_manifest.json')
    with open(manifest_path, encoding='utf-8') as manifest_file:
        json_content = json.load(manifest_file)

    assert json_content
    assert json_content['s3Path']
    assert json_content['textractFeatures']
    assert len(json_content['textractFeatures']) == 3
supportedPatterns.includes(integrationPattern)) {
diff --git a/src/textractComprehendMedical.ts b/src/textractComprehendMedical.ts
new file mode 100644
index 0000000..a598b9c
--- /dev/null
+++ b/src/textractComprehendMedical.ts
@@ -0,0 +1,189 @@
+import * as path from 'path';
+import { Duration } from 'aws-cdk-lib';
+import * as iam from 'aws-cdk-lib/aws-iam';
+import * as lambda from 'aws-cdk-lib/aws-lambda';
+import * as sfn from 'aws-cdk-lib/aws-stepfunctions';
+import * as tasks from 'aws-cdk-lib/aws-stepfunctions-tasks';
+import { Construct } from 'constructs';
+
+export interface TextractComprehendMedicalProps {
+  /**
+   * Memory of the Lambda function in MB (may need to increase for larger documents).
+   * @default 1024
+   */
+  readonly lambdaMemoryMB?: number;
+  /**
+   * Timeout of the Lambda function in seconds.
+   * @default 900
+   */
+  readonly lambdaTimeout?: number;
+  /** NOTE(review): declared, but not read by the construct, which always creates its own function. */
+  readonly textractComprehendMedicalFunction?: lambda.IFunction;
+  /**
+   * Value of the LOG_LEVEL environment variable of the Lambda function.
+   * @default INFO
+   */
+  readonly lambdaLogLevel?: string;
+  /**
+   * Name of the S3 bucket the generated S3 permissions are limited to.
+   * When omitted, the S3 permissions are granted on all resources.
+   */
+  readonly s3InputBucket?: string;
+  /** NOTE(review): declared, but not read by the construct; the role is created without an explicit name. */
+  readonly comprehendMedicalRoleName?: string;
+  /**
+   * Value of the COMPREHEND_MEDICAL_JOB_TYPE environment variable of the Lambda function.
+   * @default ICD10
+   */
+  readonly comprehendMedicalJobType?: string;
+  /**
+   * Prefix inside s3InputBucket the generated S3 permissions are limited to.
+   * Only used when s3InputBucket is set.
+   * @default uploads
+   */
+  readonly s3InputPrefix?: string;
+  /**
+   * PolicyStatements to attach to the Lambda function. When set, they replace
+   * all default statements of the function (S3, comprehendmedical:Start* and
+   * iam:PassRole), so they have to grant whatever the function needs.
+   */
+  readonly inputPolicyStatements?: iam.PolicyStatement[];
+}
+
+/**
+ * Builds the S3 statements shared by the Comprehend Medical role and the Lambda function.
+ *
+ * Without a bucket the statement applies to all resources. With a bucket, object
+ * access is limited to the prefix and s3:ListBucket is granted on the bucket ARN,
+ * because that action applies to buckets and has no effect on object ARNs.
+ */
+function s3AccessStatements(s3InputBucket: string | undefined, s3InputPrefix: string): iam.PolicyStatement[] {
+  if (s3InputBucket === undefined) {
+    return [
+      new iam.PolicyStatement({
+        actions: ['s3:GetObject', 's3:ListBucket', 's3:PutObject'],
+        effect: iam.Effect.ALLOW,
+        resources: ['*'],
+      }),
+    ];
+  }
+
+  const bucketArn = `arn:aws:s3:::${s3InputBucket}`;
+  return [
+    new iam.PolicyStatement({
+      actions: ['s3:GetObject', 's3:PutObject'],
+      effect: iam.Effect.ALLOW,
+      // posix.join always separates with '/'; plain path.join emits '\' on Windows.
+      resources: [
+        path.posix.join(bucketArn, s3InputPrefix, '/'),
+        path.posix.join(bucketArn, s3InputPrefix, '/*'),
+      ],
+    }),
+    new iam.PolicyStatement({
+      actions: ['s3:ListBucket'],
+      effect: iam.Effect.ALLOW,
+      resources: [bucketArn],
+      conditions: {
+        StringLike: { 's3:prefix': [`${s3InputPrefix}*`] },
+      },
+    }),
+  ];
+}
+
+/**
+ * Step Functions fragment that invokes a Docker-based Lambda function which is
+ * allowed to start Amazon Comprehend Medical jobs.
+ *
+ * The construct creates
+ * - an IAM role assumable by comprehendmedical.amazonaws.com with S3 access,
+ * - the Lambda function built from lambda/textract_comprehend_medical, which gets
+ *   the role ARN (COMPREHEND_MEDICAL_ROLE), the job type (COMPREHEND_MEDICAL_JOB_TYPE)
+ *   and LOG_LEVEL as environment variables,
+ * - a LambdaInvoke task whose output is the Lambda payload ('$.Payload').
+ *
+ * Example (Python)
+ * ```python
+ * comprehend_medical_task = tcdk.TextractComprehendMedical(
+ *     self,
+ *     "ComprehendMedical",
+ *     comprehend_medical_job_type="ICD10",
+ * )
+ * ```
+ */
+export class TextractComprehendMedical extends sfn.StateMachineFragment {
+  public readonly startState: sfn.State;
+  public readonly endStates: sfn.INextable[];
+  /** The Lambda function invoked by this fragment. */
+  public readonly textractComprehendMedicalFunction: lambda.IFunction;
+
+  constructor(parent: Construct, id: string, props: TextractComprehendMedicalProps) {
+    super(parent, id);
+
+    const lambdaMemoryMB = props.lambdaMemoryMB ?? 1024;
+    const lambdaTimeout = props.lambdaTimeout ?? 900;
+    const lambdaLogLevel = props.lambdaLogLevel ?? 'INFO';
+    const s3InputPrefix = props.s3InputPrefix ?? 'uploads';
+    const cmJobType = props.comprehendMedicalJobType ?? 'ICD10';
+
+    // Role assumable by Comprehend Medical; its ARN is exposed to the function as COMPREHEND_MEDICAL_ROLE.
+    const comprehendMedicalRole = new iam.Role(this, 'RoleComprehendMedical', {
+      assumedBy: new iam.ServicePrincipal('comprehendmedical.amazonaws.com'),
+    });
+    for (const statement of s3AccessStatements(props.s3InputBucket, s3InputPrefix)) {
+      comprehendMedicalRole.addToPolicy(statement);
+    }
+
+    this.textractComprehendMedicalFunction = new lambda.DockerImageFunction(
+      this,
+      'TextractComprehendMedical',
+      {
+        code: lambda.DockerImageCode.fromImageAsset(
+          path.join(__dirname, '../lambda/textract_comprehend_medical/'),
+        ),
+        architecture: lambda.Architecture.X86_64,
+        memorySize: lambdaMemoryMB,
+        timeout: Duration.seconds(lambdaTimeout),
+        environment: {
+          LOG_LEVEL: lambdaLogLevel,
+          COMPREHEND_MEDICAL_ROLE: comprehendMedicalRole.roleArn,
+          COMPREHEND_MEDICAL_JOB_TYPE: cmJobType,
+        },
+      },
+    );
+
+    if (props.inputPolicyStatements === undefined) {
+      for (const statement of s3AccessStatements(props.s3InputBucket, s3InputPrefix)) {
+        this.textractComprehendMedicalFunction.addToRolePolicy(statement);
+      }
+      this.textractComprehendMedicalFunction.addToRolePolicy(
+        new iam.PolicyStatement({
+          actions: ['comprehendmedical:Start*'],
+          resources: ['*'],
+        }),
+      );
+      // The function may pass a role, but only to the Comprehend Medical service.
+      this.textractComprehendMedicalFunction.addToRolePolicy(
+        new iam.PolicyStatement({
+          actions: ['iam:PassRole'],
+          resources: ['*'],
+          conditions: {
+            StringEquals: {
+              'iam:PassedToService': 'comprehendmedical.amazonaws.com',
+            },
+          },
+        }),
+      );
+    } else {
+      for (const policyStatement of props.inputPolicyStatements) {
+        this.textractComprehendMedicalFunction.addToRolePolicy(policyStatement);
+      }
+    }
+
+    const textractComprehendMedicalLambdaInvoke = new tasks.LambdaInvoke(this, id, {
+      lambdaFunction: this.textractComprehendMedicalFunction,
+      outputPath: '$.Payload',
+    });
+    this.startState = textractComprehendMedicalLambdaInvoke;
+    this.endStates = [textractComprehendMedicalLambdaInvoke];
+  }
+}