#!/usr/bin/env python
# coding: utf-8
# ## Table of Contents
#
# 1. [Initial Setup](#chapter1)
# * [1.1. Install pdf2docx](#section_1_1)
# * [1.2. Create an S3 bucket in the same region](#section_1_2)
# * [1.3. Create Amazon Translate Batch Service Policy](#section_1_3)
# * [1.4. Create Amazon Translate Batch Service Role](#section_1_4)
# * [1.5. Attach the policy to the Service Role](#section_1_5)
#
# ----
#
# 2. [Upload multiple files to S3](#chapter2)
# * [2.1. Widget to upload multiple files](#section_2_1)
# * [2.2. Write to S3 bucket](#section_2_2)
#
# ----
#
# 3. [Translate Japanese documents to English using Batch Translation](#chapter3)
# * [3.1. Create and start the batch translation job](#section_3_1)
# * [3.2. Check the status of the job](#section_3_2)
#
# ----
#
# 4. [Verify and clean up](#chapter4)
# * [4.1. Verify the document created in S3](#section_4_1)
# * [4.2. Clean Up (optional)](#section_4_2)
#
# ----
# ### 1. Initial Setup
# Run this section to install any libraries necessary and any IAM policies or roles needed as a prerequisite
# #### 1.1 Install pdf2docx
# Install the library [pdf2docx](https://github.com/dothinking/pdf2docx) to convert PDF to docx, as [Amazon Translate](https://aws.amazon.com/translate/) does not currently support PDF formats.
# In[ ]:
get_ipython().system('pip3 install pdf2docx')
# #### 1.2 Create an S3 bucket in the same region
# _For example, since this focuses on Japanese to English translation, we can name the prefixes accordingly:_
#
# _Choose a unique bucket name_
#
# bucket_name='translate-ja-en-kunal'
#
# in_prefix_name='Japanese/input'
#
# **Enter a unique bucket name before running the below cell**
# In[ ]:
import boto3
from pprint import pprint

# Enter the unique S3 bucket name before running
bucket_name = 'translate-ja-en-kunal'

my_region = boto3.session.Session().region_name
s3_client = boto3.client('s3', region_name=my_region)

# us-east-1 is the default S3 location and must NOT be passed as a
# LocationConstraint -- doing so raises an InvalidLocationConstraint error.
if my_region == 'us-east-1':
    response = s3_client.create_bucket(Bucket=bucket_name)
else:
    response = s3_client.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={'LocationConstraint': my_region},
    )
pprint(response)
# #### 1.3 Create Amazon Translate Batch Service Policy
# _Enter the bucket name created above, Policy Name_
# _For example:-_
#
# bucket_name='translate-ja-en-kunal'
#
# PolicyName='AmazonTranslateServicePolicy-Japanese-English-Document-Translation'
#
# Description='Amazon Translate service role policy for Batch'
# In[ ]:
import json

client = boto3.client('iam')

# You may use the same Policy Name as long as it is not taken in your account
policy_name = 'AmazonTranslateServicePolicy-Japanese-English-Document-Translation'
policy_desc = 'Amazon Translate service role policy for Batch'

# Least-privilege policy for a Translate batch job: read the input objects,
# list the bucket, and write the translated output objects.
# (The original listed each ARN twice in the first two statements; the
# duplicates were redundant and have been removed.)
policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Action": ["s3:GetObject"],
            "Resource": ["arn:aws:s3:::" + bucket_name + "/*"],
            "Effect": "Allow"
        },
        {
            "Action": ["s3:ListBucket"],
            "Resource": ["arn:aws:s3:::" + bucket_name],
            "Effect": "Allow"
        },
        {
            "Action": ["s3:PutObject"],
            "Resource": ["arn:aws:s3:::" + bucket_name + "/*"],
            "Effect": "Allow"
        }
    ]
}

response = client.create_policy(
    PolicyName=policy_name,
    PolicyDocument=json.dumps(policy_document),
    Description=policy_desc
)
policy_response = response
policy_arn = policy_response['Policy']['Arn']
print("Bucket Name",bucket_name)
print("Policy Name:",policy_name)
print("Policy Arn:",policy_arn)
# #### 1.4 Create Amazon Translate Batch Service Role
# _Enter a role name and description_
# _For example:-_
#
# RoleName='AmazonTranslateServiceRole-Japanese-English-Document-Translation'
#
# Description='Amazon Translate service role for Batch.'
# In[ ]:
import boto3
import json

client = boto3.client('iam')

# You may reuse this role name as long as it is not already taken in your account.
role_name = 'AmazonTranslateServiceRole-Japanese-English-Document-Translation'
role_desc = 'Amazon Translate service role for Batch.'

# Trust policy that lets the Amazon Translate service assume this role.
assume_role_document = json.dumps({
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "translate.amazonaws.com"},
            "Action": "sts:AssumeRole",
        },
    ],
})

response = client.create_role(
    Path='/service-role/',
    RoleName=role_name,
    AssumeRolePolicyDocument=assume_role_document,
    Description=role_desc,
)
role_response = response
role_arn = role_response['Role']['Arn']
print("Role Name:",role_name)
print("Role Arn:",role_arn)
# #### 1.5 Attach the policy to the Service Role
# In[ ]:
# Attach a role policy
# Grant the Translate service role the S3 permissions defined in section 1.3
# by attaching the customer-managed policy created above to the role.
client.attach_role_policy(
    PolicyArn=policy_arn,
    RoleName=role_name
)
# ----
# ### 2. Upload multiple files to S3
#
# Upload multiple Japanese documents to be translated from desktop.
# Accepted formats are _docx_, _pdf_
# #### 2.1 Widget to upload multiple files
#
# Accepted formats are _docx_, _pdf_
# In[ ]:
# Create the upload widget to upload the file from local
# Click to upload files (docx / pdf)
from ipywidgets import FileUpload
from IPython.display import display
# After the user selects files, their bytes and metadata become
# available through `upload.value` in the next cell.
upload = FileUpload(accept='.docx,.pdf', multiple=True)
display(upload)
# #### 2.2 Write to S3 bucket
#
# * docx will be written to S3
# * pdf will be converted to docx before writing
# In[ ]:
from pdf2docx import parse
import os

# Translation input and output key prefixes in S3
in_prefix_name = 'Japanese/input'
out_prefix_name = 'Japanese/output'

s3 = boto3.resource('s3', region_name=my_region)

# NOTE(review): `upload.value` as a {filename: metadata} dict is the
# ipywidgets 7.x API; ipywidgets 8+ returns a tuple of dicts -- confirm version.
for name, md in upload.value.items():
    if md['metadata']['type'] == 'application/pdf':
        # Persist the PDF locally so pdf2docx can convert it to docx,
        # since Amazon Translate does not accept PDF input.
        with open(name, 'wb') as file:
            file.write(md['content'])
        filename, _ = os.path.splitext(name)
        newfilename = filename + '.docx'
        parse(name, newfilename, start=0, end=None)
        # Join S3 keys with '/' explicitly: keys are not filesystem paths,
        # and os.path.join would emit '\\' separators on Windows.
        s3.Bucket(bucket_name).upload_file(newfilename, in_prefix_name + '/' + newfilename)
        # Clean up the temporary local copies.
        os.remove(name)
        os.remove(newfilename)
    else:
        # docx bytes can be uploaded directly from memory; the original
        # also opened (and left behind) an empty local file here by mistake.
        s3.Object(bucket_name, in_prefix_name + '/' + name).put(Body=md['content'])
# ----
# #### 3.1 Create and start the batch translation job
# In[ ]:
from datetime import datetime

client = boto3.client('translate')

# Timestamp suffix keeps the job name unique across notebook reruns.
now = datetime.now().strftime("%m%d%Y%H%M%S")
job_name = 'japanese-to-english-multi-pages' + '-' + now

# MIME type for .docx -- the only format uploaded by section 2.2.
content_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

# Build S3 URIs with '/' explicitly; os.path.join would produce
# backslash-separated (invalid) URIs when run on Windows.
job_response = client.start_text_translation_job(
    JobName=job_name,
    InputDataConfig={
        'S3Uri': 's3://' + bucket_name + '/' + in_prefix_name,
        'ContentType': content_type
    },
    OutputDataConfig={
        'S3Uri': 's3://' + bucket_name + '/' + out_prefix_name
    },
    DataAccessRoleArn=role_arn,
    SourceLanguageCode='ja',
    TargetLanguageCodes=[
        'en',
    ]
)
job_id = job_response['JobId']
job_status = job_response['JobStatus']
print("JobId",job_id)
print("JobStatus",job_status)
print("Job Name",job_name)
pprint(job_response)
# #### 3.2 Check the status of the job
#
# Keep checking on the JobStatus which will change from **SUBMITTED** --> **IN_PROGRESS** --> **COMPLETED**
# In[ ]:
# Get job status
# Rerun this cell until JobStatus moves SUBMITTED -> IN_PROGRESS -> COMPLETED.
status_response = client.describe_text_translation_job(JobId=job_id)
job_status = status_response['TextTranslationJobProperties']['JobStatus']
print("Job Name",job_name)
print("Job Status",job_status)
pprint(status_response)
# ----
# ### 4. Verify and clean up
#
# Verify the translated document created in S3 and then clean up resources (optional).
# #### 4.1 Verify the document created in s3
#
# Verify the translated document created in s3 location
# In[ ]:
print(os.path.join('s3://',bucket_name,out_prefix_name))
# #### 4.2 Clean up (optional)
#
# Clean up the resources created after you are done.
# In[ ]:
# Remind the user of every resource this notebook created so they can
# delete them manually once finished.
print("Reminder : Following are the resources which you created in this Notebook which needs to be cleaned up after you are done in region,{}.".format(my_region))
for label, value in (
    ("Bucket Name", bucket_name),
    ("Policy Name:", policy_name),
    ("Policy Arn:", policy_arn),
    ("Role Name:", role_name),
    ("Role Arn:", role_arn),
):
    print(label, value)
# ##### All Done!