Run this section to install any libraries necessary and set up the IAM policy and roles needed as a pre-requisite.
Install the pdf2docx library to convert PDF to DOCX, because Amazon Translate does not currently support PDF formats.
!pip3 install pdf2docx
For example, since this notebook focuses on Japanese-to-English translation, we can name the prefixes accordingly:
Choose a unique bucket name
bucket_name='translate-ja-en-kunal'
in_prefix_name='Japanese/input'
Enter a unique bucket name before running the below cell
import boto3
from pprint import pprint

# Enter the unique S3 bucket name before running
bucket_name = 'translate-ja-en-kunal'

my_region = boto3.session.Session().region_name
s3_client = boto3.client('s3', region_name=my_region)

# us-east-1 is S3's default location and must NOT be passed as a
# LocationConstraint -- doing so raises InvalidLocationConstraint.
# A region of None (no default configured) also means us-east-1.
if my_region is None or my_region == 'us-east-1':
    response = s3_client.create_bucket(Bucket=bucket_name)
else:
    location = {'LocationConstraint': my_region}
    response = s3_client.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration=location,
    )
pprint(response)
Enter the bucket name created above and a policy name. For example:
bucket_name='translate-ja-en-kunal'
PolicyName='AmazonTranslateServicePolicy-Japanese-English-Document-Translation'
Description='Amazon Translate service role policy for Batch'
import json

client = boto3.client('iam')

# You may reuse this policy name as long as it is not already taken in your account.
policy_name = 'AmazonTranslateServicePolicy-Japanese-English-Document-Translation'
policy_desc = 'Amazon Translate service role policy for Batch'

# Least-privilege policy for the Translate batch job: read the input
# objects, list the bucket, and write the translated output objects.
# (The original document listed the same ARN twice in the GetObject and
# ListBucket statements; one entry per resource suffices.)
policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Action": ["s3:GetObject"],
            "Resource": ["arn:aws:s3:::" + bucket_name + "/*"],
            "Effect": "Allow"
        },
        {
            "Action": ["s3:ListBucket"],
            "Resource": ["arn:aws:s3:::" + bucket_name],
            "Effect": "Allow"
        },
        {
            "Action": ["s3:PutObject"],
            "Resource": ["arn:aws:s3:::" + bucket_name + "/*"],
            "Effect": "Allow"
        }
    ]
}

response = client.create_policy(
    PolicyName=policy_name,
    PolicyDocument=json.dumps(policy_document),
    Description=policy_desc
)
policy_response = response
policy_arn = policy_response['Policy']['Arn']
print("Bucket Name", bucket_name)
print("Policy Name:", policy_name)
print("Policy Arn:", policy_arn)
Enter a role name and description. For example:
RoleName='AmazonTranslateServiceRole-Japanese-English-Document-Translation'
Description='Amazon Translate service role for Batch.'
import boto3
import json

client = boto3.client('iam')

# You may reuse this role name as long as it is not already taken in your account.
role_name = 'AmazonTranslateServiceRole-Japanese-English-Document-Translation'
role_desc = 'Amazon Translate service role for Batch.'

# Trust policy allowing the Amazon Translate service to assume this role.
trust_relationship_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "translate.amazonaws.com"},
            "Action": "sts:AssumeRole",
        }
    ],
}

role_response = client.create_role(
    Path='/service-role/',
    RoleName=role_name,
    AssumeRolePolicyDocument=json.dumps(trust_relationship_policy),
    Description=role_desc,
)
role_arn = role_response['Role']['Arn']
print("Role Name:", role_name)
print("Role Arn:", role_arn)

# Attach the S3 access policy created earlier to the new role.
client.attach_role_policy(PolicyArn=policy_arn, RoleName=role_name)
Upload multiple Japanese documents to be translated from your desktop. Accepted formats are docx and pdf.
# Create the upload widget to upload the file from local
# Click to upload files (docx / pdf)
# Interactive widget: click it in the notebook to select one or more
# local .docx / .pdf files; their bytes become available via
# upload.value in the next cell.
from ipywidgets import FileUpload
from IPython.display import display
upload = FileUpload(accept='.docx,.pdf', multiple=True)
display(upload)
from pdf2docx import parse
import os

# S3 prefixes for the batch-translation input and output documents.
in_prefix_name = 'Japanese/input'
out_prefix_name = 'Japanese/output'

s3 = boto3.resource('s3', region_name=my_region)

# NOTE(review): this iteration shape assumes ipywidgets 7.x, where
# upload.value maps filename -> {'metadata': ..., 'content': ...}.
for name, md in upload.value.items():
    if md['metadata']['type'] == 'application/pdf':
        # Amazon Translate does not accept pdf, so write the bytes to a
        # temporary local file and convert it to docx first.
        with open(name, 'wb') as file:
            file.write(md['content'])
        stem, _ = os.path.splitext(name)
        newfilename = stem + '.docx'
        parse(name, newfilename, start=0, end=None)
        s3.Bucket(bucket_name).upload_file(
            newfilename, os.path.join(in_prefix_name, newfilename))
        # Clean up both temporary local copies.
        os.remove(name)
        os.remove(newfilename)
    else:
        # docx can be streamed to S3 straight from memory. (The original
        # code also opened a local file in 'wb' mode here without ever
        # writing to or removing it, leaking an empty file on disk.)
        s3.Object(bucket_name, os.path.join(in_prefix_name, name)).put(
            Body=md['content'])
from datetime import datetime

client = boto3.client('translate')

# Timestamp suffix keeps the job name unique across reruns.
now = datetime.now().strftime("%m%d%Y%H%M%S")
job_name = 'japanese-to-english-multi-pages' + '-' + now
content_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

# Build the S3 URIs explicitly: os.path.join would emit backslashes on
# Windows, producing invalid S3 URIs.
input_uri = 's3://{}/{}'.format(bucket_name, in_prefix_name)
output_uri = 's3://{}/{}'.format(bucket_name, out_prefix_name)

# Kick off the asynchronous batch translation job (ja -> en).
job_response = client.start_text_translation_job(
    JobName=job_name,
    InputDataConfig={
        'S3Uri': input_uri,
        'ContentType': content_type
    },
    OutputDataConfig={
        'S3Uri': output_uri
    },
    DataAccessRoleArn=role_arn,
    SourceLanguageCode='ja',
    TargetLanguageCodes=[
        'en',
    ]
)
job_id = job_response['JobId']
job_status = job_response['JobStatus']
print("JobId", job_id)
print("JobStatus", job_status)
print("Job Name", job_name)
pprint(job_response)
Keep checking on the JobStatus which will change from SUBMITTED --> IN_PROGRESS --> COMPLETED
# Get job status. Re-run this cell to poll: the status moves through
# SUBMITTED -> IN_PROGRESS -> COMPLETED (or FAILED / COMPLETED_WITH_ERROR).
status_response = client.describe_text_translation_job(
    JobId=job_id
)
# Current lifecycle state of the batch job.
job_status=status_response['TextTranslationJobProperties']['JobStatus']
print("Job Name",job_name)
print("Job Status",job_status)
pprint(status_response)
Verify the translated document created in S3 and then clean up resources (optional).
Verify the translated documents created in the S3 output location printed below.
# Print the S3 output location. Build the URI explicitly rather than
# with os.path.join, which would emit backslashes on Windows.
print('s3://{}/{}'.format(bucket_name, out_prefix_name))
Clean up the resources created after you are done.
# Summary of every resource this notebook created, for manual cleanup.
print("Reminder : Following are the resources which you created in this Notebook which needs to be cleaned up after you are done in region,{}.".format(my_region))
for label, value in [
    ("Bucket Name", bucket_name),
    ("Policy Name:", policy_name),
    ("Policy Arn:", policy_arn),
    ("Role Name:", role_name),
    ("Role Arn:", role_arn),
]:
    print(label, value)