#!/usr/bin/env python # coding: utf-8 # ## Table of Contents # # 1. [Initial Setup](#chapter1) # * [1.1. Install pdf2docx](#section_1_1) # * [1.2. Create an S3 bucket in the same region](#section_1_2) # * [1.3. Create Amazon Translate Batch Service Policy](#section_1_3) # * [1.4. Create Amazon Translate Batch Service Role](#section_1_4) # * [1.5. Attach the policy to the Service Role](#section_1_5) # # ---- # # 2. [Upload multiple files to S3](#chapter2) # * [2.1. Widget to upload multiple files](#section_2_1) # * [2.2. Write to S3 bucket](#section_2_2) # # ---- # # 3. [Translate Japanese documents to English using Batch Translation](#chapter3) # * [3.1. Create and start the batch translation job](#section_3_1) # * [3.2. Check the status of the job](#section_3_2) # # ---- # # 4. [Verify and clean up](#chapter4) # * [4.1. Verify the document created in S3](#section_4_1) # * [4.2. Clean Up (optional)](#section_4_2) # # ---- # ### 1. Initial Setup # Run this section to install any libraries necessary and any IAM policies or roles needed as a pre-requisite # #### 1.1 Install pdf2docx # Install the library [pdf2docx](https://pdf2docx.com/) to convert pdf to docx as [Amazon Translate](https://aws.amazon.com/translate/) does not currently support pdf formats. 
# In[ ]:

get_ipython().system('pip3 install pdf2docx')

# #### 1.2 Create an S3 bucket in the same region
# _For example, since this focuses on Japanese to English translation we can name the prefixes accordingly:-_
#
# _Choose a unique bucket name_
#
# bucket_name='translate-ja-en-kunal'
#
# in_prefix_name='Japanese/input'
#
# **Enter a unique bucket name before running the below cell**

# In[ ]:

import boto3
from pprint import pprint

# Enter the unique S3 bucket name before running
bucket_name = 'translate-ja-en-kunal'

# Keep the bucket, the Translate batch job, and this notebook in one region.
my_region = boto3.session.Session().region_name
s3_client = boto3.client('s3', region_name=my_region)

# BUG FIX: the S3 CreateBucket API rejects 'us-east-1' as a
# LocationConstraint -- the CreateBucketConfiguration must be omitted
# entirely when creating a bucket in that region.
if my_region == 'us-east-1':
    response = s3_client.create_bucket(Bucket=bucket_name)
else:
    location = {'LocationConstraint': my_region}
    response = s3_client.create_bucket(Bucket=bucket_name,
                                       CreateBucketConfiguration=location)
pprint(response)

# #### 1.3 Create Amazon Translate Batch Service Policy
# _Enter the bucket name created above, Policy Name_
# _For example:-_
#
# bucket_name='translate-ja-en-kunal'
#
# PolicyName='AmazonTranslateServicePolicy-Japanese-English-Document-Translation'
#
# Description='Amazon Translate service role policy for Batch'

# In[ ]:

import json

client = boto3.client('iam')

# You may use the same Policy Name as long as it is not taken in your account
policy_name = 'AmazonTranslateServicePolicy-Japanese-English-Document-Translation'
policy_desc = 'Amazon Translate service role policy for Batch'

# Least-privilege policy for the Translate batch service role:
# read and write objects under this bucket, and list the bucket itself.
# (The original listed every Resource ARN twice; duplicates removed, and the
# identically-scoped GetObject/PutObject statements merged.)
policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": ["s3:GetObject", "s3:PutObject"],
            "Resource": ["arn:aws:s3:::" + bucket_name + "/*"]
        },
        {
            "Effect": "Allow",
            "Action": ["s3:ListBucket"],
            "Resource": ["arn:aws:s3:::" + bucket_name]
        }
    ]
}

response = client.create_policy(
    PolicyName=policy_name,
    PolicyDocument=json.dumps(policy_document),
    Description=policy_desc
)
policy_response = response
# Pull the ARN of the policy created in 1.3 and echo what was built so far.
policy_arn = policy_response['Policy']['Arn']

print("Bucket Name", bucket_name)
print("Policy Name:", policy_name)
print("Policy Arn:", policy_arn)

# #### 1.4 Create Amazon Translate Batch Service Role
# _Enter a role name and description_
# _For example:-_
#
# RoleName='AmazonTranslateServiceRole-Japanese-English-Document-Translation'
#
# Description='Amazon Translate service role for Batch.'

# In[ ]:

import boto3
import json

client = boto3.client('iam')

# You may use the same Role Name as long as it is not taken in your account
role_name = 'AmazonTranslateServiceRole-Japanese-English-Document-Translation'
role_desc = 'Amazon Translate service role for Batch.'

# Trust policy allowing the Amazon Translate service to assume this role.
trust_relationship_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "translate.amazonaws.com"},
            "Action": "sts:AssumeRole"
        }
    ]
}

response = client.create_role(
    Path='/service-role/',
    RoleName=role_name,
    AssumeRolePolicyDocument=json.dumps(trust_relationship_policy),
    Description=role_desc
)
role_response = response
role_arn = role_response['Role']['Arn']

print("Role Name:", role_name)
print("Role Arn:", role_arn)

# #### 1.5 Attach the policy to the Service Role

# In[ ]:

# Attach the batch-service policy from 1.3 to the role created above.
client.attach_role_policy(
    PolicyArn=policy_arn,
    RoleName=role_name
)

# ----
# ### 2. Upload multiple files to S3
#
# Upload multiple Japanese documents to be translated from desktop.
# Accepted formats are _docx_, _pdf_

# #### 2.1 Widget to upload multiple files
#
# Accepted formats are _docx_, _pdf_

# In[ ]:

# Create the upload widget to upload files from the local machine.
# Click to upload files (docx / pdf)
from ipywidgets import FileUpload
from IPython.display import display

upload = FileUpload(accept='.docx,.pdf', multiple=True)
display(upload)

# #### 2.2 Write to S3 bucket
#
# * docx will be written to S3
# * pdf will be converted to docx before writing

# In[ ]:

from pdf2docx import parse
import os

# Translation input and output file prefixes in S3
in_prefix_name = 'Japanese/input'
out_prefix_name = 'Japanese/output'

s3 = boto3.resource('s3', region_name=my_region)

# NOTE(review): this iterates upload.value as a dict (ipywidgets 7 API);
# ipywidgets 8 changed .value to a tuple -- confirm the installed version.
for name, md in upload.value.items():
    if md['metadata']['type'] == 'application/pdf':
        # Amazon Translate does not accept PDF input: save the upload
        # locally, convert it to docx, then push the docx to S3.
        with open(name, 'wb') as file:
            file.write(md['content'])
        filename, file_extension = os.path.splitext(name)
        newfilename = filename + '.docx'
        parse(name, newfilename, start=0, end=None)
        # Build the S3 key with '/' explicitly -- os.path.join would emit
        # backslashes on Windows and produce an invalid key.
        s3.Bucket(bucket_name).upload_file(newfilename, in_prefix_name + '/' + newfilename)
        # Remove the temporary local copies.
        os.remove(name)
        os.remove(newfilename)
    else:
        # BUG FIX: the original opened a local file for writing here and never
        # wrote to it, leaving an empty stray file behind; docx content can be
        # streamed straight to S3 with no local copy.
        s3.Object(bucket_name, in_prefix_name + '/' + name).put(Body=md['content'])

# ----
# #### 3.1 Create and start the batch translation job

# In[ ]:

from datetime import datetime

client = boto3.client('translate')

# Timestamp suffix keeps the job name unique across notebook re-runs.
now = datetime.now().strftime("%m%d%Y%H%M%S")
job_name = 'japanese-to-english-multi-pages' + '-' + now
# MIME type for .docx documents
content_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

# S3 URIs are assembled with f-strings rather than os.path.join, which is
# platform-dependent and would break the URIs on Windows.
job_response = client.start_text_translation_job(
    JobName=job_name,
    InputDataConfig={
        'S3Uri': f's3://{bucket_name}/{in_prefix_name}',
        'ContentType': content_type
    },
    OutputDataConfig={
        'S3Uri': f's3://{bucket_name}/{out_prefix_name}'
    },
    DataAccessRoleArn=role_arn,
    SourceLanguageCode='ja',
    TargetLanguageCodes=[
        'en',
    ]
)
job_id = job_response['JobId']
job_status = job_response['JobStatus']
print("JobId", job_id)
print("JobStatus", job_status)
print("Job Name", job_name)
pprint(job_response)

# #### 3.2 Check the status of the job
#
# Keep checking on the JobStatus which will change from **SUBMITTED** --> **IN_PROGRESS** --> **COMPLETED**

# In[ ]:

# Get job status -- re-run this cell until the job reaches COMPLETED.
status_response = client.describe_text_translation_job(
    JobId=job_id
)
job_status = status_response['TextTranslationJobProperties']['JobStatus']
print("Job Name", job_name)
print("Job Status", job_status)
pprint(status_response)

# ----
# ### 4. Verify and clean up
#
# Verify the translated document created in S3 and then clean up resources (optional).

# #### 4.1 Verify the document created in s3
#
# Verify the translated document created in s3 location

# In[ ]:

# BUG FIX: the S3 URI is built with an f-string -- os.path.join is
# platform-dependent and would emit backslashes on Windows.
print(f's3://{bucket_name}/{out_prefix_name}')

# #### 4.2 Clean up (optional)
#
# Clean up the resources created after you are done.

# In[ ]:

# Print a reminder of every resource this notebook created so the user can
# delete them manually when finished.
print("Reminder : Following are the resources which you created in this Notebook which needs to be cleaned up after you are done in region,{}.".format(my_region))
print("Bucket Name", bucket_name)
print("Policy Name:", policy_name)
print("Policy Arn:", policy_arn)
print("Role Name:", role_name)
print("Role Arn:", role_arn)

# ##### All Done!