#!/usr/bin/env python
# coding: utf-8

# # Sentence Embeddings with Hugging Face Transformers, Sentence Transformers and Amazon SageMaker - Custom Inference for creating document embeddings with Hugging Face's Transformers
#
# Welcome to this getting started guide. We will use the Hugging Face Inference DLCs and the Amazon SageMaker Python SDK to create a [real-time inference endpoint](https://docs.aws.amazon.com/sagemaker/latest/dg/realtime-endpoints.html) running a Sentence Transformers model for document embeddings. Currently, the [SageMaker Hugging Face Inference Toolkit](https://github.com/aws/sagemaker-huggingface-inference-toolkit) supports the [pipeline feature](https://huggingface.co/transformers/main_classes/pipelines.html) from Transformers for zero-code deployment. This means you can run compatible Hugging Face Transformer models without providing pre- & post-processing code. We only need to provide the environment variables `HF_TASK` and `HF_MODEL_ID` when creating our endpoint, and the Inference Toolkit takes care of the rest. This is a great feature if you are working with existing [pipelines](https://huggingface.co/transformers/main_classes/pipelines.html).
#
# If you want to run other tasks, such as creating document embeddings, you can provide the pre- and post-processing code yourself via an `inference.py` script. The Hugging Face Inference Toolkit allows the user to override the default methods of the `HuggingFaceHandlerService`.
#
# The custom module can override the following methods:
#
# - `model_fn(model_dir)` overrides the default method for loading a model. The return value `model` will be used in `predict_fn` for predictions.
#   - `model_dir` is the path to your unzipped `model.tar.gz`.
# - `input_fn(input_data, content_type)` overrides the default method for pre-processing. The return value `data` will be used in `predict_fn` for predictions. The inputs are:
#   - `input_data` is the raw body of your request.
#   - `content_type` is the content type from the request header.
# - `predict_fn(processed_data, model)` overrides the default method for predictions. The return value `predictions` will be used in `output_fn`.
#   - `model` is the return value of the `model_fn` method.
#   - `processed_data` is the return value of the `input_fn` method.
# - `output_fn(prediction, accept)` overrides the default method for post-processing. The return value `result` will be the response to your request (e.g. `JSON`). The inputs are:
#   - `prediction` is the result from `predict_fn`.
#   - `accept` is the accept type from the HTTP request header, e.g. `application/json`.
#
# In this example we are going to use Sentence Transformers to create sentence embeddings, using a mean pooling layer on the raw token representations.
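#
# For orientation, here is a minimal skeleton of such a custom module showing all four overridable methods. This is an illustrative sketch only, not the script we deploy later (our real `inference.py` below overrides just `model_fn` and `predict_fn`), and the placeholder body of `predict_fn` is hypothetical:
#
# ```python
# import json
# from transformers import AutoModel, AutoTokenizer
#
#
# def model_fn(model_dir):
#     # load model & tokenizer from the unzipped model.tar.gz
#     tokenizer = AutoTokenizer.from_pretrained(model_dir)
#     model = AutoModel.from_pretrained(model_dir)
#     return model, tokenizer
#
#
# def input_fn(input_data, content_type):
#     # deserialize the raw request body, e.g. a JSON string -> dict
#     return json.loads(input_data)
#
#
# def predict_fn(processed_data, model_and_tokenizer):
#     model, tokenizer = model_and_tokenizer
#     # hypothetical placeholder: tokenization + model forward pass would go here
#     return {"predictions": []}
#
#
# def output_fn(prediction, accept):
#     # serialize the prediction as the response body, e.g. JSON
#     return json.dumps(prediction)
# ```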
#
# *NOTE: You can run this demo in SageMaker Studio, on your local machine, or on a SageMaker Notebook Instance.*

# ## Development Environment and Permissions
#
# ### Installation

# In[ ]:


get_ipython().run_line_magic('pip', 'install sagemaker --upgrade')


# Install `git` and `git-lfs`

# In[ ]:


# For notebook instances (Amazon Linux)
get_ipython().system('sudo yum update -y')
get_ipython().system('curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | sudo bash')
get_ipython().system('sudo yum install git-lfs git -y')

# For other environments (Ubuntu)
get_ipython().system('sudo apt-get update -y')
get_ipython().system('curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash')
get_ipython().system('sudo apt-get install git-lfs git -y')


# ### Permissions
#
# _If you are going to use SageMaker in a local environment (not SageMaker Studio or a Notebook Instance), you need access to an IAM role with the required permissions for SageMaker. You can find more about this [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html)._

# In[7]:


import sagemaker
import boto3

sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


# ## Create a custom `inference.py` script
#
# To use a custom inference script, we need to create an `inference.py` script. In our example, we are going to overwrite `model_fn` to load our sentence transformer correctly and `predict_fn` to apply mean pooling.
#
# We are going to use the [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) model. It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
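#
# If you want to sanity-check the model locally before building the endpoint, here is a minimal sketch using the `sentence-transformers` library (an extra dependency this guide does not otherwise install, i.e. it assumes a prior `pip install sentence-transformers`):
#
# ```python
# from sentence_transformers import SentenceTransformer
#
# # download the model from the Hub and embed a single sentence
# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# embedding = model.encode("This is an example sentence")
# print(embedding.shape)  # expected: (384,)
# ```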
# In[17]:


get_ipython().system('mkdir code')


# In[38]:


get_ipython().run_cell_magic('writefile', 'code/inference.py', '\nfrom transformers import AutoTokenizer, AutoModel\nimport torch\nimport torch.nn.functional as F\n\n# Helper: mean pooling - take the attention mask into account for correct averaging\ndef mean_pooling(model_output, attention_mask):\n    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings\n    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n\n\ndef model_fn(model_dir):\n    # Load model and tokenizer from the unzipped model.tar.gz\n    tokenizer = AutoTokenizer.from_pretrained(model_dir)\n    model = AutoModel.from_pretrained(model_dir)\n    return model, tokenizer\n\n\ndef predict_fn(data, model_and_tokenizer):\n    # unpack model and tokenizer\n    model, tokenizer = model_and_tokenizer\n\n    # Tokenize sentences\n    sentences = data.pop("inputs", data)\n    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors=\'pt\')\n\n    # Compute token embeddings\n    with torch.no_grad():\n        model_output = model(**encoded_input)\n\n    # Perform pooling\n    sentence_embeddings = mean_pooling(model_output, encoded_input[\'attention_mask\'])\n\n    # Normalize embeddings\n    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n\n    # return a dictionary, which will be JSON serializable\n    return {"vectors": sentence_embeddings[0].tolist()}\n')


# ## Create `model.tar.gz` with inference script and model
#
# To use our `inference.py`, we need to bundle it into a `model.tar.gz` archive together with all our model artifacts, e.g. `pytorch_model.bin`. The `inference.py` script will be placed into a `code/` folder inside the archive. We will use `git` and `git-lfs` to easily download our model from hf.co/models and upload it to Amazon S3, so we can use it when creating our SageMaker endpoint.

# In[39]:


repository = "sentence-transformers/all-MiniLM-L6-v2"
model_id = repository.split("/")[-1]
s3_location = f"s3://{sess.default_bucket()}/custom_inference/{model_id}/model.tar.gz"


# 1. Download the model from hf.co/models with `git clone`.

# In[40]:


get_ipython().system('git lfs install')
get_ipython().system('git clone https://huggingface.co/$repository')


# 2. Copy `inference.py` into the `code/` directory of the model directory.

# In[41]:


get_ipython().system('cp -r code/ $model_id/code/')


# 3. Create a `model.tar.gz` archive with all the model artifacts and the `inference.py` script.

# In[42]:


get_ipython().run_line_magic('cd', '$model_id')
get_ipython().system('tar zcvf model.tar.gz *')


# 4. Upload the `model.tar.gz` to Amazon S3:

# In[43]:


get_ipython().system('aws s3 cp model.tar.gz $s3_location')
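# As an alternative to the AWS CLI, the same upload could be done with the SageMaker SDK's `S3Uploader`. A sketch, assuming the working directory still contains the `model.tar.gz` created in the previous step:
#
# ```python
# from sagemaker.s3 import S3Uploader
#
# # upload model.tar.gz to the same prefix used above; S3Uploader appends the file name
# S3Uploader.upload(
#     local_path="model.tar.gz",
#     desired_s3_uri=s3_location.rsplit("/", 1)[0],
#     sagemaker_session=sess,
# )
# ```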
# ## Create a custom `HuggingFaceModel`
#
# After we have created and uploaded our `model.tar.gz` archive to Amazon S3, we can create a custom `HuggingFaceModel` class. This class will be used to create and deploy our SageMaker endpoint.

# In[44]:


from sagemaker.huggingface.model import HuggingFaceModel

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    model_data=s3_location,       # path to your model and script
    role=role,                    # iam role with permissions to create an Endpoint
    transformers_version="4.26",  # transformers version used
    pytorch_version="1.13",       # pytorch version used
    py_version='py39',            # python version used
)

# deploy the endpoint
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge"
)


# ## Request Inference Endpoint using the `HuggingFacePredictor`
#
# `.deploy()` returns a `HuggingFacePredictor` object which can be used to request inference.

# In[45]:


data = {
    "inputs": "the mesmerizing performances of the leads keep the film grounded and keep the audience riveted .",
}

res = predictor.predict(data=data)
print(res)


# ### Delete model and endpoint
#
# To clean up, we can delete the model and endpoint.

# In[46]:


predictor.delete_model()
predictor.delete_endpoint()
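# As a closing note: because `predict_fn` L2-normalizes the embeddings, the cosine similarity of two documents is simply the dot product of their vectors. A minimal sketch with two hypothetical example sentences (this has to run while the endpoint still exists, i.e. before the cleanup cells above):
#
# ```python
# import numpy as np
#
# res_a = predictor.predict(data={"inputs": "A man is eating food."})
# res_b = predictor.predict(data={"inputs": "A man is eating a piece of bread."})
#
# # vectors are unit length, so the dot product equals cosine similarity
# print(float(np.dot(res_a["vectors"], res_b["vectors"])))
# ```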