Building on the model trained in the previous post, we will now deploy it as an endpoint that applications can invoke.
SageMaker Endpoints is the key service for putting machine learning models into production, providing high availability, auto scaling, and low-latency inference.
For more details, see the official SageMaker Endpoints documentation.
SageMaker Endpoints is a fully managed model inference service from AWS: it lets you deploy a trained machine learning model behind an HTTPS API that serves real-time inference requests.
import sagemaker
from sagemaker.model import Model

# Create a SageMaker session
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()

# Define the model
model = Model(
    image_uri='your-training-image-uri',
    model_data='s3://your-bucket/model-artifacts/model.tar.gz',
    role=role,
    sagemaker_session=sagemaker_session
)
from datetime import datetime

# Deploy the model to a real-time endpoint.
# Note: the SDK's Model class has no create_endpoint_config() method;
# model.deploy() creates the endpoint configuration and the endpoint in one call.
endpoint_name = f'my-model-endpoint-{datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}'
predictor = model.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type='ml.m5.large',
    wait=True
)
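Once deploy() returns, you can confirm the endpoint is actually in service with the low-level boto3 client. A minimal check, reusing the endpoint_name defined above:

import boto3

sm_client = boto3.client('sagemaker')

# DescribeEndpoint reports the lifecycle state: Creating, InService, Failed, etc.
status = sm_client.describe_endpoint(EndpointName=endpoint_name)['EndpointStatus']
print(f'Endpoint status: {status}')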
Step 1: Prepare a pre-trained model
import boto3
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

# Basic settings
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Use a Hugging Face pre-trained model
huggingface_model = HuggingFaceModel(
    model_data='s3://your-bucket/model.tar.gz',  # or load from the Hugging Face Hub
    role=role,
    transformers_version='4.21',
    pytorch_version='1.12',
    py_version='py39'
)
Step 2: Configure endpoint parameters
# Endpoint configuration
endpoint_config = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m5.large',
    'endpoint_name': 'text-classification-endpoint'
}

# Deploy the model
predictor = huggingface_model.deploy(**endpoint_config)
Step 3: Test inference
# Prepare test data
test_data = {
    "inputs": "This movie is absolutely fantastic! I loved every minute of it."
}

# Run inference
result = predictor.predict(test_data)
print(f"Prediction: {result}")
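predictor.predict() is convenient inside a notebook, but an application without the SageMaker SDK would call the same endpoint through the runtime API instead. A minimal sketch using boto3, assuming the endpoint name from Step 2:

import json
import boto3

runtime = boto3.client('sagemaker-runtime')

# InvokeEndpoint is the same HTTPS API the SDK uses under the hood
response = runtime.invoke_endpoint(
    EndpointName='text-classification-endpoint',
    ContentType='application/json',
    Body=json.dumps(test_data)
)
print(json.loads(response['Body'].read()))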
Multi-model endpoints
from sagemaker.multidatamodel import MultiDataModel

# Create a multi-model endpoint
multi_model = MultiDataModel(
    name='multi-model-endpoint',
    model_data_prefix='s3://your-bucket/multi-models/',
    image_uri='your-inference-image',
    role=role
)

# Deploy the multi-model endpoint
multi_predictor = multi_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large'
)

# Run inference against a specific model
result = multi_predictor.predict(
    data=test_data,
    target_model='model-v1.tar.gz'
)
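A multi-model endpoint loads models lazily from the S3 prefix, so new models can be added without redeploying anything. A short sketch, assuming a hypothetical local artifact model-v2.tar.gz:

# Upload a new artifact under the shared prefix; it becomes invokable
# via target_model immediately, with no endpoint update required.
multi_model.add_model(
    model_data_source='model-v2.tar.gz',  # hypothetical local artifact
    model_data_path='model-v2.tar.gz'
)

# List the models currently available behind the endpoint
for model_path in multi_model.list_models():
    print(model_path)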
Auto scaling the endpoint
import boto3

# Application Auto Scaling client
autoscaling_client = boto3.client('application-autoscaling', region_name=region)

# Register the endpoint variant as a scalable target
response = autoscaling_client.register_scalable_target(
    ServiceNamespace='sagemaker',
    ResourceId=f'endpoint/{endpoint_name}/variant/AllTraffic',
    ScalableDimension='sagemaker:variant:DesiredInstanceCount',
    MinCapacity=1,
    MaxCapacity=5
)

# Create a target-tracking scaling policy
scaling_policy = autoscaling_client.put_scaling_policy(
    PolicyName='SageMakerEndpointInvocationScalingPolicy',
    ServiceNamespace='sagemaker',
    ResourceId=f'endpoint/{endpoint_name}/variant/AllTraffic',
    ScalableDimension='sagemaker:variant:DesiredInstanceCount',
    PolicyType='TargetTrackingScaling',
    TargetTrackingScalingPolicyConfiguration={
        'TargetValue': 70.0,
        'PredefinedMetricSpecification': {
            'PredefinedMetricType': 'SageMakerVariantInvocationsPerInstance'
        },
        'ScaleOutCooldown': 300,
        'ScaleInCooldown': 300
    }
)
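To confirm the policy took effect, you can query Application Auto Scaling for the registered target and its attached policies:

# Inspect the scalable target and the policies attached to it
targets = autoscaling_client.describe_scalable_targets(
    ServiceNamespace='sagemaker',
    ResourceIds=[f'endpoint/{endpoint_name}/variant/AllTraffic']
)
policies = autoscaling_client.describe_scaling_policies(
    ServiceNamespace='sagemaker',
    ResourceId=f'endpoint/{endpoint_name}/variant/AllTraffic'
)
print(targets['ScalableTargets'])
print([p['PolicyName'] for p in policies['ScalingPolicies']])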
Here we set up CloudWatch to track endpoint performance.
import boto3

cloudwatch = boto3.client('cloudwatch')

# Publish a custom metric.
# response_time should hold a latency you measured on the client side;
# the value below is just a placeholder.
response_time = 120.0

cloudwatch.put_metric_data(
    Namespace='SageMaker/Endpoints',
    MetricData=[
        {
            'MetricName': 'InvocationLatency',
            'Dimensions': [
                {
                    'Name': 'EndpointName',
                    'Value': endpoint_name
                },
            ],
            'Value': response_time,
            'Unit': 'Milliseconds'
        },
    ]
)
# Set up a CloudWatch alarm on the built-in ModelLatency metric.
# ModelLatency is reported in microseconds, so 1 second = 1,000,000.
cloudwatch.put_metric_alarm(
    AlarmName=f'{endpoint_name}-HighLatency',
    ComparisonOperator='GreaterThanThreshold',
    EvaluationPeriods=2,
    MetricName='ModelLatency',
    Namespace='AWS/SageMaker',
    Period=300,
    Statistic='Average',
    Threshold=1000000.0,
    ActionsEnabled=True,
    AlarmActions=[
        'arn:aws:sns:region:account:topic-name'
    ],
    AlarmDescription='Alert when endpoint latency exceeds 1 second',
    Dimensions=[
        {
            'Name': 'EndpointName',
            'Value': endpoint_name
        },
        {
            'Name': 'VariantName',  # built-in endpoint metrics are per variant
            'Value': 'AllTraffic'
        },
    ]
)
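You can also pull the same metric back to sanity-check the alarm against recent traffic. A minimal sketch over the last hour:

from datetime import datetime, timedelta

# Fetch average ModelLatency for the past hour (values are in microseconds)
stats = cloudwatch.get_metric_statistics(
    Namespace='AWS/SageMaker',
    MetricName='ModelLatency',
    Dimensions=[
        {'Name': 'EndpointName', 'Value': endpoint_name},
        {'Name': 'VariantName', 'Value': 'AllTraffic'},
    ],
    StartTime=datetime.utcnow() - timedelta(hours=1),
    EndTime=datetime.utcnow(),
    Period=300,
    Statistics=['Average']
)
print(stats['Datapoints'])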
Choose an instance type based on model size:
instance_types = {
    'small_model': 'ml.t2.medium',
    'medium_model': 'ml.m5.large',
    'large_model': 'ml.c5.2xlarge',
    'gpu_model': 'ml.g4dn.xlarge'
}
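For example, you can pick the tier that matches your model and pass it straight to deploy():

# Deploy using the tier that matches the model's resource needs
predictor = model.deploy(
    initial_instance_count=1,
    instance_type=instance_types['medium_model']
)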
Elastic Inference
Note that AWS has since deprecated Elastic Inference; the example below is kept for reference.
from sagemaker.model import Model

# Attach an Elastic Inference accelerator
model_with_ei = Model(
    image_uri=image_uri,
    model_data=model_data,
    role=role
)

predictor = model_with_ei.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    accelerator_type='ml.eia2.medium'  # Elastic Inference accelerator
)
Batch inference vs. real-time inference
# For large offline workloads, use batch transform instead of a live endpoint
transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m5.large',
    output_path='s3://your-bucket/batch-output/'
)

# Run batch inference
transformer.transform(
    data='s3://your-bucket/batch-input/',
    content_type='application/json'
)
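transform() starts the job asynchronously; to block until it finishes and see where the results landed, a minimal follow-up might look like this:

# Block until the batch transform job completes
transformer.wait()

# Results are written as .out files under the configured output path
print(f'Output written to: {transformer.output_path}')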
Deploying endpoints in a VPC
# Deploy the endpoint inside a VPC.
# Note: vpc_config is set on the Model itself, not passed to deploy().
vpc_config = {
    'SecurityGroupIds': ['sg-12345678'],  # your security group ID
    'Subnets': ['subnet-12345678', 'subnet-87654321']
}

vpc_model = Model(
    image_uri=image_uri,
    model_data=model_data,
    role=role,
    vpc_config=vpc_config
)

predictor = vpc_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large'
)
Encryption settings
from sagemaker.model import Model

# Enable encryption for the endpoint.
# Note: encrypt_inter_container_traffic applies to training jobs, not to
# Model; for a real-time endpoint, pass a KMS key to deploy() to encrypt
# the attached ML storage volume (traffic to the endpoint uses TLS).
encrypted_model = Model(
    image_uri=image_uri,
    model_data=model_data,
    role=role
)

predictor = encrypted_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    kms_key='your-kms-key-id'  # KMS key for volume encryption
)
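Finally, endpoints bill per instance-hour while running, so tear down anything you no longer need. A minimal cleanup, assuming the predictor from above:

# Delete the endpoint (and its configuration), then the model, to stop charges
predictor.delete_endpoint()
predictor.delete_model()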