Skip to content

Commit

Permalink
Adding Metrics endpoint to cfn templates (#670)
Browse files Browse the repository at this point in the history
Co-authored-by: Geeta Chauhan <[email protected]>
  • Loading branch information
maaquib and chauhang authored Oct 26, 2020
1 parent 7e173fa commit 56e423a
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 4 deletions.
21 changes: 21 additions & 0 deletions cloudformation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ aws cloudformation create-stack \
"cougar": 0.0022544863168150187
}
]
> curl --insecure "<TorchServeMetricsURL>/metrics"
# HELP ts_queue_latency_microseconds Cumulative queue duration in microseconds
# TYPE ts_queue_latency_microseconds counter
ts_queue_latency_microseconds{uuid="e275b494-3d54-45bd-a640-abca741a070b",model_name="squeezenet1_1",model_version="default",} 364.07800000000003
# HELP ts_inference_latency_microseconds Cumulative inference duration in microseconds
# TYPE ts_inference_latency_microseconds counter
ts_inference_latency_microseconds{uuid="e275b494-3d54-45bd-a640-abca741a070b",model_name="squeezenet1_1",model_version="default",} 128010.02100000001
# HELP ts_inference_requests_total Total number of inference requests.
# TYPE ts_inference_requests_total counter
ts_inference_requests_total{uuid="e275b494-3d54-45bd-a640-abca741a070b",model_name="squeezenet1_1",model_version="default",} 4.0
```


Expand Down Expand Up @@ -135,6 +146,16 @@ aws cloudformation create-stack \
"cougar": 0.0022544863168150187
}
]
> curl "<TorchServeMetricsURL>/metrics"
# HELP ts_queue_latency_microseconds Cumulative queue duration in microseconds
# TYPE ts_queue_latency_microseconds counter
ts_queue_latency_microseconds{uuid="2b3a4b5b-5131-413a-a725-2abcae5d55ab",model_name="squeezenet1_1",model_version="default",} 932.164
# HELP ts_inference_latency_microseconds Cumulative inference duration in microseconds
# TYPE ts_inference_latency_microseconds counter
ts_inference_latency_microseconds{uuid="2b3a4b5b-5131-413a-a725-2abcae5d55ab",model_name="squeezenet1_1",model_version="default",} 411702.625
# HELP ts_inference_requests_total Total number of inference requests.
# TYPE ts_inference_requests_total counter
ts_inference_requests_total{uuid="2b3a4b5b-5131-413a-a725-2abcae5d55ab",model_name="squeezenet1_1",model_version="default",} 9.0
```

## CloudWatch Logging
Expand Down
51 changes: 49 additions & 2 deletions cloudformation/ec2-asg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -209,14 +209,26 @@ Resources:
PortRange:
From: '8080'
To: '8080'
InboundSSHPublicNetworkAclEntry:
# Inbound rule 102 on PublicNetworkAcl: allows TCP (protocol number 6) traffic
# to port 8082 from any IPv4 source (0.0.0.0/0). Port 8082 is the port this
# template assigns to TorchServe's metrics_address.
InboundMetricsPublicNetworkAclEntry:
Type: AWS::EC2::NetworkAclEntry
Properties:
NetworkAclId: !Ref 'PublicNetworkAcl'
RuleNumber: '102'
Protocol: '6'
RuleAction: allow
# Egress 'false' makes this an ingress (inbound) entry.
Egress: 'false'
# NOTE(review): open to the whole internet, unlike the SSH entry, which is
# restricted to SSHLocation — confirm public metrics access is intended.
CidrBlock: '0.0.0.0/0'
PortRange:
From: '8082'
To: '8082'
InboundSSHPublicNetworkAclEntry:
Type: AWS::EC2::NetworkAclEntry
Properties:
NetworkAclId: !Ref 'PublicNetworkAcl'
RuleNumber: '103'
Protocol: '6'
RuleAction: allow
Egress: 'false'
CidrBlock: !Ref 'SSHLocation'
PortRange:
From: '22'
Expand All @@ -225,7 +237,7 @@ Resources:
Type: AWS::EC2::NetworkAclEntry
Properties:
NetworkAclId: !Ref 'PublicNetworkAcl'
RuleNumber: '103'
RuleNumber: '104'
Protocol: '6'
RuleAction: allow
Egress: 'false'
Expand Down Expand Up @@ -269,6 +281,10 @@ Resources:
FromPort: '8081'
ToPort: '8081'
CidrIp: !Ref 'ManagementLocation'
- IpProtocol: tcp
FromPort: '8082'
ToPort: '8082'
CidrIp: '0.0.0.0/0'
- IpProtocol: tcp
FromPort: '22'
ToPort: '22'
Expand Down Expand Up @@ -349,6 +365,19 @@ Resources:
Protocol: HTTP
UnhealthyThresholdCount: 5
VpcId: !Ref 'VPC'
# Load balancer target group that routes HTTP traffic to port 8082 on the
# registered targets (the port this template assigns to metrics_address).
ALBMetricsTargetGroup:
Type: AWS::ElasticLoadBalancingV2::TargetGroup
Properties:
HealthCheckIntervalSeconds: 30
HealthCheckTimeoutSeconds: 5
HealthyThresholdCount: 3
HealthCheckPath: /ping
# NOTE(review): the health check probes port 8080 (inference_address in this
# template), not the 8082 traffic port — presumably because /ping is served
# by the inference endpoint; confirm this is intentional.
HealthCheckPort: 8080
HealthCheckProtocol: HTTP
Port: 8082
Protocol: HTTP
UnhealthyThresholdCount: 5
VpcId: !Ref 'VPC'
ALBManagementListener:
Type: AWS::ElasticLoadBalancingV2::Listener
Properties:
Expand All @@ -367,6 +396,15 @@ Resources:
LoadBalancerArn: !Ref 'ApplicationLoadBalancer'
Port: '8080'
Protocol: HTTP
# HTTP listener on port 8082 of ApplicationLoadBalancer; its only (default)
# action forwards every request to ALBMetricsTargetGroup.
ALBMetricsListener:
Type: AWS::ElasticLoadBalancingV2::Listener
Properties:
DefaultActions:
- Type: forward
TargetGroupArn: !Ref 'ALBMetricsTargetGroup'
LoadBalancerArn: !Ref 'ApplicationLoadBalancer'
Port: '8082'
Protocol: HTTP
TorchServeASG:
Type: AWS::AutoScaling::AutoScalingGroup
DependsOn: GatewayToInternet
Expand All @@ -378,6 +416,7 @@ Resources:
TargetGroupARNs:
- !Ref 'ALBInferenceTargetGroup'
- !Ref 'ALBManagementTargetGroup'
- !Ref 'ALBMetricsTargetGroup'
CreationPolicy:
ResourceSignal:
Timeout: PT30M
Expand Down Expand Up @@ -432,6 +471,7 @@ Resources:
content: !Sub |
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
load_models=ALL
model_store=/mnt/efs/model_store
mode: '000400'
Expand Down Expand Up @@ -599,3 +639,10 @@ Outputs:
- - http://
- !GetAtt 'ApplicationLoadBalancer.DNSName'
- :8080
# Stack output: metrics endpoint URL, built by joining "http://", the load
# balancer's DNS name and ":8082".
# NOTE(review): the output key is spelled "Merics" (missing "t"); the
# single-instance template names the equivalent output TorchServeMetricsURL.
# Renaming would change the stack's public output key, so coordinate the fix
# with anything that reads this output.
TorchServeMericsURL:
Description: Metrics URL for newly created TorchServe stack
Value: !Join
- ''
- - http://
- !GetAtt 'ApplicationLoadBalancer.DNSName'
- :8082
28 changes: 26 additions & 2 deletions cloudformation/ec2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -178,14 +178,26 @@ Resources:
PortRange:
From: '8080'
To: '8080'
InboundSSHPublicNetworkAclEntry:
# Inbound rule 102 on PublicNetworkAcl: allows TCP (protocol number 6) traffic
# to port 8082 from any IPv4 source (0.0.0.0/0). Port 8082 is the port this
# template assigns to TorchServe's metrics_address.
InboundMetricsPublicNetworkAclEntry:
Type: AWS::EC2::NetworkAclEntry
Properties:
NetworkAclId: !Ref 'PublicNetworkAcl'
RuleNumber: '102'
Protocol: '6'
RuleAction: allow
# Egress 'false' makes this an ingress (inbound) entry.
Egress: 'false'
# NOTE(review): open to the whole internet, unlike the SSH entry, which is
# restricted to SSHLocation — confirm public metrics access is intended.
CidrBlock: '0.0.0.0/0'
PortRange:
From: '8082'
To: '8082'
InboundSSHPublicNetworkAclEntry:
Type: AWS::EC2::NetworkAclEntry
Properties:
NetworkAclId: !Ref 'PublicNetworkAcl'
RuleNumber: '103'
Protocol: '6'
RuleAction: allow
Egress: 'false'
CidrBlock: !Ref 'SSHLocation'
PortRange:
From: '22'
Expand All @@ -194,7 +206,7 @@ Resources:
Type: AWS::EC2::NetworkAclEntry
Properties:
NetworkAclId: !Ref 'PublicNetworkAcl'
RuleNumber: '103'
RuleNumber: '104'
Protocol: '6'
RuleAction: allow
Egress: 'false'
Expand Down Expand Up @@ -233,6 +245,10 @@ Resources:
FromPort: '8081'
ToPort: '8081'
CidrIp: !Ref 'ManagementLocation'
- IpProtocol: tcp
FromPort: '8082'
ToPort: '8082'
CidrIp: '0.0.0.0/0'
- IpProtocol: tcp
FromPort: '22'
ToPort: '22'
Expand All @@ -253,6 +269,7 @@ Resources:
content: !Sub |
inference_address=https://0.0.0.0:8080
management_address=https://0.0.0.0:8081
metrics_address=https://0.0.0.0:8082
private_key_file=/etc/torchserve/server.key
certificate_file=/etc/torchserve/server.pem
mode: '000400'
Expand Down Expand Up @@ -421,3 +438,10 @@ Outputs:
- - https://
- !GetAtt 'TorchServeInstance.PublicDnsName'
- :8080
# Stack output: metrics endpoint URL, built by joining "https://", the
# instance's public DNS name and ":8082" (the metrics_address port configured
# in this template).
TorchServeMetricsURL:
Description: Metrics URL for newly created TorchServe stack
Value: !Join
- ''
- - https://
- !GetAtt 'TorchServeInstance.PublicDnsName'
- :8082

0 comments on commit 56e423a

Please sign in to comment.