2020-08-07 17:39:33 -07:00
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Purpose
Shows how to copy a shell script to Amazon EMR cluster instances and run them
2021-09-22 16:13:37 -07:00
to install additional libraries on the instances. This can be used to automate
2020-08-20 16:10:21 -07:00
instance management and is an alternative to connecting through SSH to run the
2020-08-07 17:39:33 -07:00
script manually.
"""
# snippet-start:[emr.python.jupyterhub.installlibraries]
import argparse
import posixpath
import shlex
import time

import boto3
2023-10-18 10:35:05 -07:00
def install_libraries_on_core_nodes(
    cluster_id, script_path, emr_client, ssm_client, poll_seconds=10
):
    """
    Copies and runs a shell script on the core nodes in the cluster.

    The script is copied to /home/hadoop on each core node and then run with
    bash. Each command must succeed on the instances before the next one is sent.

    :param cluster_id: The ID of the cluster.
    :param script_path: The path to the script, typically an Amazon S3 object URL.
                        The final path component is used as the script file name
                        on the instances.
    :param emr_client: The Boto3 Amazon EMR client.
    :param ssm_client: The Boto3 AWS Systems Manager client.
    :param poll_seconds: The number of seconds to wait between status checks
                         while a command is pending or in progress.
    :raises ValueError: If script_path does not end with a file name.
    :raises RuntimeError: If the cluster has no core instances or a command
                          ends in any status other than success.
    """
    # The run step must target the file that the copy step actually delivers,
    # so the file name is derived from the path instead of being hard-coded.
    script_name = posixpath.basename(script_path)
    if not script_name:
        raise ValueError(
            f"Script path '{script_path}' must end with a script file name."
        )

    core_instance_ids = _list_core_instance_ids(cluster_id, emr_client)
    print(f"Found core instances: {core_instance_ids}.")
    if not core_instance_ids:
        # Fail with a clear message instead of an API validation error.
        raise RuntimeError(f"Cluster {cluster_id} has no core instances.")

    commands = [
        # Copy the shell script from Amazon S3 to each node instance.
        # Quoting keeps paths that contain spaces or shell metacharacters intact.
        f"aws s3 cp {shlex.quote(script_path)} /home/hadoop",
        # Run the shell script to install libraries on each node instance.
        f"bash {shlex.quote('/home/hadoop/' + script_name)}",
    ]
    for command in commands:
        print(f"Sending '{command}' to core instances...")
        command_id = ssm_client.send_command(
            InstanceIds=core_instance_ids,
            DocumentName="AWS-RunShellScript",
            Parameters={"commands": [command]},
            TimeoutSeconds=3600,
        )["Command"]["CommandId"]
        # Verify the previous step succeeded before running the next step.
        _wait_for_command(command_id, command, ssm_client, poll_seconds)


def _list_core_instance_ids(cluster_id, emr_client):
    """Returns the EC2 instance IDs of every core node, following pagination."""
    request = {"ClusterId": cluster_id, "InstanceGroupTypes": ["CORE"]}
    instance_ids = []
    while True:
        page = emr_client.list_instances(**request)
        instance_ids.extend(node["Ec2InstanceId"] for node in page["Instances"])
        marker = page.get("Marker")
        if not marker:
            return instance_ids
        # A marker in the response means more instances remain to be listed.
        request["Marker"] = marker


def _wait_for_command(command_id, command, ssm_client, poll_seconds):
    """Polls a Systems Manager command until it succeeds; raises if it fails."""
    # "Delayed" means delivery is being retried, so it is not a final state.
    waiting_statuses = ("Pending", "InProgress", "Delayed")
    while True:
        cmd_result = ssm_client.list_commands(CommandId=command_id)["Commands"][0]
        status = cmd_result["StatusDetails"]
        if status == "Success":
            print("Command succeeded.")
            return
        if status not in waiting_statuses:
            print(f"Command status is {status}, quitting.")
            raise RuntimeError(
                f"Command {command} failed to run. Details: {status}"
            )
        print(f"Command status is {status}, waiting...")
        time.sleep(poll_seconds)
2020-08-07 17:39:33 -07:00
def main():
    """
    Reads the cluster ID and script path from the command line, creates the
    Amazon EMR and AWS Systems Manager clients, and installs the libraries on
    the core nodes of the cluster.
    """
    arg_parser = argparse.ArgumentParser()
    positional_args = (
        ("cluster_id", "The ID of the cluster."),
        ("script_path", "The path to the script in Amazon S3."),
    )
    for arg_name, arg_help in positional_args:
        arg_parser.add_argument(arg_name, help=arg_help)
    options = arg_parser.parse_args()

    # Clients are created only after the arguments parse successfully.
    install_libraries_on_core_nodes(
        options.cluster_id,
        options.script_path,
        boto3.client("emr"),
        boto3.client("ssm"),
    )
2020-08-07 17:39:33 -07:00
2023-10-18 10:35:05 -07:00
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# snippet-end:[emr.python.jupyterhub.installlibraries]