2022-09-09 14:13:09 -07:00
< ? php
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
/**
* Purpose
* Shows how to use the AWS SDK for PHP with AWS Glue to:
* 1. Create and run a crawler that crawls a public Amazon Simple Storage
* Service (Amazon S3) bucket and generates a metadata database that describes the
* CSV-formatted data it finds.
* 2. List information about databases and tables in your AWS Glue Data Catalog.
* 3. Create and run a job that extracts CSV data from the source S3 bucket,
* transforms it by removing and renaming fields, and loads JSON-formatted output into
* another Amazon S3 bucket.
* 4. List information about job runs and view some of the transformed data.
* 5. Delete all resources created by the example.
**/
# snippet-start:[php.example_code.glue.basics.scenario]
namespace Glue ;
use Aws\Glue\GlueClient ;
use Aws\S3\S3Client ;
use AwsUtilities\AWSServiceClass ;
use GuzzleHttp\Psr7\Stream ;
2023-02-14 16:31:03 -07:00
use Iam\IAMService ;
2022-09-09 14:13:09 -07:00
/**
 * End-to-end AWS Glue "getting started" scenario.
 *
 * Crawls a public S3 data set, inspects the resulting Data Catalog entries,
 * runs an ETL job against them, prints part of the job output, and finally
 * deletes every resource the scenario created.
 */
class GettingStartedWithGlue
{
    /**
     * Runs the whole scenario, echoing progress to standard output.
     *
     * Every resource name is suffixed with a uniqid() value so repeated runs do
     * not collide. The IAM role "AWSGlueServiceRole-DocExample" is fetched with
     * IAMService::getRole(); it is not created here, so it presumably has to
     * exist before the scenario starts (confirm against IAMService).
     *
     * Exceptions thrown by the AWS clients or the service wrappers are not
     * caught here and propagate to the caller.
     *
     * @return void
     */
    public function run()
    {
        echo(" \n ");
        echo(" -------------------------------------- \n ");
        print(" Welcome to the AWS Glue getting started demo using PHP! \n ");
        echo(" -------------------------------------- \n ");

        $clientArgs = [
            'region' => 'us-west-2',
            'version' => 'latest',
            'profile' => 'default',
        ];
        $uniqid = uniqid();

        $glueClient = new GlueClient($clientArgs);
        $glueService = new GlueService($glueClient);
        $iamService = new IAMService();

        #snippet-start:[php.example_code.glue.basics.crawlerName]
        // Resource names must not carry stray whitespace; they are used as identifiers.
        $crawlerName = "example-crawler-test-" . $uniqid;
        #snippet-end:[php.example_code.glue.basics.crawlerName]

        // NOTE(review): presumably polling settings read by the shared service
        // classes; their exact meaning is defined in AWSServiceClass.
        AWSServiceClass::$waitTime = 5;
        AWSServiceClass::$maxWaitAttempts = 20;

        #snippet-start:[php.example_code.glue.basics.getRole]
        $role = $iamService->getRole("AWSGlueServiceRole-DocExample");
        #snippet-end:[php.example_code.glue.basics.getRole]

        #snippet-start:[php.example_code.glue.basics.databaseName]
        $databaseName = "doc-example-database-$uniqid";
        #snippet-end:[php.example_code.glue.basics.databaseName]

        #snippet-start:[php.example_code.glue.basics.createCrawler]
        $path = 's3://crawler-public-us-east-1/flight/2016/csv';
        $glueService->createCrawler($crawlerName, $role['Role']['Arn'], $databaseName, $path);
        #snippet-end:[php.example_code.glue.basics.createCrawler]

        #snippet-start:[php.example_code.glue.basics.startCrawler]
        $glueService->startCrawler($crawlerName);
        #snippet-end:[php.example_code.glue.basics.startCrawler]

        #snippet-start:[php.example_code.glue.basics.getCrawler]
        echo " Waiting for crawler ";
        do {
            $crawler = $glueService->getCrawler($crawlerName);
            echo " . ";
            sleep(10);
            // Compare against the exact state name; a padded literal would never match.
        } while ($crawler['Crawler']['State'] !== "READY");
        echo " \n ";
        #snippet-end:[php.example_code.glue.basics.getCrawler]

        #snippet-start:[php.example_code.glue.basics.getDatabase]
        $database = $glueService->getDatabase($databaseName);
        echo " Found a database named " . $database['Database']['Name'] . " \n ";
        #snippet-end:[php.example_code.glue.basics.getDatabase]

        //Upload job script
        $s3client = new S3Client($clientArgs);
        $bucketName = "test-glue-bucket-" . $uniqid;
        // Keys uploaded by this scenario; used later to tell them apart from job output.
        $uploadedKeys = ['run_job.py', 'setup_scenario_getting_started.yaml'];
        $s3client->createBucket([
            'Bucket' => $bucketName,
            // Keep the bucket in the same region the clients are configured for.
            'CreateBucketConfiguration' => ['LocationConstraint' => $clientArgs['region']],
        ]);
        $s3client->putObject([
            'Bucket' => $bucketName,
            'Key' => 'run_job.py',
            'SourceFile' => __DIR__ . '/flight_etl_job_script.py'
        ]);
        $s3client->putObject([
            'Bucket' => $bucketName,
            'Key' => 'setup_scenario_getting_started.yaml',
            'SourceFile' => __DIR__ . '/setup_scenario_getting_started.yaml'
        ]);

        #snippet-start:[php.example_code.glue.basics.getTables]
        $tables = $glueService->getTables($databaseName);
        #snippet-end:[php.example_code.glue.basics.getTables]

        #snippet-start:[php.example_code.glue.basics.jobName]
        $jobName = 'test-job-' . $uniqid;
        #snippet-end:[php.example_code.glue.basics.jobName]

        #snippet-start:[php.example_code.glue.basics.createJob]
        $scriptLocation = "s3://$bucketName/run_job.py";
        $job = $glueService->createJob($jobName, $role['Role']['Arn'], $scriptLocation);
        #snippet-end:[php.example_code.glue.basics.createJob]

        #snippet-start:[php.example_code.glue.basics.startJobRun]
        $outputBucketUrl = "s3://$bucketName";
        $runId = $glueService->startJobRun($jobName, $databaseName, $tables, $outputBucketUrl)['JobRunId'];
        #snippet-end:[php.example_code.glue.basics.startJobRun]

        #snippet-start:[php.example_code.glue.basics.getJobRun]
        // ERROR and EXPIRED are terminal as well; leaving them out would make
        // the loop below poll forever when a run ends in one of those states.
        $finishedStates = ['SUCCEEDED', 'STOPPED', 'FAILED', 'TIMEOUT', 'ERROR', 'EXPIRED'];
        echo " waiting for job ";
        do {
            $jobRun = $glueService->getJobRun($jobName, $runId);
            echo " . ";
            sleep(10);
        } while (!in_array($jobRun['JobRun']['JobRunState'], $finishedStates, true));
        echo " \n ";
        #snippet-end:[php.example_code.glue.basics.getJobRun]

        #snippet-start:[php.example_code.glue.basics.getJobRuns]
        $jobRuns = $glueService->getJobRuns($jobName);
        #snippet-end:[php.example_code.glue.basics.getJobRuns]

        // 'Contents' is absent when the listing is empty, so default to an empty list.
        $objects = $s3client->listObjects([
            'Bucket' => $bucketName,
        ])['Contents'] ?? [];
        foreach ($objects as $object) {
            echo $object['Key'] . " \n ";
        }

        // Pick a key written by the job instead of relying on a fixed listing
        // position, which may not exist or may be one of the uploaded files.
        $outputKeys = array_values(array_diff(array_column($objects, 'Key'), $uploadedKeys));
        if ($outputKeys === []) {
            echo "No job output was found in the bucket.\n";
        } else {
            $outputKey = $outputKeys[0];
            echo " Downloading " . $outputKey . " \n ";
            /** @var Stream $body */
            $body = $s3client->getObject([
                'Bucket' => $bucketName,
                'Key' => $outputKey,
            ])['Body'];
            $downloadObject = $body->getContents();
            echo " Here is the first 1000 characters in the object. ";
            echo substr($downloadObject, 0, 1000);
        }

        #snippet-start:[php.example_code.glue.basics.listJobs]
        $jobs = $glueService->listJobs();
        echo " Current jobs: \n ";
        foreach ($jobs['JobNames'] as $jobsName) {
            echo " { $jobsName } \n ";
        }
        #snippet-end:[php.example_code.glue.basics.listJobs]

        #snippet-start:[php.example_code.glue.basics.deleteJob]
        echo " Delete the job. \n ";
        $glueClient->deleteJob([
            'JobName' => $job['Name'],
        ]);
        #snippet-end:[php.example_code.glue.basics.deleteJob]

        #snippet-start:[php.example_code.glue.basics.deleteTable]
        echo " Delete the tables. \n ";
        foreach ($tables['TableList'] as $table) {
            $glueService->deleteTable($table['Name'], $databaseName);
        }
        #snippet-end:[php.example_code.glue.basics.deleteTable]

        #snippet-start:[php.example_code.glue.basics.deleteDatabase]
        echo " Delete the databases. \n ";
        $glueClient->deleteDatabase([
            'Name' => $databaseName,
        ]);
        #snippet-end:[php.example_code.glue.basics.deleteDatabase]

        #snippet-start:[php.example_code.glue.basics.deleteCrawler]
        echo " Delete the crawler. \n ";
        $glueClient->deleteCrawler([
            'Name' => $crawlerName,
        ]);
        #snippet-end:[php.example_code.glue.basics.deleteCrawler]

        $remaining = $s3client->listObjectsV2([
            'Bucket' => $bucketName,
        ]);
        // Send only the keys: listing entries carry extra fields (ETag, Size, ...)
        // that are not meant to be passed back as delete identifiers.
        $objectsToDelete = array_map(
            static function (array $object): array {
                return ['Key' => $object['Key']];
            },
            $remaining['Contents'] ?? []
        );
        echo " Delete all objects in the bucket. \n ";
        // A delete request with no objects is rejected, so skip it for an empty bucket.
        if ($objectsToDelete !== []) {
            $s3client->deleteObjects([
                'Bucket' => $bucketName,
                'Delete' => [
                    'Objects' => $objectsToDelete,
                ]
            ]);
        }

        echo " Delete the bucket. \n ";
        $s3client->deleteBucket(['Bucket' => $bucketName]);

        echo " This job was brought to you by the number $uniqid\n ";
    }
}
# snippet-end:[php.example_code.glue.basics.scenario]