文档 AWS SDK 示例 GitHub 存储库中还有更多 S AWS DK 示例
本文属于机器翻译版本。若本译文内容与英语原文存在差异,则一律以英文原文为准。
使用 Amazon Textract 的示例 AWS CLI 使用 Bash 脚本
以下代码示例向您展示了如何在 Amazon Textract 中使用 with Bash 脚本来执行操作和实现常见场景。 AWS Command Line Interface
场景是向您演示如何通过在一个服务中调用多个函数或与其他 AWS 服务结合来完成特定任务的代码示例。
每个示例都包含一个指向完整源代码的链接,您可以从中找到有关如何在上下文中设置和运行代码的说明。
主题
场景
以下代码示例展示了如何:
创建 S3 存储桶
将文档上传到 S3
清理资源
- AWS CLI 使用 Bash 脚本
-
注意
还有更多相关信息 GitHub。在 Sample developer tutorials
存储库中查找完整示例,了解如何进行设置和运行。 #!/bin/bash # Amazon Textract Getting Started Tutorial Script # This script demonstrates how to use Amazon Textract to analyze document text set -euo pipefail # Set up logging with restricted permissions LOG_FILE="textract-tutorial.log" touch "$LOG_FILE" chmod 600 "$LOG_FILE" exec > >(tee -a "$LOG_FILE") 2>&1 echo "===================================================" echo "Amazon Textract Getting Started Tutorial" echo "===================================================" echo "This script will guide you through using Amazon Textract to analyze document text." echo "" # Function to check for errors in command output and exit code check_error() { local exit_code=$1 local output=$2 local cmd=$3 if [ $exit_code -ne 0 ] || echo "$output" | grep -i "error" > /dev/null; then echo "ERROR: Command failed: $cmd" echo "$output" | sed 's/\(aws_secret_access_key\|Authorization\|X-Amz-Security-Token\).*/\1=***REDACTED***/g' cleanup_on_error exit 1 fi } # Function to clean up resources on error cleanup_on_error() { echo "Error encountered. Cleaning up resources..." # Clean up temporary JSON files if [ -f "document.json" ]; then rm -f document.json fi if [ -f "features.json" ]; then rm -f features.json fi if [ -n "${DOCUMENT_NAME:-}" ] && [ -n "${BUCKET_NAME:-}" ]; then echo "Deleting document from S3..." aws s3 rm "s3://${BUCKET_NAME}/${DOCUMENT_NAME}" || echo "Failed to delete document" fi if [ -n "${BUCKET_NAME:-}" ] && [ "${BUCKET_IS_SHARED:-false}" = "false" ]; then echo "Deleting S3 bucket..." aws s3 rb "s3://${BUCKET_NAME}" --force || echo "Failed to delete bucket" fi } # Set up trap for cleanup on exit trap cleanup_on_error EXIT # Verify AWS CLI is installed and configured echo "Verifying AWS CLI configuration..." if ! command -v aws &> /dev/null; then echo "ERROR: AWS CLI is not installed." exit 1 fi AWS_CONFIG_OUTPUT=$(aws configure list 2>&1) AWS_CONFIG_STATUS=$? if [ $AWS_CONFIG_STATUS -ne 0 ]; then echo "ERROR: AWS CLI is not properly configured." echo "$AWS_CONFIG_OUTPUT" | sed 's/\(aws_secret_access_key\|Authorization\).*/\1=***REDACTED***/g' exit 1 fi # Verify AWS region is configured and supports Textract AWS_REGION=$(aws configure get region) if [ -z "$AWS_REGION" ]; then echo "ERROR: No AWS region configured. Please run 'aws configure' to set a default region." exit 1 fi # Check if Textract is available in the configured region echo "Checking if Amazon Textract is available in region $AWS_REGION..." TEXTRACT_CHECK=$(aws textract help 2>&1) TEXTRACT_CHECK_STATUS=$? if [ $TEXTRACT_CHECK_STATUS -ne 0 ]; then echo "ERROR: Amazon Textract may not be available in region $AWS_REGION." exit 1 fi # Generate a random identifier for S3 bucket RANDOM_ID=$(openssl rand -hex 6) # Check for shared prereq bucket PREREQ_BUCKET=$(aws cloudformation describe-stacks --stack-name tutorial-prereqs-bucket \ --query 'Stacks[0].Outputs[?OutputKey==`BucketName`].OutputValue' --output text 2>/dev/null || echo "") if [ -n "$PREREQ_BUCKET" ] && [ "$PREREQ_BUCKET" != "None" ]; then BUCKET_NAME="$PREREQ_BUCKET" BUCKET_IS_SHARED=true echo "Using shared bucket: $BUCKET_NAME" else BUCKET_IS_SHARED=false BUCKET_NAME="textract-${RANDOM_ID}" fi DOCUMENT_NAME="document.png" RESOURCES_CREATED=() # Step 1: Create S3 bucket if [ "$BUCKET_IS_SHARED" = false ]; then echo "Creating S3 bucket: $BUCKET_NAME" CREATE_BUCKET_OUTPUT=$(aws s3 mb "s3://$BUCKET_NAME" --region "$AWS_REGION" 2>&1) CREATE_BUCKET_STATUS=$? echo "$CREATE_BUCKET_OUTPUT" check_error $CREATE_BUCKET_STATUS "$CREATE_BUCKET_OUTPUT" "aws s3 mb s3://$BUCKET_NAME" aws s3api put-bucket-tagging \ --bucket "$BUCKET_NAME" \ --tagging 'TagSet=[{Key=project,Value=doc-smith},{Key=tutorial,Value=amazon-textract-gs}]' # Apply security settings to bucket aws s3api put-bucket-versioning --bucket "$BUCKET_NAME" --versioning-configuration Status=Enabled 2>&1 || true aws s3api put-bucket-encryption --bucket "$BUCKET_NAME" --server-side-encryption-configuration '{"Rules": [{"ApplyServerSideEncryptionByDefault": {"SSEAlgorithm": "AES256"}}]}' 2>&1 || true aws s3api put-bucket-acl --bucket "$BUCKET_NAME" --acl private 2>&1 || true RESOURCES_CREATED+=("S3 Bucket: $BUCKET_NAME") fi # Step 2: Check if sample document exists, if not create a simple one if [ ! -f "$DOCUMENT_NAME" ]; then echo "Sample document not found. Generating a sample document..." # Create a simple PNG document using ImageMagick or convert if command -v convert &> /dev/null; then convert -size 400x300 xc:white -pointsize 20 -fill black -draw "text 50,50 'Sample Document'" "$DOCUMENT_NAME" chmod 600 "$DOCUMENT_NAME" echo "Generated sample document: $DOCUMENT_NAME" else # Fallback: create a minimal valid PNG using base64 echo "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" | base64 -d > "$DOCUMENT_NAME" chmod 600 "$DOCUMENT_NAME" echo "Created minimal sample document: $DOCUMENT_NAME" fi fi # Step 3: Upload document to S3 echo "Uploading document to S3..." UPLOAD_OUTPUT=$(aws s3 cp "./$DOCUMENT_NAME" "s3://$BUCKET_NAME/" --sse AES256 2>&1) UPLOAD_STATUS=$? echo "$UPLOAD_OUTPUT" check_error $UPLOAD_STATUS "$UPLOAD_OUTPUT" "aws s3 cp ./$DOCUMENT_NAME s3://$BUCKET_NAME/" RESOURCES_CREATED+=("S3 Object: s3://$BUCKET_NAME/$DOCUMENT_NAME") # Step 4: Analyze document with Amazon Textract echo "Analyzing document with Amazon Textract..." echo "This may take a few seconds..." # Create a JSON file for the document parameter to avoid shell escaping issues cat > document.json << 'EOF' { "S3Object": { "Bucket": "BUCKET_PLACEHOLDER", "Name": "DOCUMENT_PLACEHOLDER" } } EOF sed -i.bak "s|BUCKET_PLACEHOLDER|$BUCKET_NAME|g; s|DOCUMENT_PLACEHOLDER|$DOCUMENT_NAME|g" document.json rm -f document.json.bak chmod 600 document.json # Create a JSON file for the feature types parameter cat > features.json << 'EOF' ["TABLES","FORMS","SIGNATURES"] EOF chmod 600 features.json ANALYZE_OUTPUT=$(aws textract analyze-document --document file://document.json --feature-types file://features.json 2>&1) ANALYZE_STATUS=$? echo "Analysis complete." if [ $ANALYZE_STATUS -ne 0 ]; then echo "ERROR: Document analysis failed" echo "$ANALYZE_OUTPUT" | sed 's/\(aws_secret_access_key\|Authorization\|Token\).*/\1=***REDACTED***/g' exit 1 fi # Save the analysis results to a file with restricted permissions echo "$ANALYZE_OUTPUT" > textract-analysis-results.json chmod 600 textract-analysis-results.json echo "Analysis results saved to textract-analysis-results.json" RESOURCES_CREATED+=("Local file: textract-analysis-results.json") # Display a summary of the analysis echo "" echo "===================================================" echo "Analysis Summary" echo "===================================================" PAGES=$(echo "$ANALYZE_OUTPUT" | grep -o '"Pages": [0-9]*' | head -1 | awk '{print $2}') echo "Document pages: $PAGES" BLOCKS_COUNT=$(echo "$ANALYZE_OUTPUT" | grep -o '"BlockType":' | wc -l) echo "Total blocks detected: $BLOCKS_COUNT" # Count different block types using jq if available, fallback to grep PAGE_COUNT=$(echo "$ANALYZE_OUTPUT" | grep -o '"BlockType": "PAGE"' | wc -l || echo 0) LINE_COUNT=$(echo "$ANALYZE_OUTPUT" | grep -o '"BlockType": "LINE"' | wc -l || echo 0) WORD_COUNT=$(echo "$ANALYZE_OUTPUT" | grep -o '"BlockType": "WORD"' | wc -l || echo 0) TABLE_COUNT=$(echo "$ANALYZE_OUTPUT" | grep -o '"BlockType": "TABLE"' | wc -l || echo 0) CELL_COUNT=$(echo "$ANALYZE_OUTPUT" | grep -o '"BlockType": "CELL"' | wc -l || echo 0) KEY_VALUE_COUNT=$(echo "$ANALYZE_OUTPUT" | grep -o '"BlockType": "KEY_VALUE_SET"' | wc -l || echo 0) SIGNATURE_COUNT=$(echo "$ANALYZE_OUTPUT" | grep -o '"BlockType": "SIGNATURE"' | wc -l || echo 0) echo "Pages: $PAGE_COUNT" echo "Lines of text: $LINE_COUNT" echo "Words: $WORD_COUNT" echo "Tables: $TABLE_COUNT" echo "Table cells: $CELL_COUNT" echo "Key-value pairs: $KEY_VALUE_COUNT" echo "Signatures: $SIGNATURE_COUNT" echo "" # Cleanup confirmation echo "" echo "===================================================" echo "RESOURCES CREATED" echo "===================================================" for resource in "${RESOURCES_CREATED[@]}"; do echo "- $resource" done echo "" echo "===================================================" echo "CLEANUP CONFIRMATION" echo "===================================================" echo "Cleaning up resources..." # Delete document from S3 echo "Deleting document from S3..." DELETE_DOC_OUTPUT=$(aws s3 rm "s3://$BUCKET_NAME/$DOCUMENT_NAME" 2>&1) DELETE_DOC_STATUS=$? echo "$DELETE_DOC_OUTPUT" check_error $DELETE_DOC_STATUS "$DELETE_DOC_OUTPUT" "aws s3 rm s3://$BUCKET_NAME/$DOCUMENT_NAME" # Delete S3 bucket (only if not shared) if [ "$BUCKET_IS_SHARED" = false ]; then echo "Deleting S3 bucket..." DELETE_BUCKET_OUTPUT=$(aws s3 rb "s3://$BUCKET_NAME" --force 2>&1) DELETE_BUCKET_STATUS=$? echo "$DELETE_BUCKET_OUTPUT" check_error $DELETE_BUCKET_STATUS "$DELETE_BUCKET_OUTPUT" "aws s3 rb s3://$BUCKET_NAME --force" fi # Delete local JSON files rm -f document.json features.json echo "Cleanup complete. The analysis results file (textract-analysis-results.json) has been kept." echo "" echo "===================================================" echo "Tutorial complete!" echo "===================================================" echo "You have successfully analyzed a document using Amazon Textract." echo "The analysis results are available in textract-analysis-results.json" echo "" trap - EXIT