forked from Smerity/cc-mrjob
-
Notifications
You must be signed in to change notification settings - Fork 65
/
mrjob.conf
42 lines (36 loc) · 1.57 KB
/
mrjob.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# mrjob configuration for running jobs on Amazon EMR.
# All options below belong to the `emr` runner and must stay nested
# under `runners` -> `emr`.
runners:
  emr:
    # AWS region
    #  - recommended region: us-east-1 (where Common Crawl data lives)
    #  - if us-east-1 is used, a bucket (cloud_tmp_dir) used to keep job
    #    data must also be specified (it must also be in us-east-1)
    region: us-east-1
    # <<< set to a bucket in the us-east-1 region you have write
    #     permissions for
    cloud_tmp_dir: s3://my-bucket/tmp/
    # The availability zone could also be specified
    # zone: us-east-1c

    # Either set the environment variables AWS_ACCESS_KEY_ID and
    # AWS_SECRET_ACCESS_KEY or set the two variables below
    # aws_access_key_id: ...
    # aws_secret_access_key: ...

    # For more control over ssh, it's highly recommended to add your
    # key pair
    # ec2_key_pair: ...
    # ec2_key_pair_file: ...
    # ssh_tunnel: true
    # ssh_tunnel_to_job_tracker: true

    # Connect to a running EMR cluster to launch the job
    # (Note: the running cluster must have all dependencies installed;
    #  bootstrapping is only performed when a cluster is created)
    # cluster_id: ...

    # ... or run a cluster of 1 master + 2 core instances
    # (all spot instances, requested at the bid prices below)
    master_instance_type: r3.xlarge
    core_instance_type: r3.xlarge
    core_instance_bid_price: 0.15
    master_instance_bid_price: 0.15
    num_core_instances: 2

    # EMR version (if not the default version):
    # image_version: 5.12.0

    interpreter: python2.7

    # Commands run while bootstrapping a newly created cluster:
    # install Python 2.7 plus build tools, then the job's Python
    # dependencies.
    bootstrap:
      - sudo yum install -y python27 python27-pip python27-devel gcc-c++
      - sudo pip-2.7 install boto mrjob simplejson warc
      - sudo pip-2.7 install https://github.com/commoncrawl/gzipstream/archive/master.zip