
Merge pull request #8 from seanorama/master
./platforms/hdp now supports re-use of persistent disks across deployments
dennishuo committed Jan 26, 2015
2 parents 0ab4588 + bbe30dd commit daf2b46
Showing 9 changed files with 178 additions and 38 deletions.
6 changes: 3 additions & 3 deletions bdutil_env.sh
@@ -53,16 +53,16 @@ NUM_WORKERS=2
# If true, tries to attach the PDs listed in WORKER_ATTACHED_PDS and
# MASTER_ATTACHED_PD to their respective VMs as a non-boot volume. By default,
# the PDS will be named after the instance names with a "-pd" suffix.
USE_ATTACHED_PDS=false
USE_ATTACHED_PDS=${USE_ATTACHED_PDS:-false}

# Only applicable if USE_ATTACHED_PDS is true; if so, this variable controls
# whether the PDs should be created explicitly during deployment. The PDs
# must not already exist.
CREATE_ATTACHED_PDS_ON_DEPLOY=true
CREATE_ATTACHED_PDS_ON_DEPLOY=${CREATE_ATTACHED_PDS_ON_DEPLOY:-true}

# Only applicable if USE_ATTACHED_PDS is true; if so, this variable controls
# whether the PDs should be deleted explicitly when deleting the cluster.
DELETE_ATTACHED_PDS_ON_DELETE=true
DELETE_ATTACHED_PDS_ON_DELETE=${DELETE_ATTACHED_PDS_ON_DELETE:-true}

# Only applicable during deployment if USE_ATTACHED_PDS is true and
# CREATE_ATTACHED_PDS_ON_DEPLOY is true. Specifies the size, in GB, of
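
The switch to `${VAR:-default}` expansions means these persistent-disk settings can now be overridden from the calling shell instead of by editing `bdutil_env.sh`. A minimal sketch of the intended usage, mirroring the invocation shown in this commit's README changes:

```
## Sketch: values exported here now take precedence over the defaults in
## bdutil_env.sh, e.g. to reuse disks created by an earlier deployment.
export USE_ATTACHED_PDS=true
export CREATE_ATTACHED_PDS_ON_DEPLOY=false
./bdutil -e platforms/hdp/ambari_env.sh deploy
```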
12 changes: 7 additions & 5 deletions hadoop-validate-setup.sh
@@ -31,7 +31,7 @@
#

# Default to 10MB (100k records).
TERA_GEN_NUM_RECORDS=100000
TERA_GEN_NUM_RECORDS=${TERA_GEN_NUM_RECORDS:-100000}

# File hadoop-config.sh
HADOOP_CONFIGURE_CMD=''
@@ -129,8 +129,10 @@ if [[ ${EXIT_CODE} -ne 0 ]]; then
fi
echo 'teragen, terasort, teravalidate passed.'

echo "Cleaning the data created by tests: ${PARENT_DIR}"
if [ ! "${TERA_CLEANUP_SKIP}" ]; then
echo "Cleaning the data created by tests: ${PARENT_DIR}"

CLEANUP_CMD="${HADOOP_CMD} dfs -rmr -skipTrash ${PARENT_DIR}"
echo ${CLEANUP_CMD}
eval ${CLEANUP_CMD}
CLEANUP_CMD="${HADOOP_CMD} dfs -rmr -skipTrash ${PARENT_DIR}"
echo ${CLEANUP_CMD}
eval ${CLEANUP_CMD}
fi
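
With this guard in place, setting `TERA_CLEANUP_SKIP` leaves the teragen/terasort output on HDFS for later inspection. A small usage sketch, matching the invocation shown in the TEST.md changes below:

```
## Sketch: validate the cluster but keep the generated test data.
TERA_CLEANUP_SKIP=true TERA_GEN_NUM_RECORDS=100000 ./hadoop-validate-setup.sh
```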
5 changes: 3 additions & 2 deletions libexec/hadoop_helpers.sh
@@ -138,8 +138,9 @@ function start_with_retry_jobtracker() {
# TODO: Also check HDFS and any other filesystem we expect to work.
function check_filesystem_accessibility() {
if (( ${INSTALL_GCS_CONNECTOR} )) ; then
local fs_cmd="${HADOOP_INSTALL_DIR}/bin/hadoop fs"
if ${fs_cmd} -test -d gs://${CONFIGBUCKET}; then
local hdfs_superuser=$(get_hdfs_superuser)
local dfs_cmd="sudo -i -u ${hdfs_superuser} hadoop fs"
if ${dfs_cmd} -test -d gs://${CONFIGBUCKET}; then
return 0
else
local errcode=$?
84 changes: 75 additions & 9 deletions platforms/hdp/README.md
@@ -3,14 +3,20 @@
Hortonworks Data Platform (HDP) on Google Cloud Platform
========================================================

Deploying Hadoop clusters with **Google's bdutil & Apache Ambari**.
This extension to Google's [bdutil](https://github.com/GoogleCloudPlatform/bdutil) provides support for deploying the [Hortonworks Data Platform](http://hortonworks.com/) with a single command.

The extension utilizes Apache Ambari's Blueprint Recommendations to fully configure the cluster without the need for manual configuration.

Resources
---------

* [Google documentation](https://cloud.google.com/hadoop/) for bdutil & Hadoop on Google Cloud Platform.
* [Latest source on Github](https://github.com/GoogleCloudPlatform/bdutil). Use & improve.
* [Video Tutorial](http://youtu.be/raCtS84Vb6w)
* [Source on Github](https://github.com/GoogleCloudPlatform/bdutil). Open to the community and welcoming your collaboration.

Video Tutorial
--------------

[<img src="http://img.youtube.com/vi/raCtS84Vb6w/0.jpg" width="320px" />](http://www.youtube.com/watch?v=raCtS84Vb6w)

Before you start
----------------
@@ -19,7 +25,7 @@ Before you start

- open https://console.developers.google.com/
- sign-in or create an account
- The "free trial" [may be used](#common-issues)
- The "free trial" [may be used](#questions)


#### Create a Google Cloud Project
@@ -121,17 +127,77 @@ For command-line based jobs, 'bdutil' gives methods for passing through commands

For example: `./bdutil shell < ./extensions/google/gcs-validate-setup.sh`

Common issues
-------------
Questions
---------

### Can I set/override Hadoop configurations during deployment?

For adding/overriding Hadoop configurations, update `configuration.json` and then use the extension as documented. And contribute back if you think the defaults should be changed.
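
A rough sketch of that workflow, assuming `configuration.json` lives alongside the other files in the `platforms/hdp/` extension directory (path assumed, not confirmed by this diff):

```
## Sketch: edit the overrides, then deploy as documented so the Ambari
## Blueprint picks them up (file path assumed).
${EDITOR:-vi} platforms/hdp/configuration.json
./bdutil -e platforms/hdp/ambari_env.sh deploy
```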

### Can I deploy HDP manually using Ambari and/or use my own Ambari Blueprints?

Yes. Set `ambari_manual_env.sh` as your environment _(with the -e switch)_ instead of `ambari_env.sh`. That will configure Ambari across the cluster & handle all HDP prerequisites, but not trigger the Ambari Blueprints which install HDP.

Note that these steps will not be taken for you:

- initialization of HDFS /user directories _(Check the function `initialize_hdfs_dirs` in `../../libexec/hadoop_helpers.sh`)_
- installation of the GCS connector. _(Check `./install_gcs_connector_on_ambari.sh` & `./update_ambari_config.sh`)_
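
A minimal sketch of the manual path, using the same `-e` invocation style as the rest of this README:

```
## deploy Ambari and the HDP prerequisites only; install HDP yourself via Ambari
./bdutil -e platforms/hdp/ambari_manual_env.sh deploy
```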

### Can I re-use the attached persistent disk(s) across deployments?

`bdutil` supports keeping persistent disks _(aka `ATTACHED_PDS`)_ online when deleting machines. It can then deploy a new cluster using the same disks without loss of data, **assuming the number of workers is the same**.

The basic commands are below. Find more detail in [TEST.md](./TEST.md).

### 'Free Trial' users or those with limited quota
```
## deploy the cluster & create disks
./bdutil -e platforms/hdp/ambari_env.sh deploy
## delete the cluster but don't delete the disks
export DELETE_ATTACHED_PDS_ON_DELETE=false
./bdutil -e platforms/hdp/ambari_env.sh delete
## create with existing disks
export CREATE_ATTACHED_PDS_ON_DEPLOY=false
./bdutil -e platforms/hdp/ambari_env.sh deploy
```

Another option is to use `gs://` _(Google Cloud Storage)_ instead of `hdfs://` in your Hadoop jobs, even setting it as the default. Or back up HDFS to Google Cloud Storage before cluster deletion.

**Note**: Hortonworks can't guarantee the safety of data throughout this process. You should always take care when manipulating disks and have backups where necessary.

### What are the built-in storage options?

By default, HDFS is on **attached disks** _('pd-standard' or 'pd-ssd')_.
- the size and type can be set in `ambari.conf`

The rest of the system resides on the **local boot disk**, unless configured otherwise.

**Google Cloud Storage** is also available with **`gs://`**. It can be used anywhere that `hdfs://` is available, such as but not limited to mapreduce & `hadoop fs` operations.

- Note: Adding an additional slash (`gs:///`) will allow you to use the default bucket (defined at cluster build) without needing to specify it.
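
A quick illustration of the two forms (bucket name hypothetical):

```
## explicit bucket
hadoop fs -ls gs://my-config-bucket/
## default bucket defined at cluster build
hadoop fs -ls gs:///
```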

### Can I deploy in the Google Cloud Platform _Free Trial_ ?

You may use bdutil with HDP by lowering the machine type & count below the recommended specifications. To use the default configuration, upgrade the account from a free trial.

* In 'platforms/hdp/ambari.conf':
* GCE_MACHINE_TYPE='n1-standard-2'
* WORKERS=3 # or less
* `GCE_MACHINE_TYPE='n1-standard-2'`
* `WORKERS=3 # or less`
* Or provide these switches on the command line to 'deploy' & 'delete':
* Deploy cluster: `-n 3 -m n1-standard-2`
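
For example, a single deploy command with those switches (sketch; project and bucket flags omitted):

```
./bdutil -e platforms/hdp/ambari_env.sh -n 3 -m n1-standard-2 deploy
```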

Known Issues
------------


Feedback & Issues
-----------------

- <http://github.com/seanorama/bdutil/>
- <http://twitter.com/seano>

License
-------

[Apache License, Version 2.0](../../LICENSE)
78 changes: 67 additions & 11 deletions platforms/hdp/TEST.md
@@ -1,37 +1,93 @@
### Prep
## Prep

```
CONFIGBUCKET=hdp-play-00
PROJECT=hdp-play-00
CONFIGBUCKET=hdp-00
PROJECT=hdp-00
switches="-b ${CONFIGBUCKET} -p ${PROJECT}"
switches="-b ${CONFIGBUCKET} -p ${PROJECT}
# add this to make it a smaller test than the defaults
switches+="
--master_attached_pd_size_gb 100
--worker_attached_pds_size_gb 100
-n 4
-n 1
-m n1-standard-2"
bdutil="./bdutil ${switches}"
```

### Test ambari_env.sh
## Test ambari_env.sh

```
environment=platforms/hdp/ambari_env.sh
${bdutil} -e ${environment} deploy
bdutil="${bdutil} -e ${environment}"
## deploy
${bdutil} deploy
## test
${bdutil} shell < ./hadoop-validate-setup.sh
${bdutil} shell < ./hadoop-validate-gcs.sh
${bdutil} shell < ./extensions/querytools/hive-validate-setup.sh
${bdutil} shell < ./extensions/querytools/pig-validate-setup.sh
#${bdutil} shell < ./extensions/spark/spark-validate-setup.sh
${bdutil} -e ${environment} delete
## delete
${bdutil} delete
```


# Test ambari_manual_env.sh
## Test ambari_manual_env.sh

```
environment=platforms/hdp/ambari_manual_env.sh
${bdutil} -e ${environment} deploy
bdutil="${bdutil} -e ${environment}"
## deploy
${bdutil} deploy
## test
# need to add an automated test here:
${bdutil} shell # do something here like check the appropriate number of hosts in /api/v1/hosts
${bdutil} -e ${environment} delete
## delete
${bdutil} delete
```

## Test re-using disks across multiple deployments of same instance count

```
environment=platforms/hdp/ambari_env.sh
bdutil="${bdutil} -e ${environment}"
unset CREATE_ATTACHED_PDS_ON_DEPLOY
unset DELETE_ATTACHED_PDS_ON_DELETE
## create
export CREATE_ATTACHED_PDS_ON_DEPLOY=true
${bdutil} deploy
## generate some data onto HDFS, and don't delete it
echo "hadoop fs -mkdir redeploy-validation.tmp" | ${bdutil} shell
## if you want more data than that:
#${bdutil} -u hadoop-validate-setup.sh run_command -- \
# sudo -u "$(whoami)" TERA_CLEANUP_SKIP=true TERA_GEN_NUM_RECORDS=100000 ./hadoop-validate-setup.sh
## check that the ‘validate_...’ dir is there
echo "hadoop fs -ls" | ${bdutil} shell
## delete the cluster but keep disks
export DELETE_ATTACHED_PDS_ON_DELETE=false
${bdutil} delete
## create with existing disks
export CREATE_ATTACHED_PDS_ON_DEPLOY=false
${bdutil} deploy
## check that the ‘validate_...’ dir is there
echo "hadoop fs -ls" | ${bdutil} shell
## delete everything to cleanup this testing
export DELETE_ATTACHED_PDS_ON_DELETE=true
${bdutil} delete
```
1 change: 1 addition & 0 deletions platforms/hdp/ambari_env.sh
@@ -50,6 +50,7 @@ COMMAND_GROUPS+=(
"install-gcs-connector-on-ambari:
platforms/hdp/install_gcs_connector_on_ambari.sh
"

"update-ambari-config:
platforms/hdp/update_ambari_config.sh
"
8 changes: 8 additions & 0 deletions platforms/hdp/install_ambari.sh
@@ -28,6 +28,14 @@ setenforce 0
sed -i 's/\(^[^#]*\)SELINUX=enforcing/\1SELINUX=disabled/' /etc/selinux/config
sed -i 's/\(^[^#]*\)SELINUX=permissive/\1SELINUX=disabled/' /etc/selinux/config

## workaround as some components of Ambari & the HDP stack are hard
## coded to /var/lib/hdfs
if [ ! -d /hadoop/hdfs ]; then mkdir /hadoop/hdfs; fi
ln -sf /hadoop/hdfs /var/lib/

## sudo should not require a tty. This is fixed in rhel/centos 7+
echo 'Defaults !requiretty' > /etc/sudoers.d/888-dont-requiretty

## disable transparent_hugepages
cp -a ./thp-disable.sh /usr/local/sbin/
sh /usr/local/sbin/thp-disable.sh || /bin/true
8 changes: 1 addition & 7 deletions platforms/hdp/install_ambari_components.sh
@@ -100,10 +100,4 @@ ${AMBARI_CURL} -X POST -d @${CLUSTER_TEMPLATE_FILE} \
loginfo "Waiting for ambari cluster creation to complete (may take awhile)."
ambari_wait_requests_completed

# Set up HDFS /user directories.
loginfo "Setting up HDFS /user directories."
for USER in $(getent passwd | grep '/home' | cut -d ':' -f 1); do
echo "Creating HDFS directory for user '${USER}'"
sudo -u hdfs hdfs dfs -mkdir -p "/user/${USER}"
sudo -u hdfs hdfs dfs -chown "${USER}" "/user/${USER}"
done
loginfo "Ambari is now available at http://${PREFIX}-m:8080/"
14 changes: 13 additions & 1 deletion platforms/hdp/update_ambari_config.sh
@@ -12,8 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Makes post-cluster build configuration changes
# finalize the cluster configuration

source hadoop_helpers.sh

# initialize hdfs dirs
if [ "${DEFAULT_FS}" = 'hdfs' ]; then
loginfo "Set up HDFS /tmp and /user dirs"
initialize_hdfs_dirs
fi

# update hadoop configuration to include the gcs connector
if (( ${INSTALL_GCS_CONNECTOR} )) ; then
loginfo "adding /usr/local/lib/hadoop/lib to mapreduce.application.classpath."
NEW_CLASSPATH=$(/var/lib/ambari-server/resources/scripts/configs.sh get localhost ${PREFIX} mapred-site | grep -E '^"mapreduce.application.classpath"' | tr -d \" | awk '{print "/usr/local/lib/hadoop/lib/*,"$3}' | sed 's/,$//')
@@ -27,4 +36,7 @@ if (( ${INSTALL_GCS_CONNECTOR} )) ; then
ambari_service_start
ambari_wait_requests_completed
done

# check if GCS is accessible
check_filesystem_accessibility
fi
