Skip to content

Commit

Permalink
Add original code from encoded checkfiles
Browse files Browse the repository at this point in the history
  • Loading branch information
caseylitton committed Nov 28, 2017
1 parent 415e7d9 commit dce695b
Show file tree
Hide file tree
Showing 8 changed files with 1,405 additions and 2 deletions.
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,11 @@ ENV/

# mypy
.mypy_cache/


# Added from encoded repo
/bin/
/include/
/lib/
pip-selfcheck.json
pyvenv.cfg
26 changes: 24 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,24 @@
# checkfiles
Files are checked to see if the MD5 sum (both for gzipped and ungzipped) is identical to the submitted metadata, as well as run through the validateFiles program from jksrc.
Check Files
===========

Files are checked to verify that their MD5 sums (both gzipped and ungzipped) match the submitted metadata, and are also run through
the validateFiles program from jksrc (http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/validateFiles).
It operates on files in the 'uploading' state (according to the encodeD database) in the encode-files S3 bucket.
Checkfiles is used by the ENCODE DCC to validate genomic datafiles submitted by labs.
The bucket itself is mounted using Goofys (https://github.com/kahing/goofys).
Errors are reported back to encodeD.

Setup
-----

Install required packages for running deploy::

pyvenv .
bin/pip install -r requirements-deploy.txt

Deploy
------

Supply arguments for checkfiles after a ``--`` separator::

bin/python deploy.py -- --username ACCESS_KEY_ID --password SECRET_ACCESS_KEY --bot-token SLACK-BOT-TOKEN https://www.encodeproject.org
1,136 changes: 1,136 additions & 0 deletions checkfiles.py

Large diffs are not rendered by default.

95 changes: 95 additions & 0 deletions cloud-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#cloud-config

# Launch instance with network interfaces configured to the local IP addresses we reference in the config.
# $ aws ec2 run-instances --user-data file://check-files.yml --iam-instance-profile Name="encoded-instance" --image-id ami-5a928a3b --region us-west-2 --security-groups ssh-http-https --instance-type c4.xlarge

bootcmd:
- cloud-init-per once ssh-users-ca echo "TrustedUserCAKeys /etc/ssh/users_ca.pub" >> /etc/ssh/sshd_config

output:
all: '| tee -a /var/log/cloud-init-output.log'

packages:
- fuse
## - golang
- git
- awscli
- curl
- ntp
- python3-dev
- python3-venv

## power_state:
## mode: poweroff

runcmd:

- set -ex
- systemctl daemon-reload # See https://bugs.launchpad.net/cloud-init/+bug/1449318
- mkdir -p /s3/encode-files
- mkdir -p /s3/encoded-files-dev

- curl -sS -L -o /usr/local/bin/goofys https://github.com/kahing/goofys/releases/download/v0.0.5/goofys
- chmod +x /usr/local/bin/goofys

## - mkdir -p /opt/goofys/bin
## - chown -R build:build /opt/goofys
## - sudo -u build GOPATH=/opt/goofys go get github.com/kahing/goofys
## - mv /opt/goofys/bin/goofys /usr/local/bin/

- mount -a

- mkdir /opt/encValData
- chown build:build /opt/encValData
- sudo -u build git clone --depth 1 https://github.com/ENCODE-DCC/encValData /opt/encValData

- curl -sS -L -o /usr/local/bin/validateFiles http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/validateFiles
- chmod +x /usr/local/bin/validateFiles

- mkdir /opt/encoded
- chown build:build /opt/encoded
- sudo -u build git clone --no-checkout https://github.com/ENCODE-DCC/encoded.git /opt/encoded
- sudo -u build git -C /opt/encoded checkout %(COMMIT)s
- chmod +x /opt/encoded/checkfiles/script

- cd /opt/encoded/checkfiles
- sudo -u build pyvenv .
- sudo -u build bin/pip install -r requirements.txt

- cd /home/ubuntu
- nohup /opt/encoded/checkfiles/script $(cat /opt/checkfiles_args.txt) 2> errors.log 1> output.log &

users:
- default
- name: build
gecos: Build user
inactive: true
system: true

mounts:
- [ "goofys-ulimit#encode-files", "/s3/encode-files", "fuse", "_netdev,allow_other,--file-mode=0444,--dir-mode=0555,--stat-cache-ttl=0", "0", "0" ]
- [ "goofys-ulimit#encoded-files-dev", "/s3/encoded-files-dev", "fuse", "_netdev,allow_other,--file-mode=0444,--dir-mode=0555,--stat-cache-ttl=0", "0", "0" ]

write_files:
- path: /opt/checkfiles_args.txt
content: |
%(ARGS)s
- path: /etc/ssh/users_ca.pub
content: ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAv/ymOcnN4LhM4NACc3Or116XXJ6KytuOgB/+1qNkOFBqBosrn7cmJ35rsoNHRgYNrCsRE9ch74RKsN6H72FtSJgBhGh/9oUK7Os6Fqt3/ZZXxgxIx6ubs/MTgrxrAnujiBxUXMXQhLKMriNMpo8mt4nGYVtLk9PBjiyfncaS8H9ZKoNio9dhP8bmTuYvioAI35dqKdSlVLyzr/XkZxia8Ki+pQ0N6uuiEwMR3ToM+LSp8wpFOOAiu4PEAujRW7us/+1hlpKWfn0J7/V3826joHE+I967Vg/+ikcVhF77JjK1nib879VgCWfmn1HPQosIpk4yJfVgGvRVI7I2nfBPVw== [email protected]
- path: /etc/systemd/system/cloud-final.service.d/override.conf
content: |
[Service]
# See https://bugs.launchpad.net/cloud-init/+bug/1449318
KillMode=process
- path: /usr/local/bin/goofys-ulimit
permissions: 0755
content: |
#!/bin/sh
ulimit -n 60000
exec goofys "$@"
- path: /etc/rc.local
permissions: 0755
content: |
#!/bin/sh -e
cd /home/ubuntu; nohup /opt/encoded/checkfiles/script %(ARGS)s 2> errors.log 1> output.log &
exit 0
127 changes: 127 additions & 0 deletions deploy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import boto3
import getpass
import re
import shlex
import subprocess
import sys

# EC2 block device mappings handed to ``create_instances``: a single gp2 EBS
# volume on /dev/sda1 with VolumeSize 512 that is deleted together with the
# instance when it terminates.
BDM = [{
    'DeviceName': '/dev/sda1',
    'Ebs': {
        'VolumeSize': 512,
        'VolumeType': 'gp2',
        'DeleteOnTermination': True,
    },
}]


def nameify(s):
    """Return *s* reduced to a lowercase, hyphen-separated name.

    The input is lowercased, every run of characters outside ``[a-z0-9]`` is
    replaced by a single hyphen, and leading/trailing hyphens are removed.

    Only ASCII letters and digits are kept. ``str.isalnum`` would also accept
    non-ASCII letters and digits (e.g. ``'é'``), which contradicts the
    "only [a-z0-9] and hyphen" rule that callers validate against.
    """
    return re.sub(r'[^a-z0-9]+', '-', s.lower()).strip('-')


def run(image_id, instance_type,
        branch=None, name=None, profile_name=None, args=()):
    """Launch an EC2 instance that runs checkfiles for a pushed commit.

    Resolves ``branch`` to a short commit hash, refuses to continue unless a
    remote-tracking branch contains that commit, renders the commit's
    cloud-config template as user data, creates and tags one instance, and
    waits until the instance is running.

    Args:
        image_id: AMI id for the new instance.
        instance_type: EC2 instance type.
        branch: Git branch or tag to deploy; defaults to the currently
            checked-out branch.
        name: Value for the instance ``Name`` tag; derived from branch,
            commit and local username when omitted.
        profile_name: AWS credentials profile used for the boto3 session.
        args: Arguments for checkfiles; each one is shell-quoted and the
            joined string is substituted into the cloud-config template.

    Exits the process with status 1 when the commit is not contained in any
    remote-tracking branch, or when an instance with the same ``Name`` tag is
    already pending, running, stopping or stopped.
    """
    if branch is None:
        branch = subprocess.check_output(
            ['git', 'rev-parse', '--abbrev-ref', 'HEAD']
        ).decode('utf-8').strip()

    commit = subprocess.check_output(
        ['git', 'rev-parse', '--short', branch]).decode('utf-8').strip()

    # The commit is substituted into the cloud-config template as %(COMMIT)s,
    # so it has to be reachable from a remote-tracking branch.
    if not subprocess.check_output(
            ['git', 'branch', '-r', '--contains', commit]).strip():
        print("Commit %r not in origin. Did you git push?" % commit)
        sys.exit(1)

    username = getpass.getuser()

    if name is None:
        name = nameify('checkfiles-%s-%s-%s' % (branch, commit, username))

    session = boto3.Session(region_name='us-west-2', profile_name=profile_name)
    ec2 = session.resource('ec2')

    domain = 'production' if profile_name == 'production' else 'instance'

    # Refuse to create a second instance carrying the same Name tag.
    if any(ec2.instances.filter(
            Filters=[
                {'Name': 'tag:Name', 'Values': [name]},
                {'Name': 'instance-state-name',
                 'Values': ['pending', 'running', 'stopping', 'stopped']},
            ])):
        print('An instance already exists with name: %s' % name)
        sys.exit(1)

    # Read the template from the commit being deployed, not the working tree.
    # NOTE(review): the path assumes the template lives under ``checkfiles/``
    # in the repository this is run from -- confirm for this repository.
    user_data = subprocess.check_output(
        ['git', 'show', commit + ':checkfiles/cloud-config.yml']
    ).decode('utf-8')
    user_data = user_data % {
        'COMMIT': commit,
        'ARGS': ' '.join(shlex.quote(arg) for arg in args),
    }

    reservation = ec2.create_instances(
        MinCount=1,
        MaxCount=1,
        ImageId=image_id,
        InstanceType=instance_type,
        SecurityGroups=['ssh-http-https'],
        BlockDeviceMappings=BDM,
        UserData=user_data,
        InstanceInitiatedShutdownBehavior='terminate',
        IamInstanceProfile={'Name': 'encoded-instance'},
    )

    instance = reservation[0]  # Instance:i-34edd56f
    print('%s.%s.encodedcc.org' % (instance.instance_id, domain))
    # Tags can only be attached once the instance is visible to the API.
    instance.wait_until_exists()
    instance.create_tags(Tags=[
        {'Key': 'Name', 'Value': name},
        {'Key': 'branch', 'Value': branch},
        {'Key': 'commit', 'Value': commit},
        {'Key': 'started_by', 'Value': username},
    ])
    print('ssh %s.%s.encodedcc.org' % (name, domain))
    print('pending...')
    instance.wait_until_running()
    # The waiter does not refresh the resource's cached attributes; reload
    # them so the state printed below is the current one rather than the
    # state captured when the instance was created.
    instance.load()
    print(instance.state['Name'])


def main():
    """Parse the command line and hand the parsed options to ``run``."""
    import argparse

    def hostname(value):
        # A name is acceptable only when ``nameify`` leaves it unchanged.
        if value == nameify(value):
            return value
        raise argparse.ArgumentTypeError(
            "%r is an invalid hostname, only [a-z0-9] and hyphen allowed."
            % value)

    # (flags, keyword settings) for every option, in registration order.
    option_specs = (
        (('-b', '--branch'),
         {'default': None, 'help': "Git branch or tag"}),
        (('-n', '--name'),
         {'type': hostname, 'help': "Instance name"}),
        (('--image-id',),
         {'default': 'ami-4b37d42b',
          'help': "ubuntu/images/hvm-ssd/"
                  "ubuntu-wily-15.10-amd64-server-20160217.1"}),
        (('--instance-type',),
         {'default': 'c4.xlarge',
          'help': "specify 'c4.8xlarge' if there are many files to check"}),
        (('--profile-name',),
         {'default': None, 'help': "AWS creds profile"}),
        (('args',),
         {'metavar': 'ARG', 'nargs': '*',
          'help': "arguments for checkfiles"}),
    )

    cli = argparse.ArgumentParser(description="Deploy checkfiles on AWS")
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    options = cli.parse_args()
    return run(**vars(options))


if __name__ == '__main__':
    main()
6 changes: 6 additions & 0 deletions requirements-deploy.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
boto3==1.2.4
botocore==1.3.28
docutils==0.12
jmespath==0.9.0
python-dateutil==2.4.2
six==1.10.0
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
requests==2.9.1
slackclient==1.0.6
7 changes: 7 additions & 0 deletions script
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Run checkfiles in an endless loop, sleeping 480 minutes between passes.
# Every pass writes its logs to files prefixed with the pass's start timestamp;
# any arguments given to this script are forwarded to checkfiles.py.
while true
do
    stamp=$(date +%Y%m%d%H%M%S)
    /opt/encoded/checkfiles/bin/python /opt/encoded/checkfiles/checkfiles.py "$@" \
        --out "${stamp}-checkfiles.log" \
        --err "${stamp}-checkfiles-error.log" \
        --include-unexpired-upload \
        2> "${stamp}-errors.log" 1> "${stamp}-output.log"
    sleep 480m
done

0 comments on commit dce695b

Please sign in to comment.