#!/usr/bin/python
# Copyright 2011  Lars Wirzenius
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import cliapp
import ConfigParser
import logging
import os
import re
import shutil
import subprocess
import tempfile
import time


__version__ = '1.17'  # program version; passed to cliapp via Seivot(version=__version__)


class Measurement(object):

    '''Timing and repository statistics for one benchmarked operation.

    The constructor parses the output of /usr/bin/time run with a
    format producing one float per line: user time, system time,
    real (wall-clock) time, and maximum RSS.  The repository
    statistics are initialised to zero; callers fill them in later.

    '''

    def __init__(self, time_output):
        parsed = [float(line) for line in time_output.splitlines()]
        self.user = parsed[0]
        self.system = parsed[1]
        self.real = parsed[2]
        self.maxrss = parsed[3]
        # Repository statistics, filled in after the fact by whoever
        # ran the command (see Obnam.extract_repository_io and
        # Seivot.measure).
        for counter in ('new_data', 'repo_size_after',
                        'repo_bytes_written', 'repo_bytes_read',
                        'repo_roundtrips'):
            setattr(self, counter, 0)


def runcmd(argv, **kwargs):
    '''Run argv under /usr/bin/time and measure it.

    Return a tuple (Measurement, stdout), where stdout is whatever
    p.communicate returned for standard output (None unless the
    caller passed stdout=subprocess.PIPE in kwargs).

    Raise cliapp.AppException if the command exits with a non-zero
    status.

    '''
    logging.debug('run: %s %s' % (argv, kwargs))
    # /usr/bin/time writes its measurements to the temporary file by
    # name; we read them back through the still-open fd.
    fd, timings = tempfile.mkstemp()
    try:
        time_argv = ['/usr/bin/time',
                     '-o', timings,
                     '--format', '%U\n%S\n%e\n%M']
        p = subprocess.Popen(time_argv + argv, **kwargs)
        out, err = p.communicate()
        data = os.read(fd, 1024**2)
    finally:
        # Clean up even if Popen/communicate raises, so we do not
        # leak the fd or leave the temporary file behind.
        os.close(fd)
        if os.path.exists(timings):
            os.remove(timings)
    if p.returncode != 0:
        raise cliapp.AppException('command failed: %s\n%s' % (argv, err))

    return Measurement(data), out


def drop_caches():
    '''Clear the Linux kernel buffer and inode caches.

    See http://linux-mm.org/Drop_Caches for details.  This writes to
    /proc/sys/vm/drop_caches via sudo tee, so it may prompt for a
    password.

    '''
    logging.debug('clearing Linux kernel cache')
    argv = ['sudo', '-p', 'Password (for clearing cache): ',
            'tee', '/proc/sys/vm/drop_caches']
    p = subprocess.Popen(argv, stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    # Writing 3 drops page cache plus dentries and inodes.
    p.communicate('%s\n' % 3)
    if p.returncode != 0:
        raise cliapp.AppException('failed to clear cache')


class BackupProgram(object):

    '''Abstract interface to a backup program being benchmarked.

    Subclasses implement the actual operations.  Each benchmarked
    operation receives the number of the backup generation it
    applies to.

    '''

    # Subclasses set this to the name by which the program is
    # selected on the command line.
    name = None

    def __init__(self, live_data, repo, settings):
        self.live_data = live_data
        self.repo = repo
        self.settings = settings

    def set_meta(self, cp):
        '''Add program-specific fields to the report's [meta] section.

        For example, a subclass running from a version control
        branch might record the revision identifier here.

        '''

    def prepare(self):
        '''Get the program ready for benchmarking, e.g. compile it.'''

    def backup(self, nth_gen):
        '''Back up live_data into repo.

        This should start a new generation, in whatever way is
        most appropriate for the backup program.

        '''

    def fsck(self, nth_gen):
        '''Check repository consistency after the nth backup generation.'''

    def verify(self, nth_gen):
        '''Verify the repository after the nth backup generation.'''

    def list_files(self, nth_gen):
        '''List all files in a generation, writing the list to /dev/null.'''

    def restore(self, nth_gen, target_dir):
        '''Restore every file in a generation into target_dir.'''

    def forget(self, nth_gen):
        '''Remove a given generation from the repository.'''


class Obnam(BackupProgram):

    '''Benchmark the obnam backup program.

    Obnam may be run either as installed, or from a bzr branch (the
    obnam-branch setting); larch may likewise be picked up from a
    branch via PYTHONPATH (the larch-branch setting).

    '''

    name = 'obnam'

    @property
    def _cmd(self):
        '''Command to invoke obnam with.

        Commands are run with cwd=self._branch, so when running from
        a branch the local ./obnam script is used.

        '''
        if self.settings['obnam-branch']:
            return './obnam'
        else:
            return 'obnam'

    @property
    def _branch(self):
        '''Directory of the obnam bzr branch, or None if installed.'''
        return self.settings['obnam-branch'] or None

    @property
    def _revno(self):
        '''Return the bzr revision number of the obnam branch.'''
        timings, revno = runcmd(['bzr', 'revno'], cwd=self._branch,
                                stdout=subprocess.PIPE)
        return revno.strip()

    @property
    def _larch_branch(self):
        '''Directory of the larch bzr branch, or None if installed.'''
        return self.settings['larch-branch'] or None

    @property
    def _larch_revno(self):
        '''Return the bzr revision number of the larch branch.'''
        timings, revno = runcmd(['bzr', 'revno'], cwd=self._larch_branch,
                                stdout=subprocess.PIPE)
        return revno.strip()

    def _run(self, args, nth_gen, **kwargs):
        '''Run one obnam operation and measure it.

        args is the obnam subcommand plus its arguments; nth_gen is
        the generation number, used to name profiling output.
        Returns whatever runcmd returns, with the repository I/O
        statistics from obnam's log merged into the Measurement.

        '''
        fd, logfile = tempfile.mkstemp()
        os.close(fd)
        cmd = [self._cmd,
               '--no-default-configs',
               '--log', logfile,
               '--repository', self.repo,
               '--sftp-delay', str(self.settings['sftp-delay']),
               '--leave-checkpoints',
               '--weak-random']
        if self.settings['obnam-config']:
            cmd.extend(['--config', self.settings['obnam-config']])
        env = dict(os.environ)
        if self.settings['obnam-profile']:
            namepattern = {
                'gen': str(nth_gen),
                'op': args[0],
            }
            env['OBNAM_PROFILE'] = self.settings['obnam-profile'] % namepattern
        if self._larch_branch:
            # Put the larch branch first on PYTHONPATH so it shadows
            # any installed larch.
            old = env.get('PYTHONPATH')
            if old:
                new = '%s:%s' % (self._larch_branch, old)
            else:
                new = self._larch_branch
            env['PYTHONPATH'] = new
        if self.settings['encrypt-with']:
            cmd += ['--encrypt-with', self.settings['encrypt-with']]
        result = runcmd(cmd + args, cwd=self._branch, env=env, **kwargs)

        self.extract_repository_io(result[0], logfile)
        os.remove(logfile)

        return result

    def extract_repository_io(self, measurement, logfile):
        '''Parse obnam's log for repository I/O statistics.

        Fills in repo_bytes_written, repo_bytes_read, and
        repo_roundtrips on measurement from the VFS log lines that
        mention our repository.

        '''
        pat = re.compile(r' VFS:( __del__:)? baseurl=.* '
                         r'read=(?P<read>\d+) written=(?P<written>\d+)')
        pat2 = re.compile(r' VFS: baseurl=.* roundtrips=(?P<n>\d+)')

        f = open(logfile)
        for line in f:
            m = pat.search(line)
            m2 = pat2.search(line)
            if m and self.repo in line:
                # Later matches overwrite earlier ones, so we end up
                # with the final totals logged.
                measurement.repo_bytes_written = long(m.group('written'))
                measurement.repo_bytes_read = long(m.group('read'))
            elif m2 and self.repo in line:
                measurement.repo_roundtrips = long(m2.group('n'))
        f.close()

    def prepare(self):
        '''Build obnam in its branch, if running from a branch.'''
        if self._branch:
            if os.path.exists(os.path.join(self._branch, 'Makefile')):
                logging.info('Building obnam in %s with make' % self._branch)
                runcmd(['make'], cwd=self._branch)
            else:
                logging.info('Building obnam in %s with setup.py' %
                             self._branch)
                # Bug fix: build in the branch directory, like the
                # make case above, not in the current directory.
                runcmd(['python', 'setup.py', 'build_ext', '-i'],
                       cwd=self._branch)

    def backup(self, nth_gen):
        return self._run(['backup', self.live_data], nth_gen)[0]

    def fsck(self, nth_gen):
        return self._run(['fsck'], nth_gen)[0]

    def verify(self, nth_gen):
        return self._run(['verify', self.live_data], nth_gen)[0]

    def _genid(self, nth_gen):
        '''Return the generation id of the nth generation.'''
        timings, out = self._run(['genids'], nth_gen, stdout=subprocess.PIPE)
        return out.splitlines()[nth_gen]

    def list_files(self, nth_gen):
        devnull = os.open('/dev/null', os.O_WRONLY)
        timings, out = self._run(['ls', self._genid(nth_gen)], nth_gen,
                                 stdout=devnull)
        os.close(devnull)
        return timings

    def restore(self, nth_gen, target_dir):
        return self._run(['restore', '--to', target_dir,
                           '--generation', self._genid(nth_gen)], nth_gen)[0]

    def forget(self, nth_gen, gen):
        # Note: gen is the generation to remove; nth_gen only names
        # profiling output.
        return self._run(['forget', self._genid(gen)], nth_gen)[0]

    def set_meta(self, cp):
        '''Record obnam/larch branch and revision info in the report.'''
        if self._branch:
            cp.set('meta', 'obnam-branch', self._branch)
            cp.set('meta', 'obnam-branch-nick', os.path.basename(self._branch))
            cp.set('meta', 'revision', self._revno)
        if self._larch_branch:
            cp.set('meta', 'larch-revision', self._larch_revno)
        cp.set('meta', 'sftp-delay', str(self.settings['sftp-delay']))


class BackupProgramFactory(object):

    '''Look up and instantiate backup program classes by name.'''

    programs = [Obnam]

    def names(self):
        '''Return the names of all known backup programs.'''
        return [cls.name for cls in self.programs]

    def new(self, name, **kwargs):
        '''Create an instance of the program called name, or return None.'''
        matches = [cls for cls in self.programs if cls.name == name]
        if matches:
            return matches[0](**kwargs)
        return None


class Report(object):

    '''Collect per-operation measurements and write an INI-style report.

    Measurements are stored per operation name and generation
    number.

    '''

    def __init__(self, program):
        self.program = program
        # Mapping: operation name -> {generation number -> Measurement}.
        self.measurements = {}

    @property
    def generations(self):
        '''Set of all generation numbers with at least one measurement.'''
        all_gens = set()
        for gens in self.measurements.values():
            all_gens.update(gens)
        return all_gens

    @property
    def operations(self):
        '''All operation names that have been measured.'''
        return self.measurements.keys()

    def add_measurement(self, op, gen, measurement):
        '''Record the measurement for operation op at generation gen.'''
        self.measurements.setdefault(op, {})[gen] = measurement

    def get_measurement(self, op, gen):
        '''Return the measurement for operation op at generation gen.'''
        return self.measurements[op][gen]

    def format(self, fp):
        '''Write the report to the open file fp in INI format.'''
        cp = ConfigParser.ConfigParser()

        cp.add_section('meta')
        cp.set('meta', 'program', self.program.name)
        cp.set('meta', 'encrypted',
               'yes' if self.program.settings['encrypt-with'] else 'no')
        if self.program.settings['description']:
            cp.set('meta', 'description', self.program.settings['description'])
        if self.program.settings['profile-name']:
            cp.set('meta', 'profile-name',
                   self.program.settings['profile-name'])
        self.program.set_meta(cp)

        # One section per generation, with all operations' numbers
        # for that generation inside it.
        for gen in self.generations:
            section = str(gen)
            cp.add_section(section)
            for op in self.operations:
                m = self.get_measurement(op, gen)
                for field in ('user', 'system', 'real', 'maxrss'):
                    cp.set(section, '%s.%s' % (op, field),
                           '%.1f' % getattr(m, field))
                for option, value in [
                        ('%s.new-data' % op, m.new_data),
                        ('%s.repo-size-after' % op, m.repo_size_after),
                        ('%s.repo-bytes-written' % op, m.repo_bytes_written),
                        ('%s.repo-bytes-read' % op, m.repo_bytes_read),
                        ('%s.repo-roundtrips' % op, m.repo_roundtrips)]:
                    cp.set(section, option, value)

        cp.write(fp)


class Seivot(cliapp.Application):

    def add_settings(self):
        self.factory = BackupProgramFactory()

        self.settings.choice(['program'], self.factory.names(),
                             'program to benchmark (%default)')
        self.settings.string(['description'],
                             'free-form description of this backup run',
                             metavar='TEXT')
        self.settings.string(['profile-name'],
                             'name of backup use-case profile name '
                                '(for documentation purposes only)')

        self.settings.integer(['generations'],
                              'total number of generations to '
                                'measure (%default)',
                               metavar='COUNT',
                               default=5)
        self.settings.bytesize(['initial-data'],
                               'size of initial live data (%default)',
                               metavar='SIZE',
                               default=1024)
        self.settings.bytesize(['incremental-data'],
                               'add SIZE live data for '
                                    'additional generations '
                                    '(%default)',
                               metavar='SIZE',
                               default=1024)
        self.settings.string(['use-existing'],
                             'use exiting DIR for initial generation',
                             metavar='DIR')
        self.settings.bytesize(['file-size'], 'size of files to create',
                               default=16*1024)
        self.settings.string(['obnam-branch'],
                             'bzr branch from which to run obnam '
                                '(default is installed obnam)')
        self.settings.string(['larch-branch'],
                             'bzr branch from which to use larch '
                                '(default is installed larch)')
        self.settings.string(['obnam-profile'],
                             'store Python profiling output '
                                'in files named after NAMEPATTERN '
                                '(no profiling, unless set); '
                                '%(foo)s in pattern gets filled '
                                'in, where foo is op (for '
                                'backup/restore/etc), gen, or '
                                'order (cumulative/time)',
                              metavar='NAMEPATTERN',
                              default='')
        self.settings.string(['encrypt-with'],
                             'encrypt backups with KEYID',
                             metavar='KEYID')
                                         
        self.settings.boolean(['drop-caches'],
                             'clear Linux kernel cache before '
                               'running commands (will ask for sudo pasword')

        self.settings.boolean(['use-sftp-repository'],
                              'access backup repository over the '
                                'network via sftp')
        self.settings.boolean(['use-sftp-root'],
                              'access backup repository over the '
                                'network via sftp')
        self.settings.integer(['sftp-delay'],
                              'add an artificial delay (in milliseconds) to '
                                'all SFTP transfers')

        self.settings.boolean(['verify'], 'verify restored data')

        self.settings.boolean(['benchmark-fsck'], 'benchmark fsck?')
        
        self.settings.string(['obnam-config'], 
                             'use FILE as an Obnam configuration file',
                             metavar='FILE')

    def process_args(self, args):
        progname = self.settings['program']
        logging.info('Benchmarking: %s' % progname)
        
        generations = self.settings['generations']

        self.tempdir = tempfile.mkdtemp()
        logging.info('tempdir: %s' % self.tempdir)
        self.live_data = os.path.join(self.tempdir, 'data')
        self.repo = os.path.join(self.tempdir, 'repo')

        if self.settings['use-sftp-root']:
            live_url = 'sftp://localhost%s' % self.live_data
        else:
            live_url = self.live_data

        if self.settings['use-sftp-repository']:
            repo_url = 'sftp://localhost%s' % self.repo
        else:
            repo_url = self.repo

        prog = self.factory.new(progname, live_data=live_url,
                                repo=repo_url,
                                settings=self.settings)
        prog.prepare()
        
        self.report = Report(prog)

        if self.settings['use-existing']:
            print 'Copying existing data to %s' % self.live_data
            self.runcmd(['cp', '-a', self.settings['use-existing'], 
                         self.live_data])
            amount = self.file_sizes(self.live_data)
        else:
            amount = self.settings['initial-data']
            self.generate_live_data(self.live_data, amount)
        if self.settings['verify']:
            self.summain(self.live_data, 'backup-0.summain')
        self.measure(prog.backup, 0, amount)
        if self.settings['benchmark-fsck']:
            self.measure(prog.fsck, 0, amount)
        self.measure(prog.verify, 0, amount)

        for i in range(1, generations):
            self.generate_live_data(self.live_data,
                                    self.settings['incremental-data'])
            if self.settings['verify']:
                self.summain(self.live_data, 'backup-%d.summain' % i)
            self.measure(prog.backup, i, self.settings['incremental-data'])
            if self.settings['benchmark-fsck']:
                self.measure(prog.fsck, i, self.settings['incremental-data'])
            self.measure(prog.verify, i, self.settings['incremental-data'])

        for i in range(generations):
            self.measure(prog.list_files, i, 0)

        for i in range(generations):
            target_dir = os.path.join(self.tempdir, 'restored')
            os.mkdir(target_dir)
            self.measure(prog.restore, i, 0, target_dir=target_dir)
            if self.settings['verify']:
                self.verify(target_dir, i)
            shutil.rmtree(target_dir)

        for i in range(generations):
            # Since we remove oldest first, we always remove the 0th
            # generation, not the ith one.
            self.measure(prog.forget, i, 0, gen=0)

        self.report.format(self.output)

    def generate_live_data(self, where, size):
        logging.info('Generating %d bytes live data' % size)
        runcmd(['genbackupdata', where, '--create', str(size),
                '--file-size', str(self.settings['file-size'])])

    def summain(self, dirname, basename):
        '''Remember state of dirname at this time.
        
        This runs the summain(1) utility against dirname and stores
        the result in a file called basename in the temporary directory.
        
        '''
        
        # We exclude mtime from summain output, because there are very
        # small time differences that I choose to ignore at this time.
        self.runcmd(['summain', '--relative', '--exclude=mtime', '--output',
                     os.path.join(self.tempdir, basename), dirname])

    def verify(self, dirname, generation):
        '''Verify that the generation was restored correctly.'''

        root = os.path.join(dirname, './' + self.live_data)
        self.summain(root, 'restored-%d.summain' % generation)
        orig = os.path.join(self.tempdir, 'backup-%d.summain' % generation)
        rest = os.path.join(self.tempdir, 'restored-%d.summain' % generation)
        self.runcmd(['diff', '-u', orig, rest])

    def file_sizes(self, dirname):
        bytes = 0
        for dirname, subdirs, basenames in os.walk(dirname):
            for filename in [os.path.join(dirname, x) for x in basenames]:
                if os.path.isfile(filename):
                    bytes += os.path.getsize(filename)
        return bytes

    def measure(self, func, nth_gen, new_data, **kwargs):
        logging.info('Measuring %s gen %d' % (func.__name__, nth_gen))
        print 'Measuring %s generation %s' % (func.__name__, nth_gen)
        if self.settings['drop-caches']:
            drop_caches()
        measurement = func(nth_gen, **kwargs)
        measurement.new_data = new_data
        measurement.repo_size_after = self.disk_usage(self.repo)
        self.report.add_measurement(func.__name__, nth_gen, measurement)

    def cleanup(self):
        if os.path.exists(self.tempdir):
            logging.info('Removing temporary directory %s' % self.tempdir)
            shutil.rmtree(self.tempdir)

    def disk_usage(self, pathname):
        '''Simulate du(1). Return disk usage in bytes.'''
        def getsize(filename):
            return os.lstat(filename).st_blocks * 512
        bytes = 0
        for dirname, subdirs, basenames in os.walk(pathname):
            bytes += getsize(dirname)
            bytes += sum(getsize(os.path.join(dirname, x)) for x in basenames)
        return bytes


# Script entry point: construct the cliapp application and run it.
if __name__ == '__main__':
    Seivot(version=__version__).run()

