#!/usr/bin/perl
# =======================================================================================
#
#      Filename:  likwid-mpirun
#
#      Description:  Wrapper application to mpi startup mechanisms. Builds on
#                    likwid to control affinity and has integrated perfctr support.
#
#      Version:   <VERSION>
#      Released:  <DATE>
#
#      Author:  Jan Treibig (jt), jan.treibig@gmail.com
#      Project:  likwid
#
#      Copyright (C) 2014 Jan Treibig
#
#      This program is free software: you can redistribute it and/or modify it under
#      the terms of the GNU General Public License as published by the Free Software
#      Foundation, either version 3 of the License, or (at your option) any later
#      version.
#
#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
#      You should have received a copy of the GNU General Public License along with
#      this program.  If not, see <http://www.gnu.org/licenses/>.
#
# =======================================================================================

use Getopt::Long;
##############################
#       CONFIGURATION        #
##############################
my $LIKWIDPIN  = '<PREFIX>/bin/likwid-pin';
my $LIKWIDPERF = '<PREFIX>/bin/likwid-perfctr';
my $MPIROOT_openmpi  =  $ENV{'MPIHOME'};
my $MPIROOT_intelmpi =  $ENV{'MPIHOME'};
my $MPIEXEC_openmpi  = "$MPIROOT_openmpi/bin/mpiexec";
my $MPIEXEC_intelmpi = "$MPIROOT_intelmpi/bin/mpiexec";
my $MPIEXEC_mvapich2 = "mpirun";
##############################

my $OMPType = '';
my $MPIType = '';
my $WrapperScript = "mpiexec.$$";
my %Domains;
my $NP = 0;
my $PPN = 0;
my $NperNode = 0;
my %NodeList;
my $NumberOfNodes = 0;
my $NumberOfUsedNodes = 0;
my $Hostfilename = 0;
my $Hostfile = '';
my $PerformanceGroup = '';
my $LikwidCall = "$LIKWIDPIN -c ";
my $debug = 0;
my $marker = '';

sub readHostfile
{
    open FILE, "<$Hostfilename";

    while (<FILE>) {
        chomp;
        if (not exists $NodeList{$host}) {
            $NodeList{$_} = 1;
        }
    }
    close FILE;

    $NumberOfNodes = keys %NodeList;
}

# MPI implementations
# OpenMPI  #<# 
sub generateNodelist_openmpi
{
    open FILE, ">$Hostfilename-openmpi";

    #FIXME  Order may be different
    foreach my $node (keys %NodeList) {
        print FILE "$node slots=$PPN\n"
    }

    close FILE;

    $Hostfile = "-hostfile $Hostfilename-openmpi";
}

sub setEnvironment_openmpi
{
}

sub executeMPI_openmpi
{
    if ($debug) {
        print "$MPIEXEC_openmpi $Hostfile -np $NP -npernode $NperNode ./$WrapperScript";
    }

    system ("$MPIEXEC_openmpi $Hostfile -np $NP -npernode $NperNode ./$WrapperScript");
}
#>#

# mvapich2  #<# 
sub generateNodelist_mvapich2
{
}

sub setEnvironment_mvapich2
{
    $ENV{'MV2_ENABLE_AFFINITY'}='0';
}

#tw
#mvapich2: pinning aus
# Hybrid programming options:
#    -ranks-per-proc                  assign so many ranks to each process
#
#  Processor topology options:
#    -binding                         process-to-core binding mode
#    -topolib                         processor topology library ( hwloc plpa)

sub executeMPI_mvapich2
{    
    if ($debug) {
        print "$MPIEXEC_mvapich2 $Hostfile -np $NP -npernode $NperNode ./$WrapperScript";
    }

    system ("$MPIEXEC_mvapich2 $Hostfile -np $NP -ppn $NperNode ./$WrapperScript");

}

#generate wrapper script
#mpirank
#mpitype = mvapich

#>#

# Intel MPI  #<# 
sub generateNodelist_intelmpi
{
    open FILE, ">$Hostfilename-intelmpi";

    #FIXME  Order may be different
    foreach my $node (keys %NodeList) {
        print FILE "$node\:$NperNode\n"
    }

    close FILE;

    $Hostfile = "-f $Hostfilename-intelmpi";
}

sub setEnvironment_intelmpi
{
    $ENV{'I_MPI_PIN'}='off';
    $ENV{'KMP_AFFINITY'}='disabled';
}

sub executeMPI_intelmpi
{
    if ($debug) {
        print "$MPIROOT_intelmpi/bin/mpdboot -r ssh -n $NumberOfNodes $Hostfile \n";
        print "$MPIROOT_intelmpi/bin/mpiexec -np $NP $WrapperScript \n";
        print "$MPIROOT_intelmpi/bin/mpdallexit \n";
    }

    system ("$MPIROOT_intelmpi/bin/mpdboot -r ssh -n $NumberOfNodes $Hostfile ");
    system ("$MPIROOT_intelmpi/bin/mpiexec  -perhost $NperNode -np $NP ./$WrapperScript");
    system ("$MPIROOT_intelmpi/bin/mpdallexit");
}
#>#

sub generateHostlist  #<# 
{
    $ppnHost = '';
    open FILE, "<$ENV{'PBS_NODEFILE'}";
    my @hostArray = <FILE>;
    close FILE;

    $ppnhost = $hostArray[0];
    chomp $ppnhost;

    # generate unique host list
    foreach my $host (@hostArray) {
        chomp $host;
        if ($ppnhost eq $host) {
            $PPN++;
        }
        if (not exists $NodeList{$host}) {
            $NodeList{$host} = 1;
        }
    }

    $NumberOfNodes = keys %NodeList;
}
#>#

sub usage  #<# 
{
    print <<END;
usage: $0 -np <NUMPROC>

Required:
-np <NUMPROC> : number of MPI processes

Optional:
-h                     : this (help) message
-d                     : debug run
-hostfile <argument>   : Specify nodes if not in in a scheduler
-nperdomain <argument> : Run specified number of processes per domain.
                         Supported domains are:
                         N Node
                         S Socket
                         C last level cache group
                         M NUMA domain
-pin <argument>        : Specify pinning for hybrid execution.
                         Processes are separated by underscore.
                         The threaded pinning must be a valid likwid-pin list.
-omp <argument>        : Enables support for specific hybrid setup. Use only 
                         together with -pin option. Currently recognized values: intel
-mpi <argument>        : Specify which mpi implementation should be used. Current recognized 
                         values: intelmpi, openmpi, mvapich2
--                     : Stop the likwid-mpirun parser. Useful for saving options to
                         the MPI application.

You can either use -nperdomain OR -pin for specifying pinning.
For pure MPI pinning use only the nperdomain option. For hybrid use the pin option.

Example: 
$0 -np 32 ./a.out

$0 will use as many processes per node as available in ppn 

Example with pinning:
$0 -np 32 -nperdomain S:2 ./a.out
starts 2 processes per socket.

Example for hybrid run:
$0 -np 32 -pin M0:0-3_M1:0-3
starts 2 processes per node. Threads of first process are pinned to first four
cores in NUMA domain 0. Threads of second process are pinned to first four cores 
in NUMA domain 1.
END

exit(0);
}
#>#

sub generateDomains  #<# 
{
    my $output = `$LIKWIDPIN -p`;

    foreach my $line (split("\n",$output)) {
        if ($line =~ /Tag ([NSCM])[0-9]*: ([0-9 ]+)/) {
            if (exists $Domains{$1}) {
                $Domains{$1}++;
            } else {
                $Domains{$1} = 1;
            }

            if ($1 eq 'N') {
                $PPN =  split(/ /,$2);
            }
        }
    }
}
#>#

sub generateWrapperScript  #<# 
{
    my $pinStrings = shift;
    my $mpiType = shift;
    open FILE, ">$WrapperScript";
    my $environment = '';
    my $doRest = '';

    if ($mpiType eq 'openmpi') {
        $environment = 'OMPI_COMM_WORLD_RANK';
    } elsif ($mpiType eq 'intelmpi') {
        $environment = 'PMI_RANK';
    } elsif ($mpiType eq 'mvapich2') {
        $environment = 'PMI_RANK'; #tw maybe????
    } 

    if ($NP % $NperNode) {
        my $rest = $NP-($NP % $NperNode);
        $doRest = "if (\$myRank >= $rest) {\$localId = \$myRank - $rest;}\n";
    }

    print FILE <<END;
#!/usr/bin/perl 
use strict;
use warnings;

my \$args = join \@ARGV;
my \$myRank = \$ENV{$environment};

my \$localId = \$myRank \% $NperNode  ;

$doRest

if (\$localId == 0) {
    system ("$LikwidCall $pinStrings->[0] $PerformanceGroup $OMPType  $cmdline \$args ");
} 
END

    foreach my $process ( 1 .. ($NperNode-1) ) {
    print FILE <<END;
elsif (\$localId == $process) {
    system ("$LikwidCall $pinStrings->[$process] $PerformanceGroup $OMPType  $cmdline \$args ");
} 
END
    }

    close FILE;
}
#>#

my $pinString = '';
my $domain = '';
my @pinStrings;

GetOptions ('np=i'         => \$NP,
            'nperdomain=s' => \$NperDomain,
            'hostfile=s'   => \$Hostfilename,
            'pin=s'        => \$pinString,
            'mpi=s'        => \$MPIType,
            'omp=s'        => \$OMPType,
            'perf=s'       => \$PerformanceGroup,
            'debug'        => \$debug,
            'marker'       => sub { $marker = ' -m '; },
            'help'         => \&usage);

# MPI implementation switch
$generateNodelist = "generateNodelist_$MPIType";
$setEnvironment = "setEnvironment_$MPIType";
$executeMPI = "executeMPI_$MPIType";

generateDomains();

# check for PBS batch system
if (not defined ($ENV{'PBS_JOBID'})) {
    readHostfile();
} else {
    $NumberOfNodes = `uniq \$PBS_NODEFILE | wc -l`;
}

if ($pinString) {
    @pinStrings = split('_',$pinString);
    $NperNode = ($#pinStrings+1);

    if ($MPIType eq 'openmpi') {
        if ($OMPType eq 'intel') {
            $OMPType = '';
            $OMPType = '-s 0xF';
        }
    } elsif ($MPIType eq 'intelmpi') {
        if ($OMPType eq 'intel' and ($NumberOfNodes == 1)) {
            $OMPType = '-t intel';
        } elsif ($OMPType eq 'intel') {
            $OMPType = '-s 0x7';
        }
    }elsif ($MPIType eq 'mvapich2') {
        if ($OMPType eq 'intel' and ($NumberOfNodes == 1)) {
            $OMPType = '-t intel';
        } elsif ($OMPType eq 'intel') {
            $OMPType = '-s 0x7';
        }
    }

} elsif ($NperDomain) {

    $OMPType = '';
    if ($NperDomain =~ /([NSCM]):([0-9]+)/) {
        $domain = $1;
        $NperDomain = $2;
    } else {
        die "Parse Error \n";
    }

    $NperNode = $NperDomain * $Domains{$domain};

    if (not $domain eq 'N') {
        foreach my $currentDomain ( 0 .. ($Domains{$domain}-1)) {
            foreach my $currentProcess ( 0 .. ($NperDomain-1)) {
                push @pinStrings, "$domain"."$currentDomain".":$currentProcess";
            }
        }
    } else {
        foreach my $currentProcess ( 0 .. ($NperDomain-1)) {
            push @pinStrings, "$domain".":$currentProcess";
        }
    }
} elsif ($NP) {
    print "PPN = $PPN\n";
    $NperNode = $PPN;
    $OMPType = '';

    foreach my $currentProcess ( 0 .. ($PPN-1)) {
        push @pinStrings, "N".":$currentProcess";
    }
} else {
    usage();
}

if (not defined ($ENV{'PBS_JOBID'})) {
    $Hostfilename .= $$;
    &{$generateNodelist}();
} else {
    if ($MPIType eq 'intelmpi') {
        $Hostfilename = "pbshosts$$";
        generateHostlist();
        &{$generateNodelist}();
    }
}

map {$cmdline .= "$_ " ;}  @ARGV;
$NumberOfUsedNodes = $NP / $NperNode;

if ($NumberOfUsedNodes > $NumberOfNodes) {
    die "ERROR: Require $NumberOfUsedNodes nodes, but only $NumberOfNodes available!";
}

if ($NumberOfUsedNodes < 1) {
    die "ERROR: Requested $NperNode processes per Node with only $NP total processes!";
}

if ($PerformanceGroup) {
    $LikwidCall = "$LIKWIDPERF -C";
    $PerformanceGroup  = ' -g '.$PerformanceGroup ;
    $PerformanceGroup .= " $marker -o perf_%h_%r.txt ";
} else {
    $PerformanceGroup  = ' -q ';
}

generateWrapperScript(\@pinStrings,$MPIType);
chmod 0755,$WrapperScript;
&{$setEnvironment}();

if ($debug) {
    print  "Number of nodes: $NumberOfNodes \n";
    $NumberOfUsedNodes = $NP / $NperNode;
    print  "Number of used nodes: $NumberOfUsedNodes \n";
    print  "Number of processes per node: $NperNode \n";
}
&{$executeMPI}();

if (-e $WrapperScript and not $debug) {
    unlink ($WrapperScript);
    unlink ($Hostfilename);
}

# vim: foldmethod=marker foldmarker=#<#,#>#
