mirror of
https://github.com/tencentmusic/cube-studio.git
synced 2024-12-15 06:09:57 +08:00
240 lines
8.4 KiB
Perl
240 lines
8.4 KiB
Perl
#!/usr/bin/env perl
|
||
use warnings; #sed replacement for -w perl parameter
|
||
|
||
use Cwd;
|
||
use File::Basename;
|
||
use Env qw(KFJ_RUN_ID);
|
||
use Fcntl qw(:flock);
|
||
|
||
# This program is like run.pl except rather than just running on a local
|
||
# machine, it can be configured to run on remote machines via ssh.
|
||
# It requires that you have set up passwordless access to those machines,
|
||
# and that Kaldi is running from a location that is accessible via the
|
||
# same path on those machines (presumably via an NFS mount).
|
||
#
|
||
# It looks for a file run_$KFJ_RUN_ID/machines (KFJ_RUN_ID is read from the
# environment) that should have, on each line, the name
|
||
# of a machine that you can ssh to (which may include this machine). It doesn't
|
||
# have to be a fully qualified name.
|
||
#
|
||
# Later we may extend this so that on each line of the machines file you
|
||
# can specify various resources that each machine has, such as how
|
||
# many slots and how much memory, and make it wait if machines are
|
||
# busy. But for now it simply ssh's to a machine from those in the list.
|
||
|
||
# The command-line interface of this program is the same as run.pl;
|
||
# see run.pl for more information about the usage.
|
||
|
||
|
||
# At least a log file and one command word are required.
die "usage: k8s.pl log-file command-line arguments..." if @ARGV < 2;
# print ("-----------------------\n");
# print ("ARGV: @ARGV\n");

# With no JOB=a:b specification exactly one job, numbered 1, is run.
($jobstart, $jobend) = (1, 1);

# Accumulates any qsub-style options found on the command line.  They are
# not handed to a scheduler; the string is only inspected later on (for the
# substring "gpu").
$qsub_opts = "";
|
||
|
||
# Parse the leading command-line arguments:
#   * any options that would normally be given to qsub (e.g. "-V",
#     "-sync y", "-pe smp 5", "-l gpu=1"); they are only recorded in
#     $qsub_opts, and
#   * an optional job specification NAME=start:end (e.g. JOB=1:10) or
#     NAME=n (e.g. JOB=1), which sets $jobname, $jobstart and $jobend.
# Everything that is consumed here is removed from @ARGV.
if (@ARGV > 0) {
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options
    # that would normally go to qsub.
    $switch = shift @ARGV;
    if ($switch eq "-V") {
      $qsub_opts .= "-V ";
    } else {
      $option = shift @ARGV;
      if ($switch eq "-sync" && $option =~ m/^[yY]/) {
        $qsub_opts .= "-sync "; # Note: in the
        # corresponding code in queue.pl it says instead, just "$sync = 1;".
      }
      $qsub_opts .= "$switch $option ";
      if ($switch eq "-pe") { # e.g. -pe smp 5
        $option2 = shift @ARGV;
        # The slot count may be missing if the command line was truncated.
        if (defined $option2) {
          $qsub_opts .= "$option2 ";
        }
      }
    }
  }

  # Remember the candidate job specification before it is shifted off @ARGV,
  # so that the diagnostics below report the argument that was actually
  # parsed.  The option loop above may have consumed every argument.
  my $jobspec = (@ARGV > 0) ? $ARGV[0] : "";

  if ($jobspec =~ m/^(\w+)=(\d+):(\d+)$/) { # e.g. JOB=1:10
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    shift @ARGV;
    if ($jobstart > $jobend) {
      die "k8s.pl: invalid job range $jobspec";
    }
    if ($jobstart <= 0) {
      die "k8s.pl: invalid job range $jobspec, start must be strictly positive (this is required for GridEngine compatibility)";
    }
  } elsif ($jobspec =~ m/^(\w+)=(\d+)$/) { # e.g. JOB=1.
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift @ARGV;
  } elsif ($jobspec =~ m/.+\=.*\:.*$/) {
    # Looks like a job range but did not parse as one; warn and carry on.
    print STDERR "Warning: suspicious first argument to k8s.pl: $jobspec\n";
  }
}
|
||
|
||
#if ($qsub_opts ne "") {
|
||
# print STDERR "Warning: k8s.pl ignoring options \"$qsub_opts\"\n";
|
||
#}
|
||
|
||
{ # Read the list of machines from run_$KFJ_RUN_ID/machines into @machines.
  # Each non-empty line must hold exactly one machine name.
  my $machines_file = "run_$ENV{KFJ_RUN_ID}/machines";
  my $machines_fh;
  if (!open($machines_fh, "<", $machines_file)) {
    print STDERR "k8s.pl: expected the file $machines_file to exist.\n";
    exit(1);
  }
  @machines = ();
  while (<$machines_fh>) {
    # chomp only strips a trailing newline, so the last name survives intact
    # even when the file does not end with a newline.
    chomp;
    if ($_ ne "") {
      @A = split;
      if (@A != 1) {
        die "k8s.pl: bad line '$_' in $machines_file.";
      }
      # The name is later interpolated into a lock-file path and used as the
      # ssh destination, so the whole name must consist of safe characters.
      if ($A[0] !~ m/\A[a-z0-9.\-]+\z/) {
        die "k8s.pl: invalid machine name '$A[0]'";
      }
      push @machines, $A[0];
    }
  }
  close($machines_fh);
  if (@machines == 0) { die "k8s.pl: no machines listed in $machines_file"; }
}
|
||
# print ("machines: @machines\n");
|
||
|
||
# The next argument is the log file (it may contain the job name, which is
# replaced by the job index for each job).
$logfile = shift @ARGV;

# A range of several jobs needs one log per job, so the log file name has to
# mention the job name.
if (defined $jobname) {
  my $log_lacks_jobname = ($logfile !~ m/$jobname/);
  if ($log_lacks_jobname && $jobend > $jobstart) {
    print STDERR "k8s.pl: you are trying to run a parallel job but "
      . "you are putting the output into just one log file ($logfile)\n";
    exit(1);
  }
}
|
||
|
||
# $offset is added to the job index when a machine is picked.  It is the sum
# of all purely numeric dot-separated components of the log file's base name,
# e.g. 9 for foo.9.log.  The point is that scripts which submit jobs one by
# one (with logs foo.1.log, foo.2.log, ...) should not end up sending every
# job to the first machine.
$offset = 0;
for my $piece (split(/\./, basename($logfile))) {
  $offset += $piece if $piece =~ m/^\d+$/;
}
|
||
# print ("offset: $offset\n");
|
||
|
||
# Re-assemble the remaining arguments into one shell command string.  Each
# argument is followed by a single space:
#   - arguments made only of non-whitespace characters are copied verbatim;
#   - other arguments that contain a double quote are wrapped in single quotes;
#   - everything else is wrapped in double quotes.
$cmd = join("", map {
    $_ =~ m/^\S+$/ ? $_ . " "
  : $_ =~ m:\":    ? "'$_' "
  :                  "\"$_\" "
} @ARGV);
|
||
|
||
|
||
# Start one child process per job index.  Each child picks a machine, opens
# "ssh <machine> bash", feeds it a small bash script that runs the command
# with its output redirected to the log file, and then exits with status 0
# if ssh reported success and 1 otherwise.  The parent only forks here; the
# children are reaped further down.
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $childpid = fork();
  if (!defined $childpid) { die "Error forking in k8s.pl (writing to $logfile)"; }
  if ($childpid == 0) {
    # We're in the child... this branch executes the job and returns (possibly
    # with an error status).
    if (defined $jobname) {
      # Substitute the job index for the job name in command and log file.
      $cmd =~ s/$jobname/$jobid/g;
      $logfile =~ s/$jobname/$jobid/g;
    }
    { # work out the machine to ssh to.
      $local_offset = $offset + $jobid - 1; # subtract 1 since jobs never start
                                            # from 0; we'd like the first job
                                            # to normally run on the first
                                            # machine.
      # print ("local_offset: $local_offset\n");
      $num_machines = scalar @machines;
      # in the next line, the "+ $num_machines" is in case $local_offset is
      # negative, to ensure the modulus is calculated in the mathematical way, not
      # in the C way where (negative number % positive number) is negative.
      $machines_index = ($local_offset + $num_machines) % $num_machines;
      # print ("machines_index: $machines_index\n");
      $machine = $machines[$machines_index];
    }

    # Lock logic to keep several k8s.pl jobs from being started at the same
    # moment, which would lead to GPU memory OOM.  Only used when the
    # qsub-style options mention "gpu".  The per-machine lock is held for one
    # second and then released, i.e. it staggers job start-up; it is not held
    # while the job runs.
    if ($qsub_opts =~ /gpu/) {
      my $lockfile = "run_$ENV{KFJ_RUN_ID}/$machine.lock";
      my $lock_fh;
      if (!open($lock_fh, '>', $lockfile)) {
        # Without a valid handle flock() can never succeed and the wait loop
        # below would spin forever, so give up on this job instead.
        print STDERR "k8s.pl: cannot open lock file $lockfile: $!\n";
        exit(1);
      }
      until (flock($lock_fh, LOCK_EX | LOCK_NB)) {
        # print "wait...\n";
        sleep 10;
      }
      # print "got\n";
      sleep 1;
      flock($lock_fh, LOCK_UN);
      close($lock_fh);
    }

    # List-form pipe open: ssh is executed directly, without an intermediate
    # shell interpreting the machine name.
    my $ssh;
    if (!open($ssh, "|-", "ssh", $machine, "bash")) {
      print STDERR "k8s.pl failed to ssh to $machine";
      exit(1); # exits from the forked process within k8s.pl.
    }
    $cwd = getcwd();
    $logdir = dirname($logfile);
    # Below, we're printing into ssh which has opened a bash session; these are
    # bash commands.
    print $ssh "set -e\n"; # if any of the later commands fails, we want it to exit.
    print $ssh "cd $cwd\n";
    print $ssh ". ./path.sh\n";
    print $ssh "mkdir -p $logdir\n";
    print $ssh "time1=\`date +\"%s\"\`\n";
    # Write a header (host, start time, the command itself) to a fresh log.
    print $ssh "( echo '#' Running on \`hostname\`\n";
    print $ssh " echo '#' Started at \`date\`\n";
    print $ssh " echo -n '# '; cat <<EOF\n";
    print $ssh "$cmd\n";
    print $ssh "EOF\n";
    print $ssh ") >$logfile\n";
    print $ssh "set +e\n"; # we don't want bash to exit if the next line fails.
    # The command itself runs with "exit on error" switched off, so that its
    # status can be recorded in \$ret and the log footer still gets written.
    print $ssh " ( $cmd ) 2>>$logfile >>$logfile\n";
    print $ssh "ret=\$?\n";
    print $ssh "set -e\n"; # back into mode where it will exit on error.
    print $ssh "time2=\`date +\"%s\"\`\n";
    print $ssh "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=1 >>$logfile\n";
    print $ssh "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
    print $ssh "exit \$ret"; # return with the status the command exited with.
    $ret = close($ssh);
    $ssh_return_status = $?;
    # see http://perldoc.perl.org/functions/close.html for explanation of return
    # status of close() and the variables it sets.
    if (! $ret && $! != 0) { die "k8s.pl: unexpected problem ssh'ing to machine $machine"; }
    if ($ssh_return_status != 0) { exit(1); } # exit with error status from this forked process.
    else { exit(0); } # else exit with non-error status.
  }
}
|
||
|
||
# Reap one child per job that was started, count the failures, print a
# summary on stderr if anything failed, and exit with status 1 if at least
# one job failed (0 otherwise).
$ret = 0;
$numfail = 0;
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $r = wait();
  if ($r == -1) { die "Error waiting for child process"; } # should never happen.
  if ($? != 0) { $numfail++; $ret = 1; } # The child process failed.
}

if ($ret != 0) {
  $njobs = $jobend - $jobstart + 1;
  if ($njobs == 1) {
    if (defined $jobname) {
      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
                                         # that job.
    }
    print STDERR "k8s.pl: job failed, log is in $logfile\n";
    if ($logfile =~ m/JOB/) {
      # A literal "JOB" left in the log name suggests the job range argument
      # was never given.
      print STDERR "k8s.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
    }
  }
  else {
    # Several jobs: show the log name as a glob pattern covering all of them.
    $logfile =~ s/$jobname/*/g;
    print STDERR "k8s.pl: $numfail / $njobs failed, log is in $logfile\n";
  }
}


exit ($ret);
|