# lconf 119.26 KiB  -- stray file-listing artifact, commented out so the file parses
#!/usr/bin/env python
#
# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
# Mike Shaver <shaver@clusterfs.com>
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom
if sys.version[0] == '1':
from FCNTL import F_GETFL, F_SETFL
else:
from fcntl import F_GETFL, F_SETFL
PYMOD_DIR = "/usr/lib/lustre/python"
def development_mode():
    """Return 1 when lconf runs from a build tree (a Makefile sits next
    to the executable), 0 when running from an installed location."""
    makefile = os.path.dirname(sys.argv[0]) + "/Makefile"
    if not os.access(makefile, os.R_OK):
        return 0
    return 1
# In a development tree prefer the in-tree python modules over the
# installed copies under PYMOD_DIR.
if development_mode():
    sys.path.append('../utils')
else:
    sys.path.append(PYMOD_DIR)

import Lustre
# Global parameters
MAXTCPBUF = 16777216        # upper bound for TCP buffer tuning (bytes)
DEFAULT_TCPBUF = 8388608    # default TCP buffer size (bytes)
DEFAULT_PORT = 988          # default acceptor port
#
# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = '../portals'  # default in-tree portals location
# Needed to call lconf --record
CONFIG_FILE = ""
# Please keep these in sync with the values in portals/kp30.h
# Map of --ptldebug flag names to their kernel debug-mask bits.
# Please keep these in sync with the values in portals/kp30.h
ptldebug_names = {
    "trace"    : (1 << 0),
    "inode"    : (1 << 1),
    "super"    : (1 << 2),
    "ext2"     : (1 << 3),
    "malloc"   : (1 << 4),
    "cache"    : (1 << 5),
    "info"     : (1 << 6),
    "ioctl"    : (1 << 7),
    "blocks"   : (1 << 8),
    "net"      : (1 << 9),
    "warning"  : (1 << 10),
    "buffs"    : (1 << 11),
    "other"    : (1 << 12),
    "dentry"   : (1 << 13),
    "portals"  : (1 << 14),
    "page"     : (1 << 15),
    "dlmtrace" : (1 << 16),
    "error"    : (1 << 17),
    "emerg"    : (1 << 18),
    "ha"       : (1 << 19),
    "rpctrace" : (1 << 20),
    "vfstrace" : (1 << 21),
    "reada"    : (1 << 22),
    "mmap"     : (1 << 23),
    "config"   : (1 << 24),
    }
# Map of --subsystem flag names to their kernel subsystem-mask bits.
# Keep in sync with portals/kp30.h as well.
subsystem_names = {
    "undefined" : (1 << 0),
    "mdc"       : (1 << 1),
    "mds"       : (1 << 2),
    "osc"       : (1 << 3),
    "ost"       : (1 << 4),
    "class"     : (1 << 5),
    "log"       : (1 << 6),
    "llite"     : (1 << 7),
    "rpc"       : (1 << 8),
    "mgmt"      : (1 << 9),
    "portals"   : (1 << 10),
    "nal"       : (1 << 11),
    "pinger"    : (1 << 12),
    "filter"    : (1 << 13),
    "ptlbd"     : (1 << 14),
    "echo"      : (1 << 15),
    "ldlm"      : (1 << 16),
    "lov"       : (1 << 17),
    "ptlrouter" : (1 << 18),
    "cobd"      : (1 << 19),
    "sm"        : (1 << 20),
    "asobd"     : (1 << 21),
    "confobd"   : (1 << 22),
    "lmv"       : (1 << 23),
    "cmobd"     : (1 << 24),
    }
# rc of the first cleanup step that failed; 0 while everything succeeds
first_cleanup_error = 0

def cleanup_error(rc):
    """Remember the first nonzero cleanup failure code; ignore later ones."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
# ============================================================
# debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with a standard 'not implemented yet' error.

    Uses the call form of raise for consistency with panic(); the old
    'raise E, msg' form is Python-2-only syntax.
    """
    raise Lustre.LconfError(msg + ' not implemented yet.')
def panic(*args):
msg = string.join(map(str,args))
if not config.noexec:
raise Lustre.LconfError(msg)
else:
print "! " + msg
def log(*args):
msg = string.join(map(str,args))
print msg
def logall(msgs):
for s in msgs:
print string.strip(s)
def debug(*args):
if config.verbose:
msg = string.join(map(str,args))
print msg
# int() with an explicit base of 16 understands the '0x123' syntax that
# plain int() rejects, so there is no need to eval() external input.
def my_int(s):
    """Parse a decimal or 0x-prefixed hexadecimal string.

    Raises ValueError("not a number") for anything unparsable, matching
    the contract callers (e.g. the elan nid code) rely on.
    """
    try:
        if s[0:2] == '0x':
            return int(s, 16)
        else:
            return int(s)
    except ValueError:
        raise ValueError("not a number")
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
def __init__(self, cmd_name, cmd_err, rc=None):
self.cmd_name = cmd_name
self.cmd_err = cmd_err
self.rc = rc
def dump(self):
import types
if type(self.cmd_err) == types.StringType:
if self.rc:
print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
else:
print "! %s: %s" % (self.cmd_name, self.cmd_err)
elif type(self.cmd_err) == types.ListType:
if self.rc:
print "! %s (error %d):" % (self.cmd_name, self.rc)
else:
print "! %s:" % (self.cmd_name)
for s in self.cmd_err:
print "> %s" %(string.strip(s))
else:
print self.cmd_err
# ============================================================
# handle daemons, like the acceptor
class DaemonHandler:
    """ Manage starting and stopping a daemon. Assumes daemon manages
    it's own pid file. """
    def __init__(self, cmd):
        # cmd: executable name; subclasses must supply pidfile() and
        # command_line()
        self.command = cmd
        self.path =""
    def start(self):
        """Locate the daemon binary and launch it (warns if already running)."""
        if self.running():
            log(self.command, "already running.")
        if not self.path:
            self.path = find_prog(self.command)
            if not self.path:
                panic(self.command, "not found.")
        ret, out = runcmd(self.path +' '+ self.command_line())
        if ret:
            raise CommandError(self.path, out, ret)
    def stop(self):
        """SIGTERM the pid from the pidfile, if the daemon is running."""
        if self.running():
            pid = self.read_pidfile()
            try:
                if pid != 1:
                    log ("killing process", pid)
                    os.kill(pid, 15)
                else:
                    # pid 1 is the sentinel read_pidfile() returns for an
                    # empty pidfile -- never signal init
                    log("was unable to find pid of " + self.command)
                #time.sleep(1) # let daemon die
            except OSError, e:
                log("unable to kill", self.command, e)
            if self.running():
                log("unable to kill", self.command)
    def running(self):
        """Return 1 if the pidfile names a live process (probed with signal 0)."""
        pid = self.read_pidfile()
        if pid:
            try:
                if pid != 1:
                    os.kill(pid, 0)
                else:
                    log("was unable to find pid of " + self.command)
            except OSError:
                # process is gone; drop the stale pidfile
                self.clean_pidfile()
            else:
                return 1
        return 0
    def read_pidfile(self):
        """Return the pid from the pidfile; 1 when the file is empty
        (sentinel meaning 'unknown pid'), 0 when it cannot be read."""
        try:
            fp = open(self.pidfile(), 'r')
            val = fp.read()
            if val == '':
                val = '1'
            pid = int(val)
            fp.close()
            return pid
        except IOError:
            return 0
    def clean_pidfile(self):
        """ Remove a stale pidfile """
        log("removing stale pidfile:", self.pidfile())
        try:
            os.unlink(self.pidfile())
        except OSError, e:
            log(self.pidfile(), e)
class AcceptorHandler(DaemonHandler):
    """Acceptor daemon bound to a single TCP port."""
    def __init__(self, port, net_type):
        DaemonHandler.__init__(self, "acceptor")
        self.port = port
        self.flags = ''

    def pidfile(self):
        """One pidfile per port so several acceptors can coexist."""
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        return "%s %s" % (self.flags, self.port)
# port -> AcceptorHandler registry
acceptors = {}

# start the acceptors
def run_acceptors():
    """Start every registered acceptor that is not already running.
    No-op when only dumping or recording commands."""
    if config.lctl_dump or config.record:
        return
    for port, daemon in acceptors.items():
        if not daemon.running():
            daemon.start()
def run_one_acceptor(port):
    """Start the acceptor registered for port; panic if none is registered.
    No-op when only dumping or recording commands."""
    if config.lctl_dump or config.record:
        return
    daemon = acceptors.get(port)
    if daemon is None:
        panic("run_one_acceptor: No acceptor defined for port:", port)
    elif not daemon.running():
        daemon.start()
def stop_acceptor(port):
    """Stop the acceptor for port if one is registered and running."""
    daemon = acceptors.get(port)
    if daemon and daemon.running():
        daemon.stop()
# ============================================================
# handle lctl interface
class LCTLInterface:
    """
    Manage communication with lctl
    """
    def __init__(self, cmd):
        """
        Initialize close by finding the lctl binary.
        """
        self.lctl = find_prog(cmd)
        self.save_file = ''
        self.record_device = ''
        if not self.lctl:
            if config.noexec:
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")

    def use_save_file(self, file):
        # All subsequent commands are dumped to this file instead of
        # being executed (see the 'dump' prefix in run()).
        self.save_file = file

    def record(self, dev_name, logname):
        """Begin recording subsequent commands into config log logname
        on device dev_name."""
        log("Recording log", logname, "on", dev_name)
        self.record_device = dev_name
        self.record_log = logname

    def end_record(self):
        """Stop recording; later commands execute normally."""
        log("End recording log", self.record_log, "on", self.record_device)
        self.record_device = None
        self.record_log = None

    def set_nonblock(self, fd):
        """Switch fd to non-blocking so the read loop in run() never stalls."""
        fl = fcntl.fcntl(fd, F_GETFL)
        fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)

    def run(self, cmds):
        """
        run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        stderr is checked
        should modify command line to accept multiple commands, or
        create complex command line options
        """
        cmd_line = self.lctl
        if self.save_file:
            # dump mode: prepend a 'dump' directive so lctl writes the
            # commands to the save file instead of executing them
            cmds = '\n  dump ' + self.save_file + '\n' + cmds
        elif self.record_device:
            # record mode: wrap the commands in device/record directives
            cmds = """
    device $%s
    record %s
    %s""" % (self.record_device, self.record_log, cmds)
        debug("+", cmd_line, cmds)
        if config.noexec: return (0, [])
        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
        child.tochild.write(cmds + "\n")
        child.tochild.close()
#       print "LCTL:", cmds
        # Drain stdout and stderr concurrently with select() to avoid a
        # deadlock if either pipe fills up.
        # From "Python Cookbook" from O'Reilly
        outfile = child.fromchild
        outfd = outfile.fileno()
        self.set_nonblock(outfd)
        errfile = child.childerr
        errfd = errfile.fileno()
        self.set_nonblock(errfd)
        outdata = errdata = ''
        outeof = erreof = 0
        while 1:
            ready = select.select([outfd,errfd],[],[]) # Wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '': outeof = 1
                outdata = outdata + outchunk
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '': erreof = 1
                errdata = errdata + errchunk
            if outeof and erreof: break
        # end of "borrowed" code
        ret = child.wait()
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
        else:
            rc = 0
        # any stderr output counts as failure even when rc is 0
        if rc or len(errdata):
            raise CommandError(self.lctl, errdata, rc)
        return rc, outdata

    def runcmd(self, *args):
        """
        run lctl using the command line
        """
        cmd = string.join(map(str,args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        if rc:
            raise CommandError(self.lctl, out, rc)
        return rc, out

    def clear_log(self, dev, log):
        """ clear an existing log """
        cmds =  """
  device $%s
  probe
  clear_log %s
  quit """ % (dev, log)
        self.run(cmds)

    def root_squash(self, name, uuid, nid):
        """Set root squash (uid, nid) on the named device."""
        cmds = """
  device $%s
  root_squash %s %s
  quit""" % (name, uuid, nid)
        self.run(cmds)

    def network(self, net, nid):
        """ set mynid """
        cmds =  """
  network %s
  mynid %s
  quit """ % (net, nid)
        self.run(cmds)

    # add an interface
    def add_interface(self, net, ip, netmask = ""):
        """ add an interface """
        cmds = """
  network %s
  add_interface %s %s
  quit """ % (net, ip, netmask)
        self.run(cmds)

    # delete an interface
    def del_interface(self, net, ip):
        """ delete an interface """
        cmds = """
  network %s
  del_interface %s
  quit """ % (net, ip)
        self.run(cmds)

    # create a new connection
    def add_uuid(self, net_type, uuid, nid):
        """Register a nid under the given uuid."""
        cmds = "\n  add_uuid %s %s %s" %(uuid, nid, net_type)
        self.run(cmds)

    def add_peer(self, net_type, nid, hostaddr, port):
        """Register a peer; tcp/ra need host+port, openib/iib only the nid."""
        if net_type  in ('tcp','ra') and not config.lctl_dump:
            cmds =  """
  network %s
  add_peer %s %s %d
  quit""" % (net_type,
             nid, hostaddr, port )
            self.run(cmds)
        elif net_type in ('openib','iib',) and not config.lctl_dump:
            cmds =  """
  network %s
  add_peer %s
  quit""" % (net_type,
             nid )
            self.run(cmds)

    def connect(self, srv):
        """Add uuid and (when applicable) peer entry for server srv."""
        self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
        if srv.net_type  in ('tcp','openib','iib','ra') and not config.lctl_dump:
            if srv.hostaddr[0]:
                hostaddr = string.split(srv.hostaddr[0], '/')[0]
            self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)

    # Recover a device
    def recover(self, dev_name, new_conn):
        """Point dev_name at a new connection uuid for recovery."""
        cmds = """
  device $%s
  recover %s""" %(dev_name, new_conn)
        self.run(cmds)

    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        """Route the nid range [lo, hi] on net through gateway gw;
        errors are logged and ignored."""
        cmds =  """
  network %s
  add_route %s %s %s
  quit  """ % (net,
               gw, lo, hi)
        try:
            self.run(cmds)
        except CommandError, e:
            log ("ignore: ")
            e.dump()

    def del_route(self, net, gw, lo, hi):
        """Remove the route for nid range [lo, hi] via gw."""
        cmds =  """
  ignore_errors
  network %s
  del_route %s %s %s
  quit  """ % (net, gw, lo, hi)
        self.run(cmds)

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        """Add uuid for tgt and route it through gateway gw; errors are
        logged and ignored."""
        self.add_uuid(net, uuid, tgt)
        cmds =  """
  network %s
  add_route %s %s
  quit """ % (net,
              gw, tgt)
        try:
            self.run(cmds)
        except CommandError, e:
            log ("ignore: ")
            e.dump()

    # add a route to a range
    def del_route_host(self, net, uuid, gw, tgt):
        """Remove the uuid and the single-host route to tgt via gw."""
        self.del_uuid(uuid)
        cmds =  """
  ignore_errors
  network %s
  del_route %s %s
  quit  """ % (net, gw, tgt)
        self.run(cmds)

    def del_peer(self, net_type, nid, hostaddr):
        """Drop one shared connection to a peer (errors ignored)."""
        if net_type  in ('tcp',) and not config.lctl_dump:
            cmds =  """
  ignore_errors
  network %s
  del_peer %s %s single_share
  quit""" % (net_type,
             nid, hostaddr)
            self.run(cmds)
        elif net_type  in ('openib','iib','ra') and not config.lctl_dump:
            cmds =  """
  ignore_errors
  network %s
  del_peer %s single_share
  quit""" % (net_type,
             nid)
            self.run(cmds)

    # disconnect one connection
    def disconnect(self, srv):
        """Remove uuid and peer entry for server srv."""
        self.del_uuid(srv.nid_uuid)
        if srv.net_type  in ('tcp','openib','iib','ra') and not config.lctl_dump:
            if srv.hostaddr[0]:
                hostaddr = string.split(srv.hostaddr[0], '/')[0]
            self.del_peer(srv.net_type, srv.nid, hostaddr)

    def del_uuid(self, uuid):
        """Unregister a uuid (errors ignored)."""
        cmds =  """
  ignore_errors
  del_uuid %s
  quit""" % (uuid,)
        self.run(cmds)

    # disconnect all
    def disconnectAll(self, net):
        """Drop every connection on the given network."""
        cmds =  """
  ignore_errors
  network %s
  disconnect
  quit""" % (net)
        self.run(cmds)

    def attach(self, type, name, uuid):
        """Attach a new obd device of the given type."""
        cmds = """
  attach %s %s %s
  quit""" % (type, name, uuid)
        self.run(cmds)

    def setup(self, name, setup = ""):
        """Run setup on an attached device."""
        cmds = """
  cfg_device %s
  setup %s
  quit""" % (name, setup)
        self.run(cmds)

    def add_conn(self, name, conn_uuid):
        """Add a failover connection uuid to a device."""
        cmds = """
  cfg_device %s
  add_conn %s
  quit""" % (name, conn_uuid)
        self.run(cmds)

    # create a new device with lctl
    def newdev(self, type, name, uuid, setup = ""):
        """attach + setup; on setup failure the device is detached again
        before the error is re-raised."""
        self.attach(type, name, uuid);
        try:
            self.setup(name, setup)
        except CommandError, e:
            self.cleanup(name, uuid, 0)
            raise e

    # cleanup a device
    def cleanup(self, name, uuid, force, failover = 0):
        """cleanup + detach a device; failover implies force."""
        if failover: force = 1
        cmds = """
  ignore_errors
  cfg_device $%s
  cleanup %s %s
  detach
  quit""" % (name, ('', 'force')[force],
             ('', 'failover')[failover])
        self.run(cmds)

    # create an lov
    def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
                  stripe_sz, stripe_off, pattern, devlist = None):
        """Attach and configure an lov device."""
        cmds = """
  attach lov %s %s
  lov_setup %s %d %d %d %s %s
  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
             pattern, devlist)
        self.run(cmds)

    # add an OBD to a LOV
    def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
        """Add a target obd to an existing lov."""
        cmds = """
  lov_modify_tgts add %s %s %s %s
  quit""" % (name, obd_uuid, index, gen)
        self.run(cmds)

    # create an lmv
    def lmv_setup(self, name, uuid, desc_uuid, devlist):
        """Attach and configure an lmv device."""
        cmds = """
  attach lmv %s %s
  lmv_setup %s %s
  quit""" % (name, uuid, desc_uuid, devlist)
        self.run(cmds)

    # delete an OBD from a LOV
    def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
        """Remove a target obd from an lov."""
        cmds = """
  lov_modify_tgts del %s %s %s %s
  quit""" % (name, obd_uuid, index, gen)
        self.run(cmds)

    # deactivate an OBD
    def deactivate(self, name):
        """Mark a device inactive."""
        cmds = """
  device $%s
  deactivate
  quit""" % (name)
        self.run(cmds)

    # dump the log file
    def dump(self, dump_file):
        """Dump the kernel debug log to dump_file."""
        cmds = """
  debug_kernel %s 1
  quit""" % (dump_file)
        self.run(cmds)

    # get list of devices
    def device_list(self):
        """Return the lines of /proc/fs/lustre/devices ([] if unreadable)."""
        devices = '/proc/fs/lustre/devices'
        ret = []
        if os.access(devices, os.R_OK):
            try:
                fp = open(devices, 'r')
                ret =  fp.readlines()
                fp.close()
            except IOError, e:
                log(e)
        return ret

    # get lustre version
    def lustre_version(self):
        """Return the version string reported by 'lctl version'."""
        rc, out = self.runcmd('version')
        return out

    # dump mount options
    def mount_option(self, profile, osc, mdc):
        """Record the osc/mdc mount options for a client profile."""
        cmds = """
  mount_option %s %s %s
  quit""" % (profile, osc, mdc)
        self.run(cmds)

    # delete mount options
    def del_mount_option(self, profile):
        """Remove the mount options of a client profile."""
        cmds = """
  del_mount_option %s
  quit""" % (profile,)
        self.run(cmds)

    def set_timeout(self, timeout):
        """Set the global lustre RPC timeout."""
        cmds = """
  set_timeout %s
  quit""" % (timeout,)
        self.run(cmds)

    def set_lustre_upcall(self, upcall):
        """Set the lustre recovery upcall path."""
        cmds = """
  set_lustre_upcall %s
  quit""" % (upcall,)
        self.run(cmds)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)
# Run a command and return the output and status.
# stderr is sent to /dev/null, could use popen3 to
# save it if necessary
def runcmd(cmd):
    """Run cmd through the shell (stderr folded into stdout) and return
    (exit_code, list_of_output_lines).  Under --noexec returns (0, [])."""
    debug ("+", cmd)
    if config.noexec: return (0, [])
    pipe = os.popen(cmd + ' 2>&1')
    output = pipe.readlines()
    status = pipe.close()
    if status:
        # popen close() packs the exit code in the high byte
        status = status >> 8
    else:
        status = 0
    return (status, output)
def run(*args):
    """Join args into a single shell command and run it via runcmd()."""
    return runcmd(string.join(map(str, args)))
# Run a command in the background.
def run_daemon(*args):
    """Run a command that backgrounds itself; return its exit code only
    (output is discarded).  Under --noexec returns 0."""
    cmd = string.join(map(str, args))
    debug ("+", cmd)
    if config.noexec: return 0
    pipe = os.popen(cmd + ' 2>&1')
    status = pipe.close()
    if status:
        return status >> 8
    return 0
# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    """Return the full path of cmd, searching the portals utils dir (if
    configured), then dirname(argv[0]), then PATH; '' when not found."""
    search = string.split(os.environ['PATH'], ':')
    search.insert(0, os.path.dirname(sys.argv[0]))
    if config.portals:
        search.insert(0, os.path.join(config.portals, 'utils/'))
    for directory in search:
        candidate = os.path.join(directory, cmd)
        if os.access(candidate, os.X_OK):
            return candidate
    return ''
# Recursively look for file starting at base dir
def do_find_file(base, mod):
    """Depth-first search for a readable file named mod under base.
    Returns the full path, or None when nothing matches."""
    candidate = os.path.join(base, mod)
    if os.access(candidate, os.R_OK):
        return candidate
    for entry in os.listdir(base):
        subdir = os.path.join(base, entry)
        if os.path.isdir(subdir):
            found = do_find_file(subdir, mod)
            if found:
                return found
# is the path a block device?
def is_block(path):
    """Return true when path exists and is a block device, 0 otherwise."""
    try:
        st = os.stat(path)
    except OSError:
        return 0
    return stat.S_ISBLK(st[stat.ST_MODE])
# find the journal device from mkfs options
def jdev(opts):
    """Extract the external journal device from mkfs options.

    Looks for '-J device=<path>' in opts and returns <path>; returns ''
    when opts is None or no journal device is specified.

    Fixes: the old version shadowed the builtin 'str' and used the
    Python-2-only string.split(); str.split() behaves identically.
    """
    if opts == None:
        return ''
    words = opts.split()
    for i in range(len(words) - 1):
        # the device follows the '-J' flag as 'device=<path>'
        if words[i] == '-J' and words[i + 1].startswith('device='):
            return words[i + 1][len('device='):]
    return ''
# build fs according to type
# fixme: dangerous
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
    """Build a filesystem on dev.

    dev         -- block device or regular file to format
    devsize     -- size in KB; 0 means probe the device/file for its size
    fstype      -- 'ext3'/'extN'/'ldiskfs' or 'reiserfs'
    jsize       -- journal size (MB for ext3, blocks for reiserfs); 0 = auto
    isize       -- inode size in bytes; 0 = mkfs default
    mkfsoptions -- extra mkfs options; may name a journal device ('-J device=')
    isblock     -- 0 when dev is a regular file (forces mkfs with -F)

    Panics (or just logs under --noexec) on any failure.

    Fixes over the previous version: the command-string local no longer
    shadows the function name, and two error-message typos are corrected.
    """
    block_cnt = ''
    jopt = ''
    iopt = ''
    if devsize:
        if devsize < 8000:
            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
                  (dev, devsize))
        # devsize is in 1k, and fs block count is in 4k
        block_cnt = devsize/4
    if fstype in ('ext3', 'extN', 'ldiskfs'):
        # ext3 journal size is in megabytes
        # but don't set jsize if mkfsoptions indicates a separate journal device
        if jsize == 0 and jdev(mkfsoptions) == '':
            if devsize == 0:
                if not is_block(dev):
                    # regular file: read its size from 'ls -l'
                    ret, out = runcmd("ls -l %s" %dev)
                    devsize = int(string.split(out[0])[4]) / 1024
                else:
                    # sfdisk works for symlink, hardlink, and realdev
                    ret, out = runcmd("sfdisk -s %s" %dev)
                    if not ret:
                        devsize = int(out[0])
                    else:
                        # sfdisk -s will fail for too large block device,
                        # then, read the size of partition from /proc/partitions
                        # get the realpath of the device: it may be the real
                        # device (e.g. /dev/hda7) or a hardlink made via mknod
                        if 'realpath' in dir(os.path):
                            real_dev = os.path.realpath(dev)
                        else:
                            # no os.path.realpath on this python: resolve
                            # symlink chains by hand, up to 20 deep
                            real_dev = dev
                            link_count = 0
                            while os.path.islink(real_dev) and (link_count < 20):
                                link_count = link_count + 1
                                dev_link = os.readlink(real_dev)
                                if os.path.isabs(dev_link):
                                    real_dev = dev_link
                                else:
                                    real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
                            if link_count > 19:
                                panic("Encountered too many symbolic links resolving block device:", dev)
                        # get the major and minor number of the realpath via ls
                        # it seems python(os.stat) does not return
                        # the st_rdev member of the stat structure
                        ret, out = runcmd("ls -l %s" %real_dev)
                        major = string.split(string.split(out[0])[4], ",")[0]
                        minor = string.split(out[0])[5]
                        # get the devsize from /proc/partitions with the major and minor number
                        ret, out = runcmd("cat /proc/partitions")
                        for line in out:
                            if len(line) > 1:
                                if string.split(line)[0] == major and string.split(line)[1] == minor:
                                    devsize = int(string.split(line)[2])
                                    break
            if devsize > 1024 * 1024:
                # 4MB of journal per 100MB of disk, capped at 400MB
                jsize = ((devsize / 102400) * 4)
                if jsize > 400:
                    jsize = 400
        if jsize:  jopt = "-J size=%d" %(jsize,)
        if isize:  iopt = "-I %d" %(isize,)
        mkfs_cmd = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
            mkfs_cmd = mkfs_cmd + ' -F '
        if jdev(mkfsoptions) != '':
            # build the external journal device first
            jmkfs = 'mkfs.ext2 -b 4096 -O journal_dev '
            if config.force:
                jmkfs = jmkfs + '-F '
            jmkfs = jmkfs + jdev(mkfsoptions)
            (ret, out) = run (jmkfs)
            if ret:
                panic("Unable to format journal device:", jdev(mkfsoptions), string.join(out))
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize:  jopt = "--journal_size %d" %(jsize,)
        mkfs_cmd = 'mkreiserfs -ff'
    else:
        panic('unsupported fs type: ', fstype)
    if config.mkfsoptions != None:
        mkfs_cmd = mkfs_cmd + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs_cmd = mkfs_cmd + ' ' + mkfsoptions
    (ret, out) = run (mkfs_cmd, jopt, iopt, dev, block_cnt)
    if ret:
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on fsswe
    if fstype in ('ext3', 'extN', 'ldiskfs'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
        if ret:
            panic("Unable to enable htree:", dev)
# some systems use /dev/loopN, some /dev/loop/N
def loop_base():
    """Return the loop-device name prefix for this system:
    '/dev/loop' (loop0 style) or '/dev/loop/' (loop/0 style)."""
    import re
    prefix = '/dev/loop'
    if os.access(prefix + str(0), os.R_OK):
        return prefix
    prefix = prefix + '/'
    if not os.access(prefix + str(0), os.R_OK):
        panic ("can't access loop devices")
    return prefix
# find loop device assigned to the file
def find_assigned_loop(file):
    """Return the loop device currently bound to file, or '' if none.
    Scanning stops at the first missing /dev/loop node."""
    base = loop_base()
    for idx in xrange(0, MAX_LOOP_DEVICES):
        candidate = base + str(idx)
        if not os.access(candidate, os.R_OK):
            break
        (stat, out) = run('losetup', candidate)
        if out and stat == 0:
            # losetup prints the backing file in parentheses
            m = re.search(r'\((.*)\)', out[0])
            if m and file == m.group(1):
                return candidate
    return ''
# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, inode_size,
              mkfsoptions, reformat, autoformat, backfstype, backfile):
    """Create the backing file if necessary and bind the first free loop
    device to it.  For smfs the back(file/fstype) pair is the real
    backing store.  Returns the loop device path, the raw block device
    (smfs on a block device), or '' when no loop device is free."""
    if fstype == 'smfs':
        realfile = backfile
        realfstype = backfstype
        if is_block(backfile):
            # smfs directly on a block device: no loop device needed
            if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
                mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
            return realfile
    else:
        realfile = file
        realfstype = fstype
    dev = find_assigned_loop(realfile)
    if dev:
        print 'WARNING: file ', realfile, 'already mapped to', dev
        return dev
    if reformat or not os.access(realfile, os.R_OK | os.W_OK):
        if size < 8000:
            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
        # create a sparse file of 'size' KB
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
        if ret:
            panic("Unable to create backing store:", realfile)
        mkfs(realfile, size, realfstype, journal_size, inode_size,
             mkfsoptions, isblock=0)
    loop = loop_base()
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if stat:
                # losetup failed => the device is free; attach it
                print "attach " + realfile + " <-> " + dev
                run('losetup', dev, realfile)
                return dev
        else:
            # ran past the last existing /dev/loop node
            print "out of loop devices"
            return ''
    print "out of loop devices"
    return ''
# undo loop assignment
def clean_loop(dev, fstype, backfstype, backdev):
if fstype == 'smfs':
realfile = backdev
else:
realfile = dev
if not is_block(realfile):
dev = find_assigned_loop(realfile)
if dev:
print "detach " + dev + " <-> " + realfile
ret, out = run('losetup -d', dev)
if ret:
log('unable to clean loop device:', dev, 'for file:', realfile)
logall(out)
# finilizes passed device
def clean_dev(dev, fstype, backfstype, backdev):
    """Release any loop device held by dev; no-op for plain block devices."""
    if fstype != 'smfs' and is_block(dev):
        return
    clean_loop(dev, fstype, backfstype, backdev)
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    """Return 1 if dev is not yet formatted as fstype.
    Currently always 0 -- detection is not implemented (FIXME)."""
    return 0
# initialize a block device if needed
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              inode_size, mkfsoptions, backfstype, backdev):
    """Prepare the storage behind a target: bind a loop device for
    file-backed (or smfs) storage, or (re)format a real block device
    when requested.  Returns the device path to use."""
    if config.noexec:
        return dev
    if fstype == 'smfs' or not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, inode_size,
                        mkfsoptions, reformat, autoformat, backfstype, backdev)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
             isblock=0)
#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")
    return dev
def if2addr(iface):
    """Return the IP address of a network interface via /sbin/ifconfig,
    or None when the interface cannot be queried."""
    rc, out = run("/sbin/ifconfig", iface)
    if rc or not out:
        return None
    # second output line looks like: 'inet addr:1.2.3.4  Bcast:...'
    token = string.split(out[1])[1]
    return string.split(token, ':')[1]
def def_mount_options(fstype, target):
    """Return the default mount options for fstype on a given target
    ('mds' or 'ost'); '' for filesystems with no defaults."""
    if fstype in ('ext3', 'ldiskfs'):
        options = "errors=remount-ro"
        if target == 'ost' and sys_get_branch() == '2.4':
            # asyncdel only exists in the 2.4 kernel series
            options = options + ",asyncdel"
        return options
    return ""
def sys_get_elan_position_file():
    """Return the first readable Elan position proc file, or '' when the
    node has no Elan hardware."""
    for candidate in ("/proc/elan/device0/position",
                      "/proc/qsnet/elan4/device0/position",
                      "/proc/qsnet/elan3/device0/position"):
        if os.access(candidate, os.R_OK):
            return candidate
    return ""
def sys_get_local_nid(net_type, wildcard, cluster_id):
    """Return the local nid, preferring Elan addressing when an Elan
    position file is present on this node."""
    if sys_get_elan_position_file():
        return sys_get_local_address('elan', '*', cluster_id)
    return sys_get_local_address(net_type, wildcard, cluster_id)
def sys_get_local_address(net_type, wildcard, cluster_id):
    """Return the local address for the network type.
    tcp/openib/iib/ra: IP from an 'iface:*' wildcard or hostname lookup;
    elan: NodeId from the elan position proc file, offset by cluster_id;
    lo/gm: not implemented."""
    local = ""
    if net_type in ('tcp','openib','iib','ra'):
        if  ':' in wildcard:
            iface, star = string.split(wildcard, ':')
            local = if2addr(iface)
            if not local:
                panic ("unable to determine ip for:", wildcard)
        else:
            host = socket.gethostname()
            local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
        f = sys_get_elan_position_file()
        if not f:
            panic ("unable to determine local Elan ID")
        try:
            fp = open(f, 'r')
            lines = fp.readlines()
            fp.close()
            for l in lines:
                a = string.split(l)
                if a[0] == 'NodeId':
                    elan_id = a[1]
                    break
            try:
                # nid = cluster_id + elan node id (either may be 0x-hex)
                nid = my_int(cluster_id) + my_int(elan_id)
                local = "%d" % (nid)
            except ValueError, e:
                # non-numeric id: fall back to the raw NodeId string
                local = elan_id
        except IOError, e:
            log(e)
    elif net_type == 'lo':
        fixme("automatic local address for loopback")
    elif net_type == 'gm':
        fixme("automatic local address for GM")
    return local
def sys_get_branch():
"""Returns kernel release"""
try:
fp = open('/proc/sys/kernel/osrelease')
lines = fp.readlines()
fp.close()
for l in lines:
version = string.split(l)
a = string.split(version[0], '.')
return a[0] + '.' + a[1]
except IOError, e:
log(e)
return ""
# XXX: instead of device_list, ask for $name and see what we get
def is_prepared(name):
"""Return true if a device exists for the name"""
if config.lctl_dump:
return 0
if (config.noexec or config.record) and config.cleanup:
return 1
try:
# expect this format:
# 1 UP ldlm ldlm ldlm_UUID 2
out = lctl.device_list()
for s in out:
if name == string.split(s)[3]:
return 1
except CommandError, e:
e.dump()
return 0
def net_is_prepared():
    """Assume all networking is configured once any lustre device exists."""
    return len(lctl.device_list()) > 0
def fs_is_mounted(path):
"""Return true if path is a mounted lustre filesystem"""
try:
fp = open('/proc/mounts')
lines = fp.readlines()
fp.close()
for l in lines:
a = string.split(l)
if a[1] == path and a[2] == 'lustre_lite':
return 1
except IOError, e:
log(e)
return 0
def kmod_find(src_dir, dev_dir, modname):
    """Return the path of the module binary under src_dir/dev_dir,
    preferring .ko over .o; None when neither file is readable."""
    base = src_dir + '/' + dev_dir + '/' + modname
    for ext in ('.ko', '.o'):
        candidate = base + ext
        try:
            if os.access(candidate, os.R_OK):
                return candidate
        except OSError:
            pass
    return None
def kmod_info(modname):
    """Return the /proc/modules row for modname as a list of words;
    '' when the module is not loaded, 0 on any error."""
    try:
        fp = open('/proc/modules')
        lines = fp.readlines()
        fp.close()
        for line in lines:
            words = line.split()
            if words and words[0] == modname:
                return words
        return ''
    except Exception:
        return 0
class kmod:
    """Presents kernel module"""
    def __init__(self, src_dir, dev_dir, name):
        # src_dir: source tree root ('' means use modprobe);
        # dev_dir: subdirectory within the tree; name: module name
        self.src_dir = src_dir
        self.dev_dir = dev_dir
        self.name = name
    def load(self):
        """Load module"""
        log ('loading module:', self.name, 'srcdir',
             self.src_dir, 'devdir', self.dev_dir)
        if self.src_dir:
            # development tree: insmod the freshly built binary
            module = kmod_find(self.src_dir, self.dev_dir,
                               self.name)
            if not module:
                panic('module not found:', self.name)
            (rc, out)  = run('/sbin/insmod', module)
            if rc:
                raise CommandError('insmod', out, rc)
        else:
            # installed system: let modprobe resolve dependencies
            (rc, out) = run('/sbin/modprobe', self.name)
            if rc:
                raise CommandError('modprobe', out, rc)
    def cleanup(self):
        """Unload module"""
        log('unloading module:', self.name)
        (rc, out) = run('/sbin/rmmod', self.name)
        if rc:
            log('unable to unload module:', self.name +
                "(" + self.refcount() + ")")
            logall(out)
    def info(self):
        """Returns module info if any."""
        return kmod_info(self.name)
    def loaded(self):
        """Returns 1 if module is loaded. Otherwise 0 is returned."""
        if self.info():
            return 1
        else:
            return 0
    def refcount(self):
        """Returns module refcount."""
        # field 2 of the /proc/modules row is the reference count
        info = self.info()
        if not info:
            return ''
        return info[2]
    def used(self):
        """Returns 1 if module is used, otherwise 0 is returned."""
        # field 3, when present, lists the users of the module
        info = self.info()
        if not info:
            return 0
        if len(info) > 3:
            users = info[3]
            if users and users != '(unused)' and users != '-':
                return 1
            else:
                return 0
        else:
            return 0
    def busy(self):
        """Returns 1 if module is busy, otherwise 0 is returned."""
        if self.loaded() and (self.used() or self.refcount() != '0'):
            return 1
        else:
            return 0
class kmod_manager:
    """Manage kernel modules"""
    def __init__(self, lustre_dir, portals_dir):
        # source tree roots used to locate development-mode modules
        self.lustre_dir = lustre_dir
        self.portals_dir = portals_dir
        self.kmodule_list = []
    def find_module(self, modname):
        """Find module by module name"""
        for mod in self.kmodule_list:
            if mod.name == modname:
                return mod
        return ''
    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        # duplicates are silently skipped
        mod = self.find_module(modname)
        if not mod:
            mod = kmod(self.portals_dir, dev_dir, modname)
            self.kmodule_list.append(mod)
    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        mod = self.find_module(modname)
        if not mod:
            mod = kmod(self.lustre_dir, dev_dir, modname)
            self.kmodule_list.append(mod)
    def load_modules(self):
        """Load all the modules in the list in the order they appear."""
        for mod in self.kmodule_list:
            if mod.loaded() and not config.noexec:
                continue
            mod.load()
    def cleanup_modules(self):
        """Unload the modules in the list in reverse order."""
        rev = self.kmodule_list
        # NOTE: reverses kmodule_list in place (rev aliases the list)
        rev.reverse()
        for mod in rev:
            if (not mod.loaded() or mod.busy()) and not config.noexec:
                continue
            # debug hack
            if mod.name == 'portals' and config.dump:
                lctl.dump(config.dump)
            mod.cleanup()
# ============================================================
# Classes to prepare and cleanup the various objects
#
class Module:
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utilitiy funcs.
    """
    def __init__(self, module_name, db):
        # db: configuration database node describing this object
        self.db = db
        self.module_name = module_name
        self.name = self.db.getName()
        self.uuid = self.db.getUUID()
        self._server = None
        self._connected = 0
    def info(self, *args):
        """Print a one-line status message prefixed with module:name uuid."""
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg
    def cleanup(self):
        """ default cleanup, used for most modules """
        self.info()
        try:
            lctl.cleanup(self.name, self.uuid, config.force)
        except CommandError, e:
            # record the failure but keep cleaning up other modules
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
    def add_module(self, manager):
        """Adds all needed modules in the order they appear."""
        # base class needs no modules; subclasses override
        return
    def safe_to_clean(self):
        # subclasses veto cleanup by returning 0
        return 1
    def safe_to_clean_modules(self):
        return self.safe_to_clean()
class Network(Module):
def __init__(self,db):
Module.__init__(self, 'NETWORK', db)
self.net_type = self.db.get_val('nettype')
self.nid = self.db.get_val('nid', '*')
self.cluster_id = self.db.get_val('clusterid', "0")
self.port = self.db.get_val_int('port', 0)
if '*' in self.nid:
self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
if not self.nid:
panic("unable to set nid for", self.net_type, self.nid, cluster_id)
self.generic_nid = 1
debug("nid:", self.nid)
else:
self.generic_nid = 0
self.nid_uuid = self.nid_to_uuid(self.nid)
self.hostaddr = self.db.get_hostaddr()
if len(self.hostaddr) == 0:
self.hostaddr.append(self.nid)
if '*' in self.hostaddr[0]:
self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
if not self.hostaddr[0]:
panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
debug("hostaddr:", self.hostaddr[0])
def add_module(self, manager):
manager.add_portals_module("libcfs", 'libcfs')
manager.add_portals_module("portals", 'portals')
if node_needs_router():
manager.add_portals_module("router", 'kptlrouter')
if self.net_type == 'tcp':
manager.add_portals_module("knals/socknal", 'ksocknal')
if self.net_type == 'elan':
manager.add_portals_module("knals/qswnal", 'kqswnal')
if self.net_type == 'gm':
manager.add_portals_module("knals/gmnal", 'kgmnal')
if self.net_type == 'openib':
manager.add_portals_module("knals/openibnal", 'kopenibnal')
if self.net_type == 'iib':
manager.add_portals_module("knals/iibnal", 'kiibnal')
if self.net_type == 'lo':
manager.add_portals_module("knals/lonal", 'klonal')
if self.net_type == 'ra':
manager.add_portals_module("knals/ranal", 'kranal')
def nid_to_uuid(self, nid):
return "NID_%s_UUID" %(nid,)
def prepare(self):
if not config.record and net_is_prepared():
return
self.info(self.net_type, self.nid, self.port)
if not (config.record and self.generic_nid):
lctl.network(self.net_type, self.nid)
if self.net_type == 'tcp':
sys_tweak_socknal()
for hostaddr in self.db.get_hostaddr():
ip = string.split(hostaddr, '/')[0]
if len(string.split(hostaddr, '/')) == 2:
netmask = string.split(hostaddr, '/')[1]
else:
netmask = ""
lctl.add_interface(self.net_type, ip, netmask)
if self.net_type == 'elan':
sys_optimize_elan()
if self.port and node_is_router():
run_one_acceptor(self.port)
self.connect_peer_gateways()
def connect_peer_gateways(self):
for router in self.db.lookup_class('node'):
if router.get_val_int('router', 0):
for netuuid in router.get_networks():
net = self.db.lookup(netuuid)
gw = Network(net)
if (gw.cluster_id == self.cluster_id and
gw.net_type == self.net_type):
if gw.nid != self.nid:
lctl.connect(gw)
def disconnect_peer_gateways(self):
for router in self.db.lookup_class('node'):
if router.get_val_int('router', 0):
for netuuid in router.get_networks():
net = self.db.lookup(netuuid)
gw = Network(net)
if (gw.cluster_id == self.cluster_id and
gw.net_type == self.net_type):
if gw.nid != self.nid:
try:
lctl.disconnect(gw)
except CommandError, e:
print "disconnect failed: ", self.name
e.dump()
cleanup_error(e.rc)
def safe_to_clean(self):
return not net_is_prepared()
def cleanup(self):
self.info(self.net_type, self.nid, self.port)
if self.port:
stop_acceptor(self.port)
if node_is_router():
self.disconnect_peer_gateways()
if self.net_type == 'tcp':
for hostaddr in self.db.get_hostaddr():
ip = string.split(hostaddr, '/')[0]
lctl.del_interface(self.net_type, ip)
def correct_level(self, level, op=None):
return level
class RouteTable(Module):
def __init__(self,db):
Module.__init__(self, 'ROUTES', db)
def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
lo, hi):
# only setup connections for tcp, openib, and iib NALs
srvdb = None
if not net_type in ('tcp','openib','iib','ra'):
return None
# connect to target if route is to single node and this node is the gw
if lo == hi and local_interface(net_type, gw_cluster_id, gw):
if not local_cluster(net_type, tgt_cluster_id):
panic("target", lo, " not on the local cluster")
srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
# connect to gateway if this node is not the gw
elif (local_cluster(net_type, gw_cluster_id)
and not local_interface(net_type, gw_cluster_id, gw)):
srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
else:
return None
if not srvdb:
panic("no server for nid", lo)
return None
return Network(srvdb)
def prepare(self):
if not config.record and net_is_prepared():
return
self.info()
for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
lctl.add_route(net_type, gw, lo, hi)
srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
if srv:
lctl.connect(srv)
def safe_to_clean(self):
return not net_is_prepared()
def cleanup(self):
if net_is_prepared():
# the network is still being used, don't clean it up
return
for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
if srv:
try:
lctl.disconnect(srv)
except CommandError, e:
print "disconnect failed: ", self.name
e.dump()
cleanup_error(e.rc)
try:
lctl.del_route(net_type, gw, lo, hi)
except CommandError, e:
print "del_route failed: ", self.name
e.dump()
cleanup_error(e.rc)
class Management(Module):
    """The management service device (mgmt_svc)."""
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)

    def add_module(self, manager):
        # mgmt_svc sits on the standard lustre stack; register in order
        for subsys, mod in (('lvfs', 'lvfs'),
                            ('obdclass', 'obdclass'),
                            ('ptlrpc', 'ptlrpc'),
                            ('mgmt', 'mgmt_svc')):
            manager.add_lustre_module(subsys, mod)

    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        self.info()
        lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):
        return 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)

    def correct_level(self, level, op=None):
        return level
# This is only needed to load the modules; the LDLM device
# is now created automatically.
class LDLM(Module):
    """Loads the lock-manager prerequisite modules; the LDLM device
    itself is created automatically by the kernel."""
    def __init__(self,db):
        Module.__init__(self, 'LDLM', db)

    def add_module(self, manager):
        # subsystem directory and module name coincide for all three
        for mod in ('lvfs', 'obdclass', 'ptlrpc'):
            manager.add_lustre_module(mod, mod)

    def prepare(self):
        # nothing to instantiate
        return

    def cleanup(self):
        # nothing to tear down
        return

    def correct_level(self, level, op=None):
        return level
class LOV(Module):
    """Logical object volume: stripes file data across a set of OSCs."""
    def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
        Module.__init__(self, 'LOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.mds_uuid = self.db.get_first_ref('mds')
        self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_lov_tgts('lov_tgt')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        self.osclist = []
        self.obdlist = []
        # preserve the config uuid for lov_setup; self.uuid is then
        # replaced by a fresh client-instance uuid.
        # NOTE(review): the 'uuid' argument is not used here -- confirm
        # whether callers rely on the generated uuid instead.
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        # config_only instances exist just to read LOV parameters (e.g.
        # stripe count for MDS inode sizing); they are never prepared.
        if config_only:
            self.config_only = 1
            return
        self.config_only = None
        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        # build an OSC per target; empty uuid marks a deleted slot
        for (obd_uuid, index, gen, active) in self.devlist:
            if obd_uuid == '':
                continue
            self.obdlist.append(obd_uuid)
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.uuid, fs_name)
            if osc:
                self.osclist.append((osc, index, gen, active))
            else:
                panic('osc not found:', obd_uuid)
    def get_uuid(self):
        return self.uuid
    def get_name(self):
        return self.name
    def prepare(self):
        """Set up the LOV device and attach each member OSC."""
        if not config.record and is_prepared(self.name):
            return
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist,
                  self.mds_name)
        lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern,
                       string.join(self.obdlist))
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.active = active
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e
            lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)
    def cleanup(self):
        """Tear down member OSCs, then the LOV device itself."""
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            osc.cleanup()
        if is_prepared(self.name):
            Module.cleanup(self)
        if self.config_only:
            panic("Can't clean up config_only LOV ", self.name)
    def add_module(self, manager):
        if self.config_only:
            panic("Can't load modules for config_only LOV ", self.name)
        # one OSC's modules cover all of them, hence the immediate break
        for (osc, index, gen, active) in self.osclist:
            osc.add_module(manager)
            break
        manager.add_lustre_module('lov', 'lov')
    def correct_level(self, level, op=None):
        return level
class LMV(Module):
    """Logical metadata volume: distributes metadata over several MDCs."""
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'LMV', db)
        if name_override != None:
            self.name = "lmv_%s" % name_override
        self.devlist = self.db.get_lmv_tgts('lmv_tgt')
        if self.devlist == None:
            # older configs reference the MDS targets directly
            self.devlist = self.db.get_refs('mds')
        self.mdclist = []
        # preserve the config uuid for lmv_setup; self.uuid then becomes
        # the caller-supplied client uuid.
        self.desc_uuid = self.uuid
        self.uuid = uuid
        self.fs_name = fs_name
        # build one MDC per member MDS
        for mds_uuid in self.devlist:
            mds = self.db.lookup(mds_uuid)
            if not mds:
                panic("MDS not found!")
            mdc = MDC(mds, self.uuid, fs_name)
            if mdc:
                self.mdclist.append(mdc)
            else:
                panic('mdc not found:', mds_uuid)
    def prepare(self):
        """Attach each member MDC, then set up the LMV device."""
        if is_prepared(self.name):
            return
        self.info();
        for mdc in self.mdclist:
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                mdc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing LMV %s\n" % mdc.uuid
                raise e
        lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
                       string.join(self.devlist))
    def cleanup(self):
        for mdc in self.mdclist:
            mdc.cleanup()
        if is_prepared(self.name):
            Module.cleanup(self)
    def add_module(self, manager):
        # one MDC's modules cover all of them, hence the immediate break
        for mdc in self.mdclist:
            mdc.add_module(manager)
            break
        manager.add_lustre_module('lmv', 'lmv')
    def correct_level(self, level, op=None):
        return level
class MDSDEV(Module):
    """Configures, formats and controls an MDS target device on this node."""
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.backdevpath = self.db.get_val('backdevpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.obdtype = self.db.get_val('obdtype', '')
        self.root_squash = self.db.get_val('root_squash', '')
        self.no_root_squash = self.db.get_val('no_root_squash', '')
        # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        self.mds = self.db.lookup(target_uuid)
        self.name = self.mds.getName()
        self.client_uuids = self.mds.get_refs('client')
        self.lmv = None
        self.master = None
        lmv_uuid = self.db.get_first_ref('lmv')
        if lmv_uuid != None:
            self.lmv = self.db.lookup(lmv_uuid)
            if self.lmv != None:
                # clustered metadata: clients hang off the LMV instead
                self.client_uuids = self.lmv.get_refs('client')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if self.mds.get_val('failover', 0):
            self.failover_mds = 'f'
        else:
            self.failover_mds = 'n'
        active_uuid = get_active_target(self.mds)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != self.mds.get_val('group'):
            self.active = 0
        # default inode inode for case when neither LOV either
        # LMV is accessible.
        self.inode_size = 256
        inode_size = self.db.get_val_int('inodesize', 0)
        if not inode_size == 0:
            self.inode_size = inode_size
        else:
            # find the LOV for this MDS
            lovconfig_uuid = self.mds.get_first_ref('lovconfig')
            if lovconfig_uuid or self.lmv != None:
                if self.lmv != None:
                    lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                    lovconfig = self.lmv.lookup(lovconfig_uuid)
                    lov_uuid = lovconfig.get_first_ref('lov')
                    if lov_uuid == None:
                        panic(self.mds.getName() + ": No LOV found for lovconfig ",
                              lovconfig.name)
                else:
                    lovconfig = self.mds.lookup(lovconfig_uuid)
                    lov_uuid = lovconfig.get_first_ref('lov')
                    if lov_uuid == None:
                        panic(self.mds.getName() + ": No LOV found for lovconfig ",
                              lovconfig.name)
                # NOTE(review): this re-resolves the lov uuid already
                # found in the branch above -- looks redundant; verify.
                if self.lmv != None:
                    lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                    lovconfig = self.lmv.lookup(lovconfig_uuid)
                    lov_uuid = lovconfig.get_first_ref('lov')
                lov = LOV(self.db.lookup(lov_uuid), lov_uuid, self.name,
                          config_only = 1)
                # default stripe count controls default inode_size
                stripe_count = lov.stripe_cnt
                if stripe_count > 77:
                    self.inode_size = 4096
                elif stripe_count > 35:
                    self.inode_size = 2048
                elif stripe_count > 13:
                    self.inode_size = 1024
                elif stripe_count > 3:
                    self.inode_size = 512
                else:
                    self.inode_size = 256
        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # setup LMV
        if self.lmv != None:
            client_uuid = self.name + "_lmv_UUID"
            self.master = LMV(self.lmv, client_uuid,
                              self.name, self.name)
    def add_module(self, manager):
        """Register the kernel modules an active MDS needs."""
        if self.active:
            manager.add_lustre_module('mdc', 'mdc')
            manager.add_lustre_module('osc', 'osc')
            manager.add_lustre_module('ost', 'ost')
            manager.add_lustre_module('lov', 'lov')
            manager.add_lustre_module('mds', 'mds')
            if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
                manager.add_lustre_module(self.fstype, self.fstype)
            if self.fstype:
                manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
            # if fstype is smfs, then we should also take care about backing
            # store fs.
            if self.fstype == 'smfs':
                manager.add_lustre_module(self.backfstype, self.backfstype)
                manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
            # the 'snap' mount option pulls in the snapshot fsfilt layers
            for option in string.split(self.mountfsoptions, ','):
                if option == 'snap':
                    if not self.fstype == 'smfs':
                        panic("mountoptions has 'snap', but fstype is not smfs.")
                    manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
                    manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
        # add LMV modules
        if self.master != None:
            self.master.add_module(manager)
    def get_mount_options(self, blkdev):
        """Combine per-fstype defaults, --mountfsoptions and the config's
        mount options; for smfs also append the backing type/device."""
        options = def_mount_options(self.fstype, 'mds')

        if config.mountfsoptions:
            if options:
                options = "%s,%s" %(options, config.mountfsoptions)
            else:
                options = config.mountfsoptions
            if self.mountfsoptions:
                options = "%s,%s" %(options, self.mountfsoptions)
        else:
            if self.mountfsoptions:
                if options:
                    options = "%s,%s" %(options, self.mountfsoptions)
                else:
                    options = self.mountfsoptions

        if self.fstype == 'smfs':
            if options:
                options = "%s,type=%s,dev=%s" %(options,
                                                self.backfstype, blkdev)
            else:
                options = "type=%s,dev=%s" %(self.backfstype, blkdev)
        return options
    def prepare(self):
        """Bring the MDS device up (formatting first under --reformat)."""
        if not config.record and is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        if config.reformat:
            # run write_conf automatically, if --reformat used
            self.write_conf()
        run_acceptors()
        # prepare LMV
        if self.master != None:
            self.master.prepare()
        # never reformat here
        blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
                           self.format, self.journal_size, self.inode_size,
                           self.mkfsoptions, self.backfstype, self.backdevpath)
        if not is_prepared('MDT'):
            lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
        try:
            if self.fstype == 'smfs':
                realdev = self.fstype
            else:
                realdev = blkdev

            if self.obdtype == None:
                self.obdtype = 'dumb'

            if self.master == None:
                master_name = 'dumb'
            else:
                master_name = self.master.name

            if self.client_uuids == None:
                profile_name = 'dumb'
            else:
                profile_name = self.name

            mountfsoptions = self.get_mount_options(blkdev)

            self.info("mds", realdev, mountfsoptions, self.fstype, self.size,
                      self.format, master_name, profile_name, self.obdtype)

            lctl.newdev("mds", self.name, self.uuid,
                        setup = "%s %s %s %s %s %s" %(realdev,
                            self.fstype, profile_name, mountfsoptions,
                            master_name, self.obdtype))

            if development_mode():
                # wire up the group-membership upcall for dev trees
                procentry = "/proc/fs/lustre/mds/grp_hash_upcall"
                upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/l_getgroups")
                if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
                    print "MDS Warning: failed to set group-hash upcall"
                else:
                    run("echo ", upcall, " > ", procentry)

        except CommandError, e:
            if e.rc == 2:
                panic("MDS is missing the config log. Need to run " +
                      "lconf --write_conf.")
            else:
                raise e

        # command-line overrides take precedence over the config values
        if config.root_squash == None:
            config.root_squash = self.root_squash
        if config.no_root_squash == None:
            config.no_root_squash = self.no_root_squash
        if config.root_squash:
            if config.no_root_squash:
                nsnid = config.no_root_squash
            else:
                nsnid = "0"
            lctl.root_squash(self.name, config.root_squash, nsnid)
    def write_conf(self):
        """Record the startup/cleanup logs for every MDS client on the
        device; brings the MDS up temporarily if it was not prepared."""
        if not self.client_uuids:
            return 0
        do_cleanup = 0
        if not is_prepared(self.name):
            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.inode_size, self.mkfsoptions,
                               self.backfstype, self.backdevpath)

            if self.fstype == 'smfs':
                realdev = self.fstype
            else:
                realdev = blkdev

            # Even for writing logs we mount mds with supplied mount options
            # because it will not mount smfs (if used) otherwise.

            mountfsoptions = self.get_mount_options(blkdev)

            if self.obdtype == None:
                self.obdtype = 'dumb'

            self.info("mds", realdev, mountfsoptions, self.fstype, self.size,
                      self.format, "dumb", "dumb", self.obdtype)

            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
                                                     'dumb', mountfsoptions,
                                                     'dumb', self.obdtype))
            do_cleanup = 1

        # record logs for all MDS clients
        for obd_uuid in self.client_uuids:
            log("recording client:", obd_uuid)
            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(obd_uuid), client_uuid,
                          self.name, self.name)
            # record the setup log under the client's name ...
            config.record = 1
            lctl.clear_log(self.name, self.name)
            lctl.record(self.name, self.name)
            client.prepare()
            lctl.mount_option(self.name, client.get_name(), "")
            lctl.end_record()
            process_updates(self.db, self.name, self.name, client)
            # ... and the matching cleanup log under "<name>-clean"
            config.cleanup = 1
            lctl.clear_log(self.name, self.name + '-clean')
            lctl.record(self.name, self.name + '-clean')
            client.cleanup()
            lctl.del_mount_option(self.name)
            lctl.end_record()
            process_updates(self.db, self.name, self.name + '-clean', client)
            config.cleanup = 0
            config.record = 0

        # record logs for each client
        if config.noexec:
            noexec_opt = '-n'
        else:
            noexec_opt = ''
        if config.ldapurl:
            config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
        else:
            config_options = CONFIG_FILE
        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a funtion to test "clientness"
                # of a node.
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint','echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        config.noexec = 0
                        # re-run lconf itself in record mode for this client
                        ret, out = run (sys.argv[0], noexec_opt,
                                        " -v --record --nomod",
                                        "--record_log", client_name,
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        ret, out = run (sys.argv[0], noexec_opt,
                                        "--cleanup -v --record --nomod",
                                        "--record_log", client_name + "-clean",
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        config.noexec = old_noexec
        if do_cleanup:
            # tear down the temporary MDS instance brought up above
            try:
                lctl.cleanup(self.name, self.uuid, 0, 0)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
            Module.cleanup(self)
            clean_dev(self.devpath, self.fstype, self.backfstype,
                      self.backdevpath)
    def msd_remaining(self):
        # true while any 'mds' device is still listed by lctl
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('mds',):
                return 1
    def safe_to_clean(self):
        return self.active
    def safe_to_clean_modules(self):
        return not self.msd_remaining()
    def cleanup(self):
        """Tear down the MDS device, the LMV and (last MDS) the MDT."""
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info()
        if is_prepared(self.name):
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
                Module.cleanup(self)
        # cleanup LMV
        if self.master != None:
            self.master.cleanup()
        if not self.msd_remaining() and is_prepared('MDT'):
            try:
                lctl.cleanup("MDT", "MDT_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        clean_dev(self.devpath, self.fstype, self.backfstype,
                  self.backdevpath)
    def correct_level(self, level, op=None):
        #if self.master != None:
        #   level = level + 2
        return level
class OSD(Module):
def __init__(self, db):
Module.__init__(self, 'OSD', db)
self.osdtype = self.db.get_val('osdtype')
self.devpath = self.db.get_val('devpath', '')
self.backdevpath = self.db.get_val('backdevpath', '')
self.size = self.db.get_val_int('devsize', 0)
self.journal_size = self.db.get_val_int('journalsize', 0)
self.inode_size = self.db.get_val_int('inodesize', 0)
self.mkfsoptions = self.db.get_val('mkfsoptions', '')
self.mountfsoptions = self.db.get_val('mountfsoptions', '')
self.fstype = self.db.get_val('fstype', '')
self.backfstype = self.db.get_val('backfstype', '')
self.nspath = self.db.get_val('nspath', '')
target_uuid = self.db.get_first_ref('target')
ost = self.db.lookup(target_uuid)
self.name = ost.getName()
self.format = self.db.get_val('autoformat', 'yes')
if ost.get_val('failover', 0):
self.failover_ost = 'f'
else:
self.failover_ost = 'n'
active_uuid = get_active_target(ost)
if not active_uuid:
panic("No target device found:", target_uuid)
if active_uuid == self.uuid:
self.active = 1
else:
self.active = 0
if self.active and config.group and config.group != ost.get_val('group'):
self.active = 0
self.target_dev_uuid = self.uuid
self.uuid = target_uuid
def add_module(self, manager):
if self.active:
manager.add_lustre_module('ost', 'ost')
if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
manager.add_lustre_module(self.fstype, self.fstype)
if self.fstype:
manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
if self.fstype == 'smfs':
manager.add_lustre_module(self.backfstype, self.backfstype)
manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
for option in self.mountfsoptions:
if option == 'snap':
if not self.fstype == 'smfs':
panic("mountoptions with snap, but fstype is not smfs\n")
manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
manager.add_lustre_module(self.osdtype, self.osdtype)
def get_mount_options(self, blkdev):
options = def_mount_options(self.fstype, 'ost')
if config.mountfsoptions:
if options:
options = "%s,%s" %(options, config.mountfsoptions)
else:
options = config.mountfsoptions
if self.mountfsoptions:
options = "%s,%s" %(options, self.mountfsoptions)
else:
if self.mountfsoptions:
if options:
options = "%s,%s" %(options, self.mountfsoptions)
else:
options = self.mountfsoptions
if self.fstype == 'smfs':
if options:
options = "%s,type=%s,dev=%s" %(options,
self.backfstype, blkdev)
else:
options = "type=%s,dev=%s" %(self.backfstype,
blkdev)
return options
# need to check /proc/mounts and /etc/mtab before
# formatting anything.
# FIXME: check if device is already formatted.
def prepare(self):
if is_prepared(self.name):
return
if not self.active:
debug(self.uuid, "not active")
return
run_acceptors()
if self.osdtype == 'obdecho':
blkdev = ''
else:
blkdev = block_dev(self.devpath, self.size, self.fstype,
config.reformat, self.format, self.journal_size,
self.inode_size, self.mkfsoptions, self.backfstype,
self.backdevpath)
if self.fstype == 'smfs':
realdev = self.fstype
else:
realdev = blkdev
mountfsoptions = self.get_mount_options(blkdev)
self.info(self.osdtype, realdev, mountfsoptions, self.fstype,
self.size, self.format, self.journal_size, self.inode_size)
lctl.newdev(self.osdtype, self.name, self.uuid,
setup ="%s %s %s %s" %(realdev, self.fstype,
self.failover_ost,
mountfsoptions))
if not is_prepared('OSS'):
lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
def osd_remaining(self):
out = lctl.device_list()
for s in out:
if string.split(s)[2] in ('obdfilter', 'obdecho'):
return 1
def safe_to_clean(self):
return self.active
def safe_to_clean_modules(self):
return not self.osd_remaining()
def cleanup(self):
if not self.active:
debug(self.uuid, "not active")
return
if is_prepared(self.name):
self.info()
try:
lctl.cleanup(self.name, self.uuid, config.force,
config.failover)
except CommandError, e:
log(self.module_name, "cleanup failed: ", self.name)
e.dump()
cleanup_error(e.rc)
if not self.osd_remaining() and is_prepared('OSS'):
try:
lctl.cleanup("OSS", "OSS_UUID", config.force,
config.failover)
except CommandError, e:
print "cleanup failed: ", self.name
e.dump()
cleanup_error(e.rc)
if not self.osdtype == 'obdecho':
clean_dev(self.devpath, self.fstype, self.backfstype,
self.backdevpath)
def correct_level(self, level, op=None):
return level
def mgmt_uuid_for_fs(mtpt_name):
    """Return the mgmt-service uuid for the filesystem mounted at
    mtpt_name, or '' when there is no mountpoint or no filesystem."""
    if not mtpt_name:
        return ''
    mtpt_db = toplustreDB.lookup_name(mtpt_name)
    filesystem = toplustreDB.lookup(mtpt_db.get_first_ref('filesystem'))
    if not filesystem:
        return ''
    return filesystem.get_first_ref('mgmt')
# Generic client module, used by OSC and MDC
# Generic client module, used by OSC and MDC
class Client(Module):
    """Generic client device: connects to a (possibly routed) server
    and instantiates the local client obd for it."""
    # NOTE(review): Module.__init__ is deliberately not called; this
    # class sets up the Module attributes (name, uuid, db, ...) itself.
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
                 module_dir=None):
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.module_dir = module_dir
        self.module = module
        self.db = tgtdb
        self.active = 1
        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target(1):", self.target_name)
        self._server = None
        self._connected = 0
        self.module = module
        self.module_name = string.upper(module)
        if not self_name:
            # default device name: e.g. OSC_<host>_<target>_<fsname>
            self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                         self.target_name, fs_name)
        else:
            self.name = self_name
        self.uuid = uuid
        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        if mgmt_uuid:
            self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        else:
            self.mgmt_name = ''
        self.fs_name = fs_name
        if not self.module_dir:
            self.module_dir = module
    def add_module(self, manager):
        manager.add_lustre_module(self.module_dir, self.module)
    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)
    def get_name(self):
        return self.name
    def get_servers(self):
        return self._server_nets
    def prepare(self, ignore_connect_failure = 0):
        """Connect to the server (directly or via routes) and create
        the local client device."""
        self.info(self.target_uuid)
        if not config.record and is_prepared(self.name):
            self.cleanup()
        try:
            srv = choose_local_server(self.get_servers())
            if srv:
                lctl.connect(srv)
            else:
                # no local server: go through the router table
                routes = find_route(self.get_servers())
                if len(routes) == 0:
                    panic ("no route to", self.target_uuid)
                for (srv, r) in routes:
                    lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
                raise e
        # NOTE(review): if choose_local_server() itself raised and the
        # failure was ignored, 'srv' would be unbound here -- verify.
        if srv:
            if self.permits_inactive() and (self.target_uuid in config.inactive or self.active == 0):
                debug("%s inactive" % self.target_uuid)
                inactive_p = "inactive"
            else:
                debug("%s active" % self.target_uuid)
                inactive_p = ""
            lctl.newdev(self.module, self.name, self.uuid,
                        setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                                inactive_p, self.mgmt_name))
    def cleanup(self):
        """Tear down the client device and its connection/routes."""
        if is_prepared(self.name):
            Module.cleanup(self)
            try:
                srv = choose_local_server(self.get_servers())
                if srv:
                    lctl.disconnect(srv)
                else:
                    for (srv, r) in find_route(self.get_servers()):
                        lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
    def correct_level(self, level, op=None):
        return level
    def deactivate(self):
        """Mark the client device inactive via lctl; errors are logged."""
        try:
            lctl.deactivate(self.name)
        except CommandError, e:
            log(self.module_name, "deactivate failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
class MDC(Client):
    """Metadata client device for one MDS target."""
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'mdc', fs_name)

    def permits_inactive(self):
        # a client filesystem cannot operate without its MDS
        return 0
class OSC(Client):
    """Object storage client device for one OST target."""
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'osc', fs_name)

    def permits_inactive(self):
        # individual OSTs may be deactivated without losing the fs
        return 1
def mgmtcli_name_for_uuid(uuid):
    """Derive the management-client device name from a mgmt uuid."""
    return 'MGMTCLI_%s' % (uuid,)
class ManagementClient(Client):
    """Client of the management service; its device name is derived
    from the mgmt service uuid so it can be located again later."""
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')
class CMOBD(Module):
    """Cache-management obd: pairs a master obd with a cache obd."""
    def __init__(self, db):
        Module.__init__(self, 'CMOBD', db)
        self.name = self.db.getName();
        self.uuid = generate_client_uuid(self.name)
        self.master_uuid = self.db.get_first_ref('masterobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        master_obd = self.db.lookup(self.master_uuid)
        if not master_obd:
            panic('master obd not found:', self.master_uuid)
        cache_obd = self.db.lookup(self.cache_uuid)
        if not cache_obd:
            panic('cache obd not found:', self.cache_uuid)
        self.master = None
        self.cache = None
        # build the client module for each side based on its obd class
        master_class = master_obd.get_class()
        cache_class = cache_obd.get_class()
        if master_class == 'ost' or master_class == 'lov':
            client_uuid = "%s_lov_master_UUID" % (self.name)
            self.master = LOV(master_obd, client_uuid, self.name);
        elif master_class == 'mds':
            self.master = get_mdc(db, self.name, self.master_uuid)
        elif master_class == 'lmv':
            client_uuid = "%s_lmv_master_UUID" % (self.name)
            self.master = LMV(master_obd, client_uuid, self.name);
        else:
            panic("unknown master obd class '%s'" %(master_class))
        if cache_class == 'ost' or cache_class == 'lov':
            client_uuid = "%s_lov_cache_UUID" % (self.name)
            self.cache = LOV(cache_obd, client_uuid, self.name);
        elif cache_class == 'mds':
            self.cache = get_mdc(db, self.name, self.cache_uuid)
        elif cache_class == 'lmv':
            client_uuid = "%s_lmv_cache_UUID" % (self.name)
            self.cache = LMV(cache_obd, client_uuid, self.name);
        else:
            panic("unknown cache obd class '%s'" %(cache_class))
    def prepare(self):
        # the master must be up before the cmobd device itself
        self.master.prepare()
        if not config.record and is_prepared(self.name):
            return
        self.info(self.master_uuid, self.cache_uuid)
        lctl.newdev("cmobd", self.name, self.uuid,
                    setup ="%s %s" %(self.master.uuid,
                                     self.cache.uuid))
    def get_uuid(self):
        return self.uuid
    def get_name(self):
        return self.name
    def get_master_name(self):
        return self.master.name
    def get_cache_name(self):
        return self.cache.name
    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        # NOTE(review): only the master is cleaned up here; the cache
        # side is left alone -- confirm that is intentional.
        if self.master:
            self.master.cleanup()
    def add_module(self, manager):
        manager.add_lustre_module('cmobd', 'cmobd')
        self.master.add_module(manager)
    def correct_level(self, level, op=None):
        return level
class COBD(Module):
    """Caching obd: layers a cache device over a master device."""
    # NOTE(review): the 'uuid' argument is unused; a fresh client uuid
    # is generated from the device name instead -- verify callers.
    def __init__(self, db, uuid, name):
        Module.__init__(self, 'COBD', db)
        self.name = self.db.getName();
        self.uuid = generate_client_uuid(self.name)
        self.master_uuid = self.db.get_first_ref('masterobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        master_obd = self.db.lookup(self.master_uuid)
        if not master_obd:
            panic('master obd not found:', self.master_uuid)
        cache_obd = self.db.lookup(self.cache_uuid)
        if not cache_obd:
            panic('cache obd not found:', self.cache_uuid)
        self.master = None
        self.cache = None
        # build the client module for each side based on its obd class;
        # note LOV/MDC use the caller-supplied 'name', LMV uses self.name
        master_class = master_obd.get_class()
        cache_class = cache_obd.get_class()
        if master_class == 'ost' or master_class == 'lov':
            client_uuid = "%s_lov_master_UUID" % (self.name)
            self.master = LOV(master_obd, client_uuid, name);
        elif master_class == 'mds':
            self.master = get_mdc(db, name, self.master_uuid)
        elif master_class == 'lmv':
            client_uuid = "%s_lmv_master_UUID" % (self.name)
            self.master = LMV(master_obd, client_uuid, self.name);
        else:
            panic("unknown master obd class '%s'" %(master_class))
        if cache_class == 'ost' or cache_class == 'lov':
            client_uuid = "%s_lov_cache_UUID" % (self.name)
            self.cache = LOV(cache_obd, client_uuid, name);
        elif cache_class == 'mds':
            self.cache = get_mdc(db, name, self.cache_uuid)
        elif cache_class == 'lmv':
            client_uuid = "%s_lmv_cache_UUID" % (self.name)
            self.cache = LMV(cache_obd, client_uuid, self.name);
        else:
            panic("unknown cache obd class '%s'" %(cache_class))
    def get_uuid(self):
        return self.uuid
    def get_name(self):
        return self.name
    def get_master_name(self):
        return self.master.name
    def get_cache_name(self):
        return self.cache.name
    def prepare(self):
        # both sides must be up before the cobd device itself
        self.master.prepare()
        self.cache.prepare()
        if not config.record and is_prepared(self.name):
            return
        self.info(self.master_uuid, self.cache_uuid)
        lctl.newdev("cobd", self.name, self.uuid,
                    setup ="%s %s" %(self.master.name,
                                     self.cache.name))
    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.master.cleanup()
        self.cache.cleanup()
    def add_module(self, manager):
        # NOTE(review): only the master's modules are registered here,
        # unlike prepare/cleanup which handle both sides -- confirm.
        manager.add_lustre_module('cobd', 'cobd')
        self.master.add_module(manager)
# virtual interface for OSC and LOV
class VOSC(Module):
    """Dispatch wrapper presenting one OSC-like interface over an OSC,
    LOV or COBD, chosen from the target database's class."""
    def __init__(self, db, client_uuid, name, name_override = None):
        Module.__init__(self, 'VOSC', db)
        target_class = db.get_class()
        if target_class == 'lov':
            self.osc = LOV(db, client_uuid, name, name_override)
            self.type = 'lov'
        elif target_class == 'cobd':
            self.osc = COBD(db, client_uuid, name)
            self.type = 'cobd'
        else:
            self.osc = OSC(db, client_uuid, name)
            self.type = 'osc'

    def get_uuid(self):
        return self.osc.get_uuid()

    def get_name(self):
        return self.osc.get_name()

    def prepare(self):
        self.osc.prepare()

    def cleanup(self):
        self.osc.cleanup()

    def add_module(self, manager):
        self.osc.add_module(manager)

    def correct_level(self, level, op=None):
        return self.osc.correct_level(level, op)
# virtual interface for MDC and LMV
class VMDC(Module):
def __init__(self, db, client_uuid, name, name_override = None):
Module.__init__(self, 'VMDC', db)
if db.get_class() == 'lmv':
self.mdc = LMV(db, client_uuid, name, name_override)
elif db.get_class() == 'cobd':
self.mdc = COBD(db, client_uuid, name)
else:
self.mdc = MDC(db, client_uuid, name)
def get_uuid(self):
return self.mdc.uuid
def get_name(self):
return self.mdc.name
def prepare(self):
self.mdc.prepare()
def cleanup(self):
self.mdc.cleanup()
def add_module(self, manager):
self.mdc.add_module(manager)
def correct_level(self, level, op=None):
return self.mdc.correct_level(level, op)
class ECHO_CLIENT(Module):
    # echo_client device for the obdecho test harness; stacks on whatever
    # OBD (osc/lov/cobd) the config references, via a VOSC wrapper.
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        # fresh random client uuid each run
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)
    def prepare(self):
        # When recording a config log, always emit the commands.
        if not config.record and is_prepared(self.name):
            return
        run_acceptors()
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)
        lctl.newdev("echo_client", self.name, self.uuid,
                    setup = self.osc.get_name())
    def cleanup(self):
        # Detach the echo_client itself, then the underlying OBD stack.
        if is_prepared(self.name):
            Module.cleanup(self)
        self.osc.cleanup()
    def add_module(self, manager):
        # Need the underlying OBD's modules plus obdecho itself.
        self.osc.add_module(manager)
        manager.add_lustre_module('obdecho', 'obdecho')
    def correct_level(self, level, op=None):
        # echo clients start at their configured level unchanged.
        return level
def generate_client_uuid(name):
    """Build a pseudo-random client UUID, at most 36 characters long.

    The name is truncated to 19 characters and sandwiched between a
    random 5-hex-digit field and a random 10-hex-digit field, so repeated
    mounts of the same name get distinct UUIDs.
    """
    fmt = '%05x_%.19s_%05x%05x'
    uuid = fmt % (int(random.random() * 1048576),
                  name,
                  int(random.random() * 1048576),
                  int(random.random() * 1048576))
    return uuid[:36]
class Mountpoint(Module):
    # A client mountpoint: assembles the data (VOSC) and metadata (VMDC)
    # client stacks for a filesystem and mounts/umounts lustre_lite on
    # the configured path.
    def __init__(self,db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.clientoptions = self.db.get_val('clientoptions', '')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        # Prefer an LMV (clustered metadata) reference; fall back to a
        # plain MDS reference.
        self.mds_uuid = fs.get_first_ref('lmv')
        if not self.mds_uuid:
            self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        client_uuid = generate_client_uuid(self.name)
        ost = self.db.lookup(self.obd_uuid)
        if not ost:
            panic("no ost: ", self.obd_uuid)
        mds = self.db.lookup(self.mds_uuid)
        if not mds:
            panic("no mds: ", self.mds_uuid)
        self.vosc = VOSC(ost, client_uuid, self.name, self.name)
        self.vmdc = VMDC(mds, client_uuid, self.name, self.name)
        if self.mgmt_uuid:
            self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
                                            client_uuid)
        else:
            self.mgmtcli = None
    def prepare(self):
        # Bring up the client stacks, then do the actual mount (unless we
        # are only recording/dumping lctl commands).
        if not config.record and fs_is_mounted(self.path):
            log(self.path, "already mounted.")
            return
        run_acceptors()
        if self.mgmtcli:
            self.mgmtcli.prepare()
        self.vosc.prepare()
        self.vmdc.prepare()
        vmdc_name = self.vmdc.get_name()

        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.record or config.lctl_dump:
            # record mode: write the mount option into the config log
            # instead of mounting.
            lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
            return

        # Merge command-line client options with the configured ones.
        if config.clientoptions:
            if self.clientoptions:
                self.clientoptions = self.clientoptions + ',' + \
                                     config.clientoptions
            else:
                self.clientoptions = config.clientoptions

        if self.clientoptions:
            self.clientoptions = ',' + self.clientoptions
            # Linux kernel will deal with async and not pass it to ll_fill_super,
            # so replace it with Lustre async
            self.clientoptions = string.replace(self.clientoptions, "async",
                                                "lasync")

        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
              (self.vosc.get_name(), vmdc_name, self.clientoptions,
               config.config, self.path)
        run("mkdir", self.path)
        ret, val = run(cmd)
        if ret:
            # Mount failed: unwind the client stacks before panicking.
            self.vmdc.cleanup()
            self.vosc.cleanup()
            panic("mount failed:", self.path, ":", string.join(val))
    def cleanup(self):
        self.info(self.path, self.mds_uuid,self.obd_uuid)

        if config.record or config.lctl_dump:
            lctl.del_mount_option(local_node_name)
        else:
            if fs_is_mounted(self.path):
                if config.force:
                    (rc, out) = run("umount", "-f", self.path)
                else:
                    (rc, out) = run("umount", self.path)
                if rc:
                    raise CommandError('umount', out, rc)

            if fs_is_mounted(self.path):
                panic("fs is still mounted:", self.path)

        # Tear down the client stacks after the filesystem is unmounted.
        self.vmdc.cleanup()
        self.vosc.cleanup()
        if self.mgmtcli:
            self.mgmtcli.cleanup()
    def add_module(self, manager):
        # Modules for the full client stack: mdc, data/metadata wrappers,
        # and finally the llite client filesystem itself.
        manager.add_lustre_module('mdc', 'mdc')

        if self.mgmtcli:
            self.mgmtcli.add_module(manager)

        self.vosc.add_module(manager)
        self.vmdc.add_module(manager)

        manager.add_lustre_module('llite', 'llite')
    def correct_level(self, level, op=None):
        # Mountpoints start at their configured level unchanged.
        return level
# ============================================================
# misc query functions
def get_ost_net(self, osd_uuid):
    """Return the list of Network objects of the node hosting *osd_uuid*.

    Returns an empty list when osd_uuid is empty/None; panics when the
    OSD's node reference cannot be resolved.
    """
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        # BUG FIX: this previously referenced the undefined name
        # 'node_uuid_', so the error path raised NameError instead of
        # producing the intended panic message.
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list
# the order of initialization is based on level.
# the order of initialization is based on level.
def getServiceLevel(self):
    """Map a service's config class to its startup level.

    Returns 0 when the level falls outside the configured
    [minlevel, maxlevel] window; panics on an unknown class.
    """
    levels = {
        'network'    : 5,
        'routetbl'   : 6,
        'ldlm'       : 20,
        'osd'        : 30,
        'cobd'       : 30,
        'mdsdev'     : 40,
        'lmv'        : 45,
        'cmobd'      : 50,
        'mountpoint' : 70,
        'echoclient' : 70,
    }
    type = self.get_class()
    if not levels.has_key(type):
        panic("Unknown type: ", type)
    ret = levels[type]
    if ret < config.minlevel or ret > config.maxlevel:
        ret = 0
    return ret
#
# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
#
# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    """Collect every service referenced by this profile as a sorted
    list of (level, db_object) tuples; level-0 services are skipped."""
    services = []
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        if not servdb:
            panic('service not found: ' + ref_uuid)
        else:
            level = getServiceLevel(servdb)
            if level > 0:
                services.append((level, servdb))
    services.sort()
    return services
############################################################
# MDC UUID hack -
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    """Fabricate an OSC client object for *ost_db* (OSCs no longer
    appear in the XML config, so they are created on the fly)."""
    return OSC(ost_db, uuid, fs_name)
def get_mdc(db, fs_name, mds_uuid):
    """Fabricate an MDC client for the MDS identified by *mds_uuid*."""
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        error("no mds:", mds_uuid)
    return MDC(mds_db, mds_uuid, fs_name)
############################################################
# routing ("rooting")
# list of (nettype, cluster_id, nid)
local_clusters = []
def find_local_clusters(node_db):
    # Record every (net_type, cluster_id, nid) this node is attached to,
    # and create an acceptor handler for each distinct listening port.
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        srv = Network(net)
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        if srv.port > 0:
            # Two networks must not share a listening port.
            if acceptors.has_key(srv.port):
                panic("duplicate port:", srv.port)
            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
# This node is a gateway.
is_router = 0
def node_is_router():
    """True when the current node is configured as a portals router."""
    return is_router
# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
needs_router = 0
def node_needs_router():
    """True when this node must load kptlrouter: either it is a router
    itself, or routers exist somewhere in the config."""
    return needs_router or is_router
# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to kptlrouter route
# table if they are needed to connect to a specific server. This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.
local_routes = []
def find_local_routes(lustre):
    """ Scan the lustre config looking for routers .  Build list of
    routes. """
    global local_routes, needs_router
    local_routes = []
    list = lustre.lookup_class('node')
    for router in list:
        if router.get_val_int('router', 0):
            needs_router = 1
            # For each cluster this node sits on, see whether the router
            # has an interface (gateway nid) on the same cluster.
            for (local_type, local_cluster_id, local_nid) in local_clusters:
                gw = None
                for netuuid in router.get_networks():
                    db = router.lookup(netuuid)
                    if (local_type == db.get_val('nettype') and
                        local_cluster_id == db.get_val('clusterid')):
                        gw = db.get_val('nid')
                        break
                if gw:
                    debug("find_local_routes: gw is", gw)
                    # Collect every route reachable through this gateway.
                    for route in router.get_local_routes(local_type, gw):
                        local_routes.append(route)
    debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    """Return the first server reachable on a directly attached cluster,
    or None when no server in *srv_list* is local."""
    for candidate in srv_list:
        if local_cluster(candidate.net_type, candidate.cluster_id):
            return candidate
    return None
def local_cluster(net_type, cluster_id):
    """Return 1 when this node is attached to (net_type, cluster_id),
    else 0."""
    for (ltype, lcluster, lnid) in local_clusters:
        if ltype == net_type and lcluster == cluster_id:
            return 1
    return 0
def local_interface(net_type, cluster_id, nid):
    """Return 1 when this node owns the exact interface
    (net_type, cluster_id, nid), else 0."""
    for (ltype, lcluster, lnid) in local_clusters:
        if ltype == net_type and lcluster == cluster_id and lnid == nid:
            return 1
    return 0
def find_route(srv_list):
    """For every server in *srv_list*, collect (srv, route) pairs for each
    local route whose target cluster matches and whose [lo, hi] nid range
    contains the server's nid."""
    result = []
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        debug ('looking for route to', srv.net_type, srv.nid)
        for r in local_routes:
            debug("find_route: ", r)
            # route tuple layout: (nettype, gw, tgt_cluster_id, lo, hi)
            if srv.cluster_id == r[2] and r[3] <= srv.nid and srv.nid <= r[4]:
                result.append((srv, r))
    return result
def get_active_target(db):
    """Return the uuid of the active target device for *db*.

    A node chosen on the command line via --select takes precedence;
    otherwise the config's 'active' reference is used.
    """
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        return db.get_node_tgt_dev(node_name, target_uuid)
    return db.get_first_ref('active')
def get_server_by_nid_uuid(db, nid_uuid):
    """Return the Network whose nid_uuid matches, or None when absent."""
    for node in db.lookup_class("network"):
        net = Network(node)
        if net.nid_uuid == nid_uuid:
            return net
    return None
############################################################
# lconf level logic
# Start a service.
# Start a service.
def newService(db):
    """Instantiate the Module subclass matching *db*'s service class.

    Panics on an unknown service class.
    """
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    if type == 'ldlm':
        return LDLM(db)
    if type == 'lov':
        return LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    if type == 'network':
        return Network(db)
    if type == 'routetbl':
        return RouteTable(db)
    if type == 'osd':
        return OSD(db)
    if type == 'cobd':
        return COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    if type == 'cmobd':
        return CMOBD(db)
    if type == 'mdsdev':
        return MDSDEV(db)
    if type == 'mountpoint':
        return Mountpoint(db)
    if type == 'echoclient':
        return ECHO_CLIENT(db)
    if type == 'lmv':
        return LMV(db)
    panic ("unknown service type:", type)
#
# Prepare the system to run lustre using a particular profile
# in a the configuration.
# * load & the modules
# * setup networking for the current node
# * make sure partitions are in place and prepared
# * initialize devices with lctl
# Levels is important, and needs to be enforced.
def for_each_profile(db, prof_list, operation):
    """Look up each profile uuid in *prof_list* and apply *operation*
    to the profile's (level, service_db) list."""
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            panic("profile:", prof_uuid, "not found.")
        operation(getServices(prof_db))
def magic_get_osc(db, rec, lov):
if lov:
lov_uuid = lov.get_uuid()
lov_name = lov.osc.fs_name
else:
lov_uuid = rec.getAttribute('lov_uuidref')
# FIXME: better way to find the mountpoint?
filesystems = db.root_node.getElementsByTagName('filesystem')
fsuuid = None
for fs in filesystems:
ref = fs.getElementsByTagName('obd_ref')
if ref[0].getAttribute('uuidref') == lov_uuid:
fsuuid = fs.getAttribute('uuid')
break
if not fsuuid:
panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.")
mtpts = db.root_node.getElementsByTagName('mountpoint')
lov_name = None
for fs in mtpts:
ref = fs.getElementsByTagName('filesystem_ref')
if ref[0].getAttribute('uuidref') == fsuuid:
lov_name = fs.getAttribute('name')
break
if not lov_name:
panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.")
print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name
ost_uuid = rec.getAttribute('ost_uuidref')
obd = db.lookup(ost_uuid)
if not obd:
panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.")
osc = get_osc(obd, lov_uuid, lov_name)
if not osc:
panic('osc not found:', obd_uuid)
return osc
# write logs for update records. sadly, logs of all types -- and updates in
# particular -- are something of an afterthought. lconf needs to be rewritten
# with these as core concepts. so this is a pretty big hack.
def process_update_record(db, update, lov):
    # Replay one <update> element: handle each add/deactivate/delete child
    # record against the LOV it names.
    for rec in update.childNodes:
        if rec.nodeType != rec.ELEMENT_NODE:
            continue

        log("found "+rec.nodeName+" record in update version " +
            str(update.getAttribute('version')))

        lov_uuid = rec.getAttribute('lov_uuidref')
        ost_uuid = rec.getAttribute('ost_uuidref')
        index = rec.getAttribute('index')
        gen = rec.getAttribute('generation')

        if not lov_uuid or not ost_uuid or not index or not gen:
            panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.")

        if not lov:
            tmplov = db.lookup(lov_uuid)
            if not tmplov:
                panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.")
            lov_name = tmplov.getName()
        else:
            lov_name = lov.osc.name

        # ------------------------------------------------------------- add
        if rec.nodeName == 'add':
            if config.cleanup:
                # on cleanup an 'add' is undone, i.e. deleted
                lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
                continue

            osc = magic_get_osc(db, rec, lov)

            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e

            lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen)

        # ------------------------------------------------------ deactivate
        elif rec.nodeName == 'deactivate':
            if config.cleanup:
                continue

            osc = magic_get_osc(db, rec, lov)

            try:
                osc.deactivate()
            except CommandError, e:
                print "Error deactivating OSC %s\n" % osc.uuid
                raise e

        # ---------------------------------------------------------- delete
        elif rec.nodeName == 'delete':
            if config.cleanup:
                continue

            osc = magic_get_osc(db, rec, lov)

            try:
                # temporarily flip cleanup mode so the OSC tears down
                config.cleanup = 1
                osc.cleanup()
                config.cleanup = 0
            except CommandError, e:
                print "Error cleaning up OSC %s\n" % osc.uuid
                raise e

            lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
def process_updates(db, log_device, log_name, lov = None):
    """Replay every <update> element in the config, recording each one
    into its own llog named "<log_name>-<version>" on *log_device*."""
    for update in db.root_node.getElementsByTagName('update'):
        if not update.childNodes:
            log("ignoring empty update record (version " +
                str(update.getAttribute('version')) + ")")
            continue
        real_name = "%s-%s" % (log_name, update.getAttribute('version'))
        lctl.clear_log(log_device, real_name)
        lctl.record(log_device, real_name)
        process_update_record(db, update, lov)
        lctl.end_record()
def doWriteconf(services):
    """Run write_conf on every mdsdev service in *services*.

    Note: --nosetup is deliberately not honoured here (the check is
    commented out upstream); write_conf always runs.
    """
    #if config.nosetup:
    #    return
    for entry in services:
        service_db = entry[1]
        if service_db.get_class() == 'mdsdev':
            newService(service_db).write_conf()
def doSetup(services):
    """Instantiate every service, re-sort by its corrected level, and
    prepare each one in ascending level order."""
    if config.nosetup:
        return
    # first pass: build module objects tagged with their raw level
    instances = []
    for (level, service_db) in services:
        inst = newService(service_db)
        inst.level = level
        instances.append((inst.level, inst))
    # second pass: let each module adjust its own startup level
    ordered = []
    for (level, inst) in instances:
        ordered.append((inst.correct_level(level), inst))
    ordered.sort()
    for (level, inst) in ordered:
        inst.prepare()
def doLoadModules(services):
    """Register every service's kernel modules, then load them all."""
    if config.nomod:
        return
    # adding all needed modules from all services
    for (level, service_db) in services:
        newService(service_db).add_module(mod_manager)
    # loading all registered modules
    mod_manager.load_modules()
def doUnloadModules(services):
    """Register modules of the services that are safe to clean, then
    unload all registered modules."""
    if config.nomod:
        return
    for (level, service_db) in services:
        inst = newService(service_db)
        if inst.safe_to_clean_modules():
            inst.add_module(mod_manager)
    # unloading all registered modules
    mod_manager.cleanup_modules()
def doCleanup(services):
    """Tear services down in descending (reverse of setup) level order,
    skipping any service that reports it is unsafe to clean."""
    if config.nosetup:
        return
    instances = []
    for (level, service_db) in services:
        inst = newService(service_db)
        inst.level = level
        instances.append((inst.level, inst))
    ordered = []
    for (level, inst) in instances:
        ordered.append((inst.correct_level(level), inst))
    ordered.sort()
    ordered.reverse()
    for (level, inst) in ordered:
        if inst.safe_to_clean():
            inst.cleanup()
#
# Load the profile(s) for the given host and run the requested action.
def doHost(lustreDB, hosts):
    # Find this host's node entry and run the requested top-level action:
    # --write_conf, --recover, --cleanup, or (default) setup.
    global is_router, local_node_name
    node_db = None
    for h in hosts:
        node_db = lustreDB.lookup_name(h, 'node')
        if node_db:
            break
    if not node_db:
        panic('No host entry found.')

    # per-node tunables from the config (command line may override later)
    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)
    ptldebug = node_db.get_val('ptldebug', '')
    subsystem = node_db.get_val('subsystem', '')

    find_local_clusters(node_db)
    if not is_router:
        find_local_routes(lustreDB)

    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.write_conf:
        # write the client config logs on the MDS, then unload again
        for_each_profile(node_db, prof_list, doLoadModules)
        sys_make_devices()
        for_each_profile(node_db, prof_list, doWriteconf)
        for_each_profile(node_db, prof_list, doUnloadModules)
        lustreDB.close()

    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                     "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
                   config.conn_uuid)

    elif config.cleanup:
        if config.force:
            # the command line can override this value
            timeout = 5
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
            return

        sys_set_timeout(timeout)
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)
        lustreDB.close()

    else:
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
            return

        sys_make_devices()
        # grow the socket buffer limits before loading the network stack
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)

        for_each_profile(node_db, prof_list, doLoadModules)

        sys_set_debug_path()
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        if config.gdb:
            log ("The GDB module script is in", script)
            # pause, so user has time to break and
            # load the script
            time.sleep(5)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doSetup)
        lustreDB.close()
def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
    # Reconnect *client_uuid* from the failed connection *nid_uuid* to the
    # currently active device of target *tgt_uuid*.
    tgt = lustreDB.lookup(tgt_uuid)
    if not tgt:
        raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
    new_uuid = get_active_target(tgt)
    if not new_uuid:
        raise Lustre.LconfError("doRecovery: no active target found for: " +
                                tgt_uuid)
    net = choose_local_server(get_ost_net(lustreDB, new_uuid))
    if not net:
        raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)

    log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
    try:
        oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
        lustreDB.close()
        if oldnet:
            lctl.disconnect(oldnet)
    except CommandError, e:
        # best effort: a dead connection may already be gone
        log("recover: disconnect", nid_uuid, "failed: ")
        e.dump()

    try:
        lctl.connect(net)
    except CommandError, e:
        log("recover: connect failed")
        e.dump()

    lctl.recover(client_uuid, net.nid_uuid)
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    # Derive config.lustre / config.portals module search paths from the
    # location of the lconf executable (*cmd*).
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            debug('using objdir module paths')
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        if config.portals:
            portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # production mode
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)
def sysctl(path, val):
    # Write *val* to /proc/sys/<path>.  Honours --noexec (dry run) and
    # panics when the proc file cannot be written.
    debug("+ sysctl", path, val)
    if config.noexec:
        return
    try:
        fp = open(os.path.join('/proc/sys', path), 'w')
        fp.write(str(val))
        fp.close()
    except IOError, e:
        panic(str(e))
def sys_set_debug_path():
    """Point portals at the configured debug dump path (--debug_path)."""
    sysctl('portals/debug_path', config.debug_path)
def sys_set_lustre_upcall(upcall):
    """Install the lustre upcall script; --lustre_upcall wins over
    --upcall, which wins over the node-config value in *upcall*."""
    chosen = config.lustre_upcall or config.upcall or upcall
    if chosen:
        lctl.set_lustre_upcall(chosen)
def sys_set_portals_upcall(upcall):
    """Install the portals upcall script; --portals_upcall wins over
    --upcall, which wins over the node-config value in *upcall*."""
    chosen = config.portals_upcall or config.upcall or upcall
    if chosen:
        sysctl('portals/upcall', chosen)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout; a positive --timeout on the
    command line overrides the node-config value."""
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal ():
    # reserve at least 8MB, or we run out of RAM in skb_alloc under read
    if sys_get_branch() == '2.6':
        fp = open('/proc/meminfo')
        lines = fp.readlines()
        fp.close()
        # default if MemTotal is missing from /proc/meminfo (in kB)
        memtotal = 131072
        for l in lines:
            a = string.split(l)
            if a[0] == 'MemTotal:':
                memtotal = a[1]
                debug("memtotal" + memtotal)
        # keep 1/16 of RAM free on small machines, 32MB otherwise
        if int(memtotal) < 262144:
            minfree = int(memtotal) / 16
        else:
            minfree = 32768
        debug("+ minfree ", minfree)
        sysctl("vm/min_free_kbytes", minfree)
    if config.single_socket:
        sysctl("socknal/typed", 0)
def sys_optimize_elan():
    """Tune Quadrics Elan NICs: write 1 to each eventint punt-loop knob
    that exists and is writable on this machine."""
    for knob in ["/proc/elan/config/eventint_punt_loops",
                 "/proc/qsnet/elan3/config/eventint_punt_loops",
                 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]:
        if os.access(knob, os.W_OK):
            run("echo 1 > " + knob)
def sys_set_ptldebug(ptldebug):
    # Set the portals debug mask; --ptldebug overrides the node config.
    if config.ptldebug:
        ptldebug = config.ptldebug
    if ptldebug:
        try:
            # SECURITY NOTE: the mask expression is eval'd against the
            # ptldebug_names table; unknown names raise NameError, but
            # the value should still come from a trusted config/operator.
            val = eval(ptldebug, ptldebug_names)
            val = "0x%x" % (val)
            sysctl('portals/debug', val)
        except NameError, e:
            panic(str(e))
def sys_set_subsystem(subsystem):
    # Set the portals subsystem debug mask; --subsystem overrides config.
    if config.subsystem:
        subsystem = config.subsystem
    if subsystem:
        try:
            # SECURITY NOTE: eval'd against subsystem_names, same caveat
            # as sys_set_ptldebug -- operator-supplied input only.
            val = eval(subsystem, subsystem_names)
            val = "0x%x" % (val)
            sysctl('portals/subsystem_debug', val)
        except NameError, e:
            panic(str(e))
def sys_set_netmem_max(path, max):
    """Raise the kernel limit stored in *path* (e.g. rmem_max) to at
    least *max*; never lowers an already larger value."""
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    # read the current limit (renamed local: the old code shadowed the
    # builtin 'str')
    fp = open(path)
    current_line = fp.readline()
    fp.close()
    cur = int(current_line)
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' %(max))
        fp.close()
def sys_make_devices():
    """Create the portals and obd character device nodes when missing."""
    for (node, cmd) in [('/dev/portals', 'mknod /dev/portals c 10 240'),
                        ('/dev/obd', 'mknod /dev/obd c 10 241')]:
        if not os.access(node, os.R_OK):
            run(cmd)
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
syspath = string.split(os.environ['PATH'], ':')
if new_dir in syspath:
return
os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Default file for kernel debug dumps; prefer the /r ramdisk root
    when it exists."""
    if os.path.isdir('/r'):
        return '/r/tmp/lustre-log'
    return '/tmp/lustre-log'
def default_gdb_script():
    """Default location of the generated gdb module script; prefer the
    /r ramdisk root when it exists."""
    if os.path.isdir('/r'):
        return '/r/tmp/ogdb'
    return '/tmp/ogdb'
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
# ensure basic elements are in the system path
def sanitise_path():
    """Guarantee the standard sbin/bin directories appear in $PATH."""
    for d in DEFAULT_PATH:
        add_to_path(d)
# global hack for the --select handling
tgt_select = {}
def init_select(args):
    """Populate the global tgt_select map from --select arguments.

    Each argument looks like "service=nodeA,service2=nodeB".
    """
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    global tgt_select
    for arg in args:
        for entry in string.split(arg, ','):
            srv, node = string.split(entry, '=')
            tgt_select[srv] = node
def get_select(srv):
    """Return the node chosen via --select for service *srv*, or None."""
    return tgt_select.get(srv)
FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST
lconf_options = [
('verbose,v', "Print system commands as they are run"),
('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
('config', "Cluster config name used for LDAP query", PARAM),
('select', "service=nodeA,service2=nodeB ", PARAMLIST),
('node', "Load config for <nodename>", PARAM),
('cleanup,d', "Cleans up config. (Shutdown)"),
('force,f', "Forced unmounting and/or obd detach during cleanup",
FLAG, 0),
('single_socket', "socknal option: only use one socket instead of bundle",
FLAG, 0),
('failover',"""Used to shut down without saving state.
This will allow this node to "give up" a service to a
another node for failover purposes. This will not
be a clean shutdown.""",
FLAG, 0),
('gdb', """Prints message after creating gdb module script
and sleeps for 5 seconds."""),
('noexec,n', """Prints the commands and steps that will be run for a
config without executing them. This can used to check if a
config file is doing what it should be doing"""),
('nomod', "Skip load/unload module step."),
('nosetup', "Skip device setup/cleanup step."),
('reformat', "Reformat all devices (without question)"),
('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
('mountfsoptions', "Additional options for mount fs command line", PARAM),
('clientoptions', "Additional options for Lustre", PARAM),
('dump', "Dump the kernel debug log to file before portals is unloaded",
PARAM),
('write_conf', "Save all the client config information on mds."),
('record', "Write config information on mds."),
('record_log', "Name of config record log.", PARAM),
('record_device', "MDS device name that will record the config commands",
PARAM),
('root_squash', "MDS squash root to appointed uid",
PARAM),
('no_root_squash', "Don't squash root for appointed nid",
PARAM),
('minlevel', "Minimum level of services to configure/cleanup",
INTPARAM, 0),
('maxlevel', """Maximum level of services to configure/cleanup
Levels are aproximatly like:
10 - netwrk
20 - device, ldlm
30 - osd, mdd
40 - mds, ost
70 - mountpoint, echo_client, osc, mdc, lov""",
INTPARAM, 100),
('lustre', """Base directory of lustre sources. This parameter will
cause lconf to load modules from a source tree.""", PARAM),
('portals', """Portals source directory. If this is a relative path,
then it is assumed to be relative to lustre. """, PARAM),
('timeout', "Set recovery timeout", INTPARAM),
('upcall', "Set both portals and lustre upcall script", PARAM),
('lustre_upcall', "Set lustre upcall script", PARAM),
('portals_upcall', "Set portals upcall script", PARAM),
('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
('ptldebug', "Set the portals debug level", PARAM),
('subsystem', "Set the portals debug subsystem", PARAM),
('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
# Client recovery options
('recover', "Recover a device"),
('group', "The group of devices to configure or cleanup", PARAM),
('tgt_uuid', "The failed target (required for recovery)", PARAM),
('client_uuid', "The failed client (required for recovery)", PARAM),
('conn_uuid', "The failed connection (required for recovery)", PARAM),
('inactive', """The name of an inactive service, to be ignored during
mounting (currently OST-only). Can be repeated.""",
PARAMLIST),
]
def main():
    # Parse options, load the config (XML file, HTTP URL, or LDAP), then
    # drive doHost() for this node and optionally record config logs.
    global lctl, config, toplustreDB, CONFIG_FILE, mod_manager

    # in the upcall this is set to SIG_IGN
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    cl = Lustre.Options("lconf", "config.xml", lconf_options)
    try:
        config, args = cl.parse(sys.argv[1:])
    except Lustre.OptionError, e:
        print e
        sys.exit(1)

    setupModulePath(sys.argv[0])

    host = socket.gethostname()

    # the PRNG is normally seeded with time(), which is not so good for starting
    # time-synchronized clusters
    input = open('/dev/urandom', 'r')
    if not input:
        print 'Unable to open /dev/urandom!'
        sys.exit(1)
    seed = input.read(32)
    input.close()
    random.seed(seed)

    sanitise_path()

    init_select(config.select)

    if len(args) > 0:
        # allow config to be fetched via HTTP, but only with python2
        if sys.version[0] != '1' and args[0].startswith('http://'):
            import urllib2
            try:
                config_file = urllib2.urlopen(args[0])
            except (urllib2.URLError, socket.error), err:
                if hasattr(err, 'args'):
                    err = err.args[1]
                print "Could not access '%s': %s" %(args[0], err)
                sys.exit(1)
        elif not os.access(args[0], os.R_OK):
            print 'File not found or readable:', args[0]
            sys.exit(1)
        else:
            # regular file
            config_file = open(args[0], 'r')
        try:
            dom = xml.dom.minidom.parse(config_file)
        except Exception:
            panic("%s does not appear to be a config file." % (args[0]))
            sys.exit(1) # make sure to die here, even in debug mode.
        config_file.close()
        CONFIG_FILE = args[0]
        lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
        if not config.config:
            # default the cluster config name to the file's basename
            config.config = os.path.basename(args[0])# use full path?
            if config.config[-4:] == '.xml':
                config.config = config.config[:-4]
    elif config.ldapurl:
        if not config.config:
            panic("--ldapurl requires --config name")
        dn = "config=%s,fs=lustre" % (config.config)
        lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
    elif config.ptldebug or config.subsystem:
        # no config at all: just poke the debug sysctls and exit
        sys_set_ptldebug(None)
        sys_set_subsystem(None)
        sys.exit(0)
    else:
        print 'Missing config file or ldap URL.'
        print 'see lconf --help for command summary'
        sys.exit(1)

    toplustreDB = lustreDB

    ver = lustreDB.get_version()
    if not ver:
        panic("No version found in config data, please recreate.")
    if ver != Lustre.CONFIG_VERSION:
        panic("Config version", ver, "does not match lconf version",
              Lustre.CONFIG_VERSION)

    # candidate node names: --node, else hostname, always 'localhost'
    node_list = []
    if config.node:
        node_list.append(config.node)
    else:
        if len(host) > 0:
            node_list.append(host)
        node_list.append('localhost')

    debug("configuring for host: ", node_list)

    if len(host) > 0:
        # per-host debug/gdb file names so shared /tmp doesn't collide
        config.debug_path = config.debug_path + '-' + host
        config.gdb_script = config.gdb_script + '-' + host

    lctl = LCTLInterface('lctl')

    if config.lctl_dump:
        lctl.use_save_file(config.lctl_dump)

    if config.record:
        if not (config.record_device and config.record_log):
            panic("When recording, both --record_log and --record_device must be specified.")
        lctl.clear_log(config.record_device, config.record_log)
        lctl.record(config.record_device, config.record_log)

    # init module manager
    mod_manager = kmod_manager(config.lustre, config.portals)

    doHost(lustreDB, node_list)

    if not config.record:
        return

    # finish the main config log, then replay any update records
    lctl.end_record()

    process_updates(lustreDB, config.record_device, config.record_log)
if __name__ == "__main__":
    # Run main(), translating the two expected failure modes into clean
    # exit codes; anything else propagates with a normal traceback.
    try:
        main()
    except Lustre.LconfError, e:
        print e
#        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    except CommandError, e:
        e.dump()
        sys.exit(e.rc)

    # a non-fatal cleanup error still makes lconf exit non-zero
    if first_cleanup_error:
        sys.exit(first_cleanup_error)