Commit 2548cb9e authored by Patrick Farrell's avatar Patrick Farrell Committed by Oleg Drokin
Browse files

LU-11670 osc: glimpse - search for active lock

When there are lock-ahead write locks on a file, the server
sends one glimpse AST RPC to each client having such (it
may have many) locks. This callback is sent to the lock
having the highest offset.

Client's glimpse callback goes up to the clio layers and
gets the global (not lock-specific) view of size.  The clio
layers are connected to the extent lock through the
l_ast_data (which points to the OSC object).

Speculative locks (AGL, lockahead) do not have l_ast_data
initialised until an IO happens under the lock. Thus, some
speculative locks may not have l_ast_data initialized.

It is possible for the client to do a write using one lock
(changing file size), but for the glimpse AST to be sent to
another lock without l_ast_data initialized.  Currently, a
lock with no l_ast_data set returns ELDLM_NO_LOCK_DATA to
the server.  In this case, this means we do not return the
updated size.

The solution is to search the granted lock tree for any lock
with initialized l_ast_data (it points to the OSC object
which is the same for all the extent locks) and to reach the
clio layers for the size through this lock instead.

Lustre-change: https://review.whamcloud.com/33660
Lustre-commit: b3461d11



cray-bug-id: LUS-6747
Signed-off-by: default avatarPatrick Farrell <pfarrell@whamcloud.com>
Change-Id: I6c60f4133154a3d6652315f155af24bbc5752dd2
Reviewed-by: default avatarAndreas Dilger <adilger@whamcloud.com>
Reviewed-by: default avatarBobi Jam <bobijam@hotmail.com>
Signed-off-by: default avatarMinh Diep <mdiep@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/36406

Tested-by: default avatarjenkins <devops@whamcloud.com>
Tested-by: default avatarMaloo <maloo@whamcloud.com>
Reviewed-by: default avatarOleg Drokin <green@whamcloud.com>
parent 810a9523
......@@ -962,6 +962,20 @@ struct ldlm_lock {
struct list_head l_exp_list;
};
/**
* Describe the overlap between two locks. itree_overlap_cb data.
*/
struct ldlm_match_data {
struct ldlm_lock *lmd_old;
struct ldlm_lock *lmd_lock;
enum ldlm_mode *lmd_mode;
union ldlm_policy_data *lmd_policy;
__u64 lmd_flags;
__u64 lmd_skip_flags;
int lmd_unref;
bool lmd_has_ast_data;
};
/** For uncommitted cross-MDT lock, store transno this lock belongs to */
#define l_transno l_client_cookie
......@@ -1539,7 +1553,8 @@ static inline enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns,
return ldlm_lock_match_with_skip(ns, flags, 0, res_id, type, policy,
mode, lh, unref);
}
struct ldlm_lock *search_itree(struct ldlm_resource *res,
struct ldlm_match_data *data);
enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh,
__u64 *bits);
void ldlm_lock_mode_downgrade(struct ldlm_lock *lock, enum ldlm_mode new_mode);
......
......@@ -414,6 +414,7 @@ extern char obd_jobid_var[];
#define OBD_FAIL_OSC_DELAY_SETTIME 0x412
#define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413
#define OBD_FAIL_OSC_DELAY_IO 0x414
#define OBD_FAIL_OSC_NO_SIZE_DATA 0x415
#define OBD_FAIL_PTLRPC 0x500
#define OBD_FAIL_PTLRPC_ACK 0x501
......
......@@ -1156,19 +1156,6 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
EXIT;
}
/**
* Describe the overlap between two locks. itree_overlap_cb data.
*/
struct lock_match_data {
struct ldlm_lock *lmd_old;
struct ldlm_lock *lmd_lock;
enum ldlm_mode *lmd_mode;
union ldlm_policy_data *lmd_policy;
__u64 lmd_flags;
__u64 lmd_skip_flags;
int lmd_unref;
};
/**
* Check if the given @lock meets the criteria for a match.
* A reference on the lock is taken if matched.
......@@ -1176,10 +1163,10 @@ struct lock_match_data {
* \param lock test-against this lock
* \param data parameters
*/
static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
static int lock_matches(struct ldlm_lock *lock, struct ldlm_match_data *data)
{
union ldlm_policy_data *lpol = &lock->l_policy_data;
enum ldlm_mode match;
enum ldlm_mode match = LCK_MINMODE;
if (lock == data->lmd_old)
return INTERVAL_ITER_STOP;
......@@ -1204,6 +1191,17 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
if (!(lock->l_req_mode & *data->lmd_mode))
return INTERVAL_ITER_CONT;
/* When we search for ast_data, we are not doing a traditional match,
* so we don't worry about IBITS or extent matching.
*/
if (data->lmd_has_ast_data) {
if (!lock->l_ast_data)
return INTERVAL_ITER_CONT;
goto matched;
}
match = lock->l_req_mode;
switch (lock->l_resource->lr_type) {
......@@ -1241,6 +1239,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
if (data->lmd_skip_flags & lock->l_flags)
return INTERVAL_ITER_CONT;
matched:
if (data->lmd_flags & LDLM_FL_TEST_LOCK) {
LDLM_LOCK_GET(lock);
ldlm_lock_touch_in_lru(lock);
......@@ -1257,7 +1256,7 @@ static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data)
static unsigned int itree_overlap_cb(struct interval_node *in, void *args)
{
struct ldlm_interval *node = to_ldlm_interval(in);
struct lock_match_data *data = args;
struct ldlm_match_data *data = args;
struct ldlm_lock *lock;
int rc;
......@@ -1277,8 +1276,8 @@ static unsigned int itree_overlap_cb(struct interval_node *in, void *args)
*
* \retval a referenced lock or NULL.
*/
static struct ldlm_lock *search_itree(struct ldlm_resource *res,
struct lock_match_data *data)
struct ldlm_lock *search_itree(struct ldlm_resource *res,
struct ldlm_match_data *data)
{
struct interval_node_extent ext = {
.start = data->lmd_policy->l_extent.start,
......@@ -1300,6 +1299,7 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res,
}
return data->lmd_lock;
}
EXPORT_SYMBOL(search_itree);
/**
......@@ -1311,7 +1311,7 @@ static struct ldlm_lock *search_itree(struct ldlm_resource *res,
* \retval a referenced lock or NULL.
*/
static struct ldlm_lock *search_queue(struct list_head *queue,
struct lock_match_data *data)
struct ldlm_match_data *data)
{
struct ldlm_lock *lock;
int rc;
......@@ -1404,7 +1404,7 @@ enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns,
enum ldlm_mode mode,
struct lustre_handle *lockh, int unref)
{
struct lock_match_data data = {
struct ldlm_match_data data = {
.lmd_old = NULL,
.lmd_lock = NULL,
.lmd_mode = &mode,
......@@ -1412,6 +1412,7 @@ enum ldlm_mode ldlm_lock_match_with_skip(struct ldlm_namespace *ns,
.lmd_flags = flags,
.lmd_skip_flags = skip_flags,
.lmd_unref = unref,
.lmd_has_ast_data = false,
};
struct ldlm_resource *res;
struct ldlm_lock *lock;
......
......@@ -554,6 +554,10 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
struct ost_lvb *lvb;
struct req_capsule *cap;
struct cl_object *obj = NULL;
struct ldlm_resource *res = dlmlock->l_resource;
struct ldlm_match_data matchdata = { 0 };
union ldlm_policy_data policy;
enum ldlm_mode mode = LCK_PW | LCK_GROUP | LCK_PR;
int result;
__u16 refcheck;
......@@ -565,13 +569,40 @@ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
if (IS_ERR(env))
GOTO(out, result = PTR_ERR(env));
policy.l_extent.start = 0;
policy.l_extent.end = LUSTRE_EOF;
lock_res_and_lock(dlmlock);
if (dlmlock->l_ast_data != NULL) {
obj = osc2cl(dlmlock->l_ast_data);
cl_object_get(obj);
matchdata.lmd_mode = &mode;
matchdata.lmd_policy = &policy;
matchdata.lmd_flags = LDLM_FL_TEST_LOCK | LDLM_FL_CBPENDING;
matchdata.lmd_unref = 1;
matchdata.lmd_has_ast_data = true;
LDLM_LOCK_GET(dlmlock);
/* If any dlmlock has l_ast_data set, we must find it or we risk
* missing a size update done under a different lock.
*/
while (dlmlock) {
lock_res_and_lock(dlmlock);
if (dlmlock->l_ast_data) {
obj = osc2cl(dlmlock->l_ast_data);
cl_object_get(obj);
}
unlock_res_and_lock(dlmlock);
LDLM_LOCK_PUT(dlmlock);
dlmlock = NULL;
if (obj == NULL && res->lr_type == LDLM_EXTENT) {
if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_SIZE_DATA))
break;
lock_res(res);
dlmlock = search_itree(res, &matchdata);
unlock_res(res);
}
}
unlock_res_and_lock(dlmlock);
if (obj != NULL) {
/* Do not grab the mutex of cl_lock for glimpse.
......
......@@ -2507,6 +2507,9 @@ int tgt_brw_write(struct tgt_session_info *tsi)
CFS_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ?
cfs_fail_val : (obd_timeout + 1) / 4);
/* Delay write commit to show stale size information */
CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_NO_SIZE_DATA, cfs_fail_val);
/* There must be big cache in current thread to process this request
* if it is NULL then something went wrong and it wasn't allocated,
* report -ENOMEM in that case */
......
......@@ -74,10 +74,13 @@
/* Name of file/directory. Will be set once and will not change. */
static char mainpath[PATH_MAX];
/* Path to same file/directory on second mount */
static char mainpath2[PATH_MAX];
static char *mainfile;
static char fsmountdir[PATH_MAX]; /* Lustre mountpoint */
static char *lustre_dir; /* Test directory inside Lustre */
static char *lustre_dir2; /* Same dir but on second mountpoint */
static int single_test; /* Number of a single test to execute*/
/* Cleanup our test file. */
......@@ -1077,9 +1080,125 @@ static int test22(void)
return 0;
}
/* Do lockahead requests from two mount points & sanity check size
*
* The key thing here is that client2 updates the size by writing, then asks
* for another lock beyond that. That next lock is never used.
* The bug (LU-11670) is that the glimpse for client1 will only check the
* highest lock and miss the size update made by the lower lock.
*/
static int test23(void)
{
struct llapi_lu_ladvise advice;
size_t write_size = 1024 * 1024;
char buf[write_size];
const int count = 1;
struct stat sb;
struct stat sb2;
int fd;
/* On second mount */
int fd2;
int rc;
int i;
fd = open(mainpath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
ASSERTF(fd >= 0, "open failed for '%s': %s",
mainpath, strerror(errno));
/* mainpath2 is a different Lustre mount */
fd2 = open(mainpath2, O_RDWR, S_IRUSR | S_IWUSR);
ASSERTF(fd2 >= 0, "open failed for '%s': %s",
mainpath2, strerror(errno));
/* Lock + write MiB 1 from second client */
setup_ladvise_lockahead(&advice, MODE_WRITE_USER, 0, 0,
write_size - 1, true);
/* Manually set the result so we can verify it's being modified */
advice.lla_lockahead_result = 345678;
rc = llapi_ladvise(fd2, 0, count, &advice);
ASSERTF(rc == 0,
"cannot lockahead '%s': %s", mainpath2, strerror(errno));
ASSERTF(advice.lla_lockahead_result == 0,
"unexpected extent result: %d",
advice.lla_lockahead_result);
/* Ask again until we get the lock (status 1). */
for (i = 1; i < 100; i++) {
usleep(100000); /* 0.1 second */
advice.lla_lockahead_result = 456789;
rc = llapi_ladvise(fd2, 0, count, &advice);
ASSERTF(rc == 0, "cannot lockahead '%s': %s",
mainpath2, strerror(errno));
if (advice.lla_lockahead_result > 0)
break;
}
ASSERTF(advice.lla_lockahead_result == LLA_RESULT_SAME,
"unexpected extent result: %d",
advice.lla_lockahead_result);
rc = write(fd2, buf, write_size);
ASSERTF(rc == sizeof(buf), "write failed for '%s': %s",
mainpath2, strerror(errno));
/* Lock (but don't write) MiB 2 from second client */
setup_ladvise_lockahead(&advice, MODE_WRITE_USER, 0, write_size,
2*write_size - 1, true);
/* Manually set the result so we can verify it's being modified */
advice.lla_lockahead_result = 345678;
rc = llapi_ladvise(fd2, 0, count, &advice);
ASSERTF(rc == 0,
"cannot lockahead '%s': %s", mainpath2, strerror(errno));
ASSERTF(advice.lla_lockahead_result == 0,
"unexpected extent result: %d",
advice.lla_lockahead_result);
/* Ask again until we get the lock (status 1). */
for (i = 1; i < 100; i++) {
usleep(100000); /* 0.1 second */
advice.lla_lockahead_result = 456789;
rc = llapi_ladvise(fd2, 0, count, &advice);
ASSERTF(rc == 0, "cannot lockahead '%s': %s",
mainpath2, strerror(errno));
if (advice.lla_lockahead_result > 0)
break;
}
ASSERTF(advice.lla_lockahead_result == LLA_RESULT_SAME,
"unexpected extent result: %d",
advice.lla_lockahead_result);
rc = fstat(fd, &sb);
ASSERTF(!rc, "stat failed for '%s': %s",
mainpath, strerror(errno));
rc = fstat(fd2, &sb2);
ASSERTF(!rc, "stat failed for '%s': %s",
mainpath2, strerror(errno));
ASSERTF(sb.st_size == sb2.st_size,
"size on %s and %s differs: %lu vs %lu",
mainpath, mainpath2, sb.st_size, sb2.st_size);
ASSERTF(sb.st_size == write_size, "size %lu != bytes written (%lu)",
sb.st_size, write_size);
close(fd);
close(fd2);
return 0;
}
static void usage(char *prog)
{
fprintf(stderr, "Usage: %s [-d lustre_dir], [-t single_test]\n", prog);
fprintf(stderr,
"Usage: %s [-d lustre_dir], [-D lustre_dir2] [-t test]\n",
prog);
exit(-1);
}
......@@ -1087,7 +1206,7 @@ static void process_args(int argc, char *argv[])
{
int c;
while ((c = getopt(argc, argv, "d:f:t:")) != -1) {
while ((c = getopt(argc, argv, "d:D:f:t:")) != -1) {
switch (c) {
case 'f':
mainfile = optarg;
......@@ -1095,6 +1214,9 @@ static void process_args(int argc, char *argv[])
case 'd':
lustre_dir = optarg;
break;
case 'D':
lustre_dir2 = optarg;
break;
case 't':
single_test = atoi(optarg);
break;
......@@ -1125,6 +1247,16 @@ int main(int argc, char *argv[])
return -1;
}
if (lustre_dir2) {
rc = llapi_search_mounts(lustre_dir2, 0, fsmountdir, fsname);
if (rc != 0) {
fprintf(stderr,
"Error: '%s': not a Lustre filesystem\n",
lustre_dir2);
return -1;
}
}
/* Play nice with Lustre test scripts. Non-line buffered output
* stream under I/O redirection may appear incorrectly. */
setvbuf(stdout, NULL, _IOLBF, 0);
......@@ -1133,6 +1265,14 @@ int main(int argc, char *argv[])
rc = snprintf(mainpath, sizeof(mainpath), "%s/%s", lustre_dir,
mainfile);
ASSERTF(rc > 0 && rc < sizeof(mainpath), "invalid name for mainpath");
if (lustre_dir2) {
rc = snprintf(mainpath2, sizeof(mainpath2), "%s/%s",
lustre_dir2, mainfile);
ASSERTF(rc > 0 && rc < sizeof(mainpath2),
"invalid name for mainpath2");
}
cleanup();
switch (single_test) {
......@@ -1150,6 +1290,9 @@ int main(int argc, char *argv[])
PERFORM(test20);
PERFORM(test21);
PERFORM(test22);
/* Some tests require a second mount point */
if (lustre_dir2)
PERFORM(test23);
/* When running all the test cases, we can't use the return
* from the last test case, as it might be non-zero to return
* info, rather than for an error. Test cases assert and exit
......@@ -1195,6 +1338,11 @@ int main(int argc, char *argv[])
case 22:
PERFORM(test22);
break;
case 23:
ASSERTF(lustre_dir2,
"must provide second mount point for test 23");
PERFORM(test23);
break;
default:
fprintf(stderr, "impossible value of single_test %d\n",
single_test);
......
......@@ -17100,7 +17100,7 @@ run_test 255b "check 'lfs ladvise -a dontneed'"
 
test_255c() {
[ $OST1_VERSION -lt $(version_code 2.10.50) ] &&
skip "lustre < 2.10.53 does not support lockahead"
skip "lustre < 2.10.50 does not support lockahead"
 
local count
local new_count
......
......@@ -4592,6 +4592,57 @@ test_102() {
}
run_test 102 "Test open by handle of unlinked file"
# Compare file size between first & second mount, ensuring the client correctly
# glimpses even with unused speculative locks - LU-11670
test_103() {
[ $(lustre_version_code $ost1) -lt $(version_code 2.10.50) ] &&
skip "Lockahead needs OST version at least 2.10.50"
local testnum=23
test_mkdir -p $DIR/$tdir
# Force file on to OST0
$LFS setstripe -i 0 $DIR/$tdir
# Do not check multiple locks on glimpse
# OBD_FAIL_OSC_NO_SIZE_DATA 0x415
$LCTL set_param fail_loc=0x415
# Delay write commit by 2 seconds to guarantee glimpse wins race
# The same fail_loc is used on client & server so it can work in the
# single node sanity setup
do_facet ost1 $LCTL set_param fail_loc=0x415 fail_val=2
echo "Incorrect size expected (no glimpse fix):"
lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
rc=$?
if [ $rc -eq 0 ]; then
error "Lockahead test $testnum passed with fail_loc set, ${rc}"
fi
# guarantee write commit timeout has expired
sleep 2
# Clear fail_loc on client
$LCTL set_param fail_loc=0
# Delay write commit by 2 seconds to guarantee glimpse wins race
# OBD_FAIL_OST_BRW_PAUSE_BULK 0x214
do_facet ost1 $LCTL set_param fail_loc=0x214 fail_val=2
# Write commit is still delayed by 2 seconds
lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
rc=$?
[ $rc -eq 0 ] || error "Lockahead test${testnum} failed, ${rc}"
# guarantee write commit timeout has expired
sleep 2
rm -f $DIR/$tfile || error "unable to delete $DIR/$tfile"
}
run_test 103 "Test size correctness with lockahead"
log "cleanup: ======================================================"
# kill and wait in each test only guarentee script finish, but command in script
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment