diff --git a/lustre/ChangeLog b/lustre/ChangeLog index fa1832159c6fd4a772a22ff97a9bbb80980a5cd4..777e438446de3c0ad50991ccd2429731de1bc5a2 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -580,6 +580,15 @@ Frequency : rare Description: Using wrong pointer in osc_brw_prep_request Details : Access to array[-1] can produce panic if kernel compiled with CONFIG_PAGE_ALLOC enabled + +Severity : enhancement +Bugzilla : 4900 +Description: Async OSC create to avoid blocking unnecessarily. +Details : If an OST has no remaining objects, the system blocks on the + create when a new object is needed on this OST. Now, always use + pre-created objects when available, instead of blocking on an + empty OSC while others are not empty. If we must block, we block + for the shortest possible period of time. -------------------------------------------------------------------------------- diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 7484762a343fc7b42744c55fb28853e1f4345a8f..83d0c52633a55cf519e119b0012fff31340e343c 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -858,6 +858,7 @@ struct obd_ops { struct lov_stripe_md *mem_tgt); int (*o_preallocate)(struct lustre_handle *, obd_count *req, obd_id *ids); + int (*o_precreate)(struct obd_export *exp, int need_create); int (*o_create)(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md **ea, struct obd_trans_info *oti); int (*o_destroy)(struct obd_export *exp, struct obdo *oa, diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index d9deddb691cbe37d67396132b605777aabfea8fb..cfb217eab271236c5413867dbd2f70471ca21e4b 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -476,6 +476,18 @@ static inline int obd_checkmd(struct obd_export *exp, RETURN(rc); } +static inline int obd_precreate(struct obd_export *exp, int need_create) +{ + int rc; + ENTRY; + + EXP_CHECK_OP(exp, precreate); + OBD_COUNTER_INCREMENT(exp->exp_obd, precreate); + + rc = 
OBP(exp->exp_obd, precreate)(exp, need_create); + RETURN(rc); +} + static inline int obd_create(struct obd_export *exp, struct obdo *obdo, struct lov_stripe_md **ea, struct obd_trans_info *oti) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 5f987d4399fab96ab27ce492f59b7475cd9769df..40aa2b3f2d14cc43676b6283a8dfdca13c59f31d 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -103,6 +103,7 @@ extern int obd_race_state; #define OBD_FAIL_MDS_RESEND 0x136 #define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 #define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 +#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index 85c80cd10b31fc7166b5596e89802e234261d847..565cc6f1c09eef1299f7d63b8efbb14ece617150 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -144,6 +144,8 @@ int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off); /* lov_qos.c */ +#define LOV_USES_ASSIGNED_STRIPE 0 +#define LOV_USES_DEFAULT_STRIPE 1 int qos_add_tgt(struct obd_device *obd, __u32 index); int qos_del_tgt(struct obd_device *obd, __u32 index); void qos_shrink_lsm(struct lov_request_set *set); diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 7f5970c9e78f0ae72797f1ff97c5206048777b2b..0167df50e9e04ff2168fc954aeece3d412ae12f2 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -513,7 +513,7 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lov->lov_tgts, lov->lov_tgt_size); - } + } OBD_ALLOC_PTR(tgt); diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index 2183ca27f9de413a3b666e4bacd95700cede6ab6..f0d3eb7935580d30932dc2e0191fe89652114ba5 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -485,24 +485,31 @@ int qos_remedy_create(struct 
lov_request_set *set, struct lov_request *req) #define LOV_CREATE_RESEED_MULT 4 #define LOV_CREATE_RESEED_MIN 1000 /* Allocate objects on osts with round-robin algorithm */ -static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt) +static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt_orig, int flags) { - unsigned array_idx, ost_count = lov->desc.ld_tgt_count; + unsigned array_idx, array_idx_temp, ost_count = lov->desc.ld_tgt_count; unsigned ost_active_count = lov->desc.ld_active_tgt_count; - int i, *idx_pos = idx_arr; + int i, *idx_pos; __u32 ost_idx; + int first_pass = 1, ost_start_idx_temp; + int want_level = 0; + int stripe_cnt; ENTRY; i = qos_calc_rr(lov); if (i) RETURN(i); + stripe_cnt = flags == LOV_USES_DEFAULT_STRIPE ? + *stripe_cnt_orig - (*stripe_cnt_orig/4) : + *stripe_cnt_orig; + if (--lov->lov_start_count <= 0) { lov->lov_start_idx = ll_rand() % ost_count; lov->lov_start_count = (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) + LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U); - } else if (*stripe_cnt >= ost_active_count || + } else if (stripe_cnt >= ost_active_count || lov->lov_start_idx > ost_count) { /* If we have allocated from all of the OSTs, slowly precess the next start */ @@ -512,11 +519,16 @@ static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt) array_idx = (lov->lov_start_idx + lov->lov_offset_idx) % ost_count; #ifdef QOS_DEBUG CDEBUG(D_QOS, "want %d startidx %d startcnt %d offset %d arrayidx %d\n", - *stripe_cnt, lov->lov_start_idx, lov->lov_start_count, + stripe_cnt, lov->lov_start_idx, lov->lov_start_count, lov->lov_offset_idx, array_idx); #endif - down_read(&lov->lov_qos.lq_rw_sem); + ost_start_idx_temp = lov->lov_start_idx; + array_idx_temp = array_idx; + +repeat_find : + idx_pos = idx_arr; + for (i = 0; i < ost_count; i++, array_idx=(array_idx + 1) % ost_count) { ++lov->lov_start_idx; ost_idx = lov->lov_qos.lq_rr_array[array_idx]; @@ -527,18 +539,37 @@ static int alloc_rr(struct 
lov_obd *lov, int *idx_arr, int *stripe_cnt) lov->lov_tgts[ost_idx]->ltd_active : 0, idx_pos - idx_arr, array_idx, ost_idx); #endif - if ((ost_idx == LOV_QOS_EMPTY) || !lov->lov_tgts[ost_idx] || + if ((ost_idx == LOV_QOS_EMPTY) || !lov->lov_tgts[ost_idx] || !lov->lov_tgts[ost_idx]->ltd_active) continue; + + /* Fail Check before osc_precreate() is called + so we can only 'fail' single OSC. */ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0) + continue; + + /* the osc_precreate() will be called */ + if (obd_precreate(lov->lov_tgts[ost_idx]->ltd_exp, first_pass) > want_level) + continue; + *idx_pos = ost_idx; idx_pos++; /* We have enough stripes */ - if (idx_pos - idx_arr == *stripe_cnt) + if (idx_pos - idx_arr == *stripe_cnt_orig) break; } + if (first_pass && (idx_pos - idx_arr < stripe_cnt)) { + /* not send precreate and skip only failed ost */ + first_pass = 0; + want_level = 1; + lov->lov_start_idx = ost_start_idx_temp; + array_idx = array_idx_temp; + goto repeat_find; + } + up_read(&lov->lov_qos.lq_rw_sem); - *stripe_cnt = idx_pos - idx_arr; + *stripe_cnt_orig = idx_pos - idx_arr; RETURN(0); } @@ -548,20 +579,39 @@ static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm, { unsigned ost_idx, ost_count = lov->desc.ld_tgt_count; int i, *idx_pos = idx_arr; + int first_pass = 1; + int want_level = 0; ENTRY; +repeat_find: ost_idx = lsm->lsm_oinfo[0]->loi_ost_idx; for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) { if (!lov->lov_tgts[ost_idx] || !lov->lov_tgts[ost_idx]->ltd_active) { continue; } + + /* Fail Check before osc_precreate() is called + so we can only 'fail' single OSC. 
*/ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0) + continue; + + /* the osc_precreate() will be called */ + if (obd_precreate(lov->lov_tgts[ost_idx]->ltd_exp, first_pass) > want_level) + continue; + *idx_pos = ost_idx; idx_pos++; /* got enough ost */ if (idx_pos - idx_arr == lsm->lsm_stripe_count) RETURN(0); } + if (first_pass) { + first_pass = 0; + want_level = 1; + goto repeat_find; + } + /* If we were passed specific striping params, then a failure to * meet those requirements is an error, since we can't reallocate * that memory (it might be part of a larger array or something). @@ -577,7 +627,8 @@ static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm, - free space - network resources (shared OSS's) */ -static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt) +static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, + int flags) { struct lov_obd *lov = &exp->exp_obd->u.lov; static time_t last_warn = 0; @@ -628,6 +679,14 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt) continue; } + /* Fail Check before osc_precreate() is called + so we can only 'fail' single OSC. 
*/ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && i == 0) + continue; + + if (obd_precreate(lov->lov_tgts[i]->ltd_exp, 1) == 2) + continue; + lov->lov_tgts[i]->ltd_qos.ltq_usable = 1; qos_calc_weight(lov, i); total_bavail += bavail; @@ -685,6 +744,7 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt) if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_qos.ltq_usable) continue; + cur_weight += lov->lov_tgts[i]->ltd_qos.ltq_weight; if (cur_weight >= rand) { #ifdef QOS_DEBUG @@ -709,7 +769,7 @@ out: up_write(&lov->lov_qos.lq_rw_sem); if (rc == -EAGAIN) - rc = alloc_rr(lov, idx_arr, stripe_cnt); + rc = alloc_rr(lov, idx_arr, stripe_cnt, flags); lov_putref(exp->exp_obd); RETURN(rc); @@ -717,7 +777,7 @@ out: /* return new alloced stripe count on success */ static int alloc_idx_array(struct obd_export *exp, struct lov_stripe_md *lsm, - int newea, int **idx_arr, int *arr_cnt) + int newea, int **idx_arr, int *arr_cnt, int flags) { struct lov_obd *lov = &exp->exp_obd->u.lov; int stripe_cnt = lsm->lsm_stripe_count; @@ -734,7 +794,7 @@ static int alloc_idx_array(struct obd_export *exp, struct lov_stripe_md *lsm, if (newea || lsm->lsm_oinfo[0]->loi_ost_idx >= lov->desc.ld_tgt_count) - rc = alloc_qos(exp, tmp_arr, &stripe_cnt); + rc = alloc_qos(exp, tmp_arr, &stripe_cnt, flags); else rc = alloc_specific(lov, lsm, tmp_arr); @@ -762,13 +822,14 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) struct obdo *src_oa = set->set_oi->oi_oa; struct obd_trans_info *oti = set->set_oti; int i, stripes, rc = 0, newea = 0; + int flag = LOV_USES_ASSIGNED_STRIPE; int *idx_arr, idx_cnt = 0; ENTRY; LASSERT(src_oa->o_valid & OBD_MD_FLID); if (set->set_oi->oi_md == NULL) { - int stripe_cnt = lov_get_stripecnt(lov, 0); + int stripes_def = lov_get_stripecnt(lov, 0); /* If the MDS file was truncated up to some size, stripe over * enough OSTs to allow the file to be created at that size. 
@@ -791,10 +852,11 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) } lov_putref(exp->exp_obd); - if (stripes < stripe_cnt) - stripes = stripe_cnt; + if (stripes < stripes_def) + stripes = stripes_def; } else { - stripes = stripe_cnt; + flag = LOV_USES_DEFAULT_STRIPE; + stripes = stripes_def; } rc = lov_alloc_memmd(&set->set_oi->oi_md, stripes, @@ -803,8 +865,8 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) LOV_MAGIC); if (rc < 0) GOTO(out_err, rc); - rc = 0; newea = 1; + rc = 0; } lsm = set->set_oi->oi_md; @@ -816,7 +878,7 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) lsm->lsm_pattern = lov->desc.ld_pattern; } - stripes = alloc_idx_array(exp, lsm, newea, &idx_arr, &idx_cnt); + stripes = alloc_idx_array(exp, lsm, newea, &idx_arr, &idx_cnt, flag); if (stripes <= 0) GOTO(out_err, rc = stripes ? stripes : -EIO); LASSERTF(stripes <= lsm->lsm_stripe_count,"requested %d allocated %d\n", diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 29bc50821c8c2b2230d50bca444aa68b641f9a62..1b2ba7ac9a4c9ee03d207549b7dce67a000c796d 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -847,6 +847,7 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd); LPROCFS_OBD_OP_INIT(num_private_stats, stats, checkmd); LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate); LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index cc5b9619df12df45e79e47facc80fce4a1fe4c27..393c2bdb22a91cf54399747683dd9825df9fada5 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -2798,6 +2798,8 @@ static int 
filter_precreate(struct obd_device *obd, struct obdo *oa, LASSERT(down_trylock(&filter->fo_create_lock) != 0); + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_PRECREATE, obd_timeout / 2); + if ((oa->o_valid & OBD_MD_FLFLAGS) && (oa->o_flags & OBD_FL_RECREATE_OBJS)) { recreate_obj = 1; diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index 8a5a0ed20b6d3b832c87dec12603af1460f78597..e8fc4a9349d1f9f8421bc97e86d33c17c3b383a6 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -239,6 +239,39 @@ int oscc_recovering(struct osc_creator *oscc) return recov; } +/* Report whether the OST behind this export still has precreated objects. Return value: + 0 : oscc_last_id >= oscc_next_id, i.e. objects remain and no precreate is needed. + 1 : no objects remain; OSCC_FLAG_CREATING is already set, or oscc_internal_create() is called here when need_create is non-zero (with need_create == 0 no new create is started). + 2 : the reverse import is deactivated, or no objects remain and OSCC_FLAG_NOSPC is set or oscc_recovering() is true; + callers should not expect objects for a potentially very long time. + */ +int osc_precreate(struct obd_export *exp, int need_create) +{ + struct osc_creator *oscc = &exp->exp_obd->u.cli.cl_oscc; + struct obd_import *imp = exp->exp_imp_reverse; + ENTRY; + + LASSERT(oscc != NULL); + if (imp != NULL && imp->imp_deactive) + RETURN(2); + + if (oscc->oscc_last_id < oscc->oscc_next_id) { + if (oscc->oscc_flags & OSCC_FLAG_NOSPC || + oscc_recovering(oscc)) + RETURN(2); + + if (oscc->oscc_flags & OSCC_FLAG_CREATING) + RETURN(1); + + if (!need_create) + RETURN(1); + + oscc_internal_create(oscc); + RETURN(1); + } + RETURN(0); +} + int osc_create(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md **ea, struct obd_trans_info *oti) { diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index 7fd68ba31604cc2b3960f2fd79cda57c3db1a6a1..4ae97206397e2249d5c1eccb50de39b8ad93c8f7 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -54,6 +54,7 @@ struct osc_cache_waiter { #define OSCC_FLAG_LOW 0x10 #define OSCC_FLAG_EXITING 0x20 +int osc_precreate(struct obd_export *exp, int need_create); int osc_create(struct obd_export *exp, struct obdo *oa, struct
lov_stripe_md **ea, struct obd_trans_info *oti); int osc_real_create(struct obd_export *exp, struct obdo *oa, diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 3868c5e56a660baf62d85e2d3520314c12806bb5..9c9adc459dc58174a0010560c86bf527abce662b 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3666,6 +3666,7 @@ struct obd_ops osc_obd_ops = { .o_statfs_async = osc_statfs_async, .o_packmd = osc_packmd, .o_unpackmd = osc_unpackmd, + .o_precreate = osc_precreate, .o_create = osc_create, .o_destroy = osc_destroy, .o_getattr = osc_getattr, diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index a55f51b144b41d9a704be6ad6a11c7f292cb629b..e88f38f435ba499210b5c7c0e68c85f2ea2447b5 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1141,6 +1141,48 @@ test_27x() { # bug 10997 } run_test 27x "check lfs setstripe -c -s -i options =============" +test_27u() { # bug 4900 + [ "$OSTCOUNT" -lt "2" -o -z "$MDS" ] && echo "skip $TESTNAME" && return + #define OBD_FAIL_MDS_OSC_PRECREATE 0x139 + + sysctl -w lustre.fail_loc=0x139 + mkdir -p $DIR/d27u + createmany -o $DIR/d27u/t- 1000 + sysctl -w lustre.fail_loc=0 + + $LFS getstripe $DIR/d27u > $TMP/files + OBJS=`cat $TMP/files | awk -vobjs=0 '($1 == 0) { objs += 1 } END { print objs;}'` + unlinkmany $DIR/d27u/t- 1000 + [ $OBJS -gt 0 ] && \ + error "Found $OBJS objects were created on OST-0" || pass +} +run_test 27u "skip object creation on OSC w/o objects ==========" + +test_27v() { # bug 4900 + [ "$OSTCOUNT" -lt "2" -o -z "$MDS" ] && echo "skip $TESTNAME" && return + exhaust_all_precreations + + mkdir -p $DIR/$tdir + lfs setstripe $DIR/$tdir 0 -1 1 # 1 stripe / file + + touch $DIR/$tdir/$tfile + #define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 + sysctl -w lustre.fail_loc=0x705 + START=`date +%s` + for F in `seq 1 32`; do + touch $DIR/$tdir/$tfile.$F + done + sysctl -w lustre.fail_loc=0 + + FINISH=`date +%s` + TIMEOUT=`sysctl -n lustre.timeout` + [ $((FINISH - START)) -ge 
$((TIMEOUT / 2)) ] && \ + error "$FINISH - $START >= $TIMEOUT / 2" + + reset_enospc +} +run_test 27v "skip object creation on slow OST =================" + test_28() { mkdir $DIR/d28 $CREATETEST $DIR/d28/ct || error @@ -2670,7 +2712,7 @@ test_65j() { # bug6367 cleanup -f || error "failed to unmount" setup fi - $SETSTRIPE -d $MOUNT + $SETSTRIPE -d $MOUNT || error "setstripe failed" } run_test 65j "set default striping on root directory (bug 6367)="