cluster/tier: add watermarks and policy driver
[obnox/glusterfs.git] / xlators / cluster / dht / src / dht-shared.c
1 /*
2   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
3   This file is part of GlusterFS.
4
5   This file is licensed to you under your choice of the GNU Lesser
6   General Public License, version 3 or any later version (LGPLv3 or
7   later), or the GNU General Public License, version 2 (GPLv2), in all
8   cases as published by the Free Software Foundation.
9 */
10
11
12 /* TODO: add NS locking */
13 #include "statedump.h"
14 #include "dht-common.h"
15 #include "dht-messages.h"
16
#ifndef MAX
#define MAX(a, b) (((a) > (b))?(a):(b))
#endif

/*
 * Pick the rebalance thread count from the throttle mode in conf->dthrottle
 * and store it in conf->defrag->recon_thread_count, while holding
 * conf->defrag->dfq_mutex.
 *
 *   throttle_count - lvalue that receives MAX (online CPUs - 4, 4)
 *   conf           - pointer expression whose ->defrag and ->dthrottle are used
 *
 *   "lazy"       -> 1 thread
 *   "normal"     -> throttle_count / 2 threads
 *   "aggressive" -> throttle_count threads
 *   anything else leaves recon_thread_count untouched.
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single statement
 * (safe inside an unbraced if/else); both arguments are parenthesized so any
 * pointer expression can be passed. Note that conf is evaluated several times.
 */
#define GF_DECIDE_DEFRAG_THROTTLE_COUNT(throttle_count, conf) do {      \
                                                                        \
                pthread_mutex_lock (&(conf)->defrag->dfq_mutex);        \
                                                                        \
                if (!strcasecmp ((conf)->dthrottle, "lazy"))            \
                        (conf)->defrag->recon_thread_count = 1;         \
                                                                        \
                (throttle_count) =                                      \
                    MAX ((sysconf(_SC_NPROCESSORS_ONLN) - 4), 4);       \
                                                                        \
                if (!strcasecmp ((conf)->dthrottle, "normal"))          \
                        (conf)->defrag->recon_thread_count =            \
                                                 ((throttle_count) / 2);\
                                                                        \
                if (!strcasecmp ((conf)->dthrottle, "aggressive"))      \
                        (conf)->defrag->recon_thread_count =            \
                                                 (throttle_count);      \
                                                                        \
                pthread_mutex_unlock (&(conf)->defrag->dfq_mutex);      \
        } while (0)
41
42 /* TODO:
43    - use volumename in xattr instead of "dht"
44    - use NS locks
45    - handle all cases in self heal layout reconstruction
46    - complete linkfile selfheal
47 */
48 struct volume_options options[];
49
50 extern dht_methods_t dht_methods;
51
52 void
53 dht_layout_dump (dht_layout_t  *layout, const char *prefix)
54 {
55
56         char    key[GF_DUMP_MAX_BUF_LEN];
57         int     i = 0;
58
59         if (!layout)
60                 goto out;
61         if (!prefix)
62                 goto out;
63
64         gf_proc_dump_build_key(key, prefix, "cnt");
65         gf_proc_dump_write(key, "%d", layout->cnt);
66         gf_proc_dump_build_key(key, prefix, "preset");
67         gf_proc_dump_write(key, "%d", layout->preset);
68         gf_proc_dump_build_key(key, prefix, "gen");
69         gf_proc_dump_write(key, "%d", layout->gen);
70         if (layout->type != IA_INVAL) {
71                 gf_proc_dump_build_key(key, prefix, "inode type");
72                 gf_proc_dump_write(key, "%d", layout->type);
73         }
74
75         if  (!IA_ISDIR (layout->type))
76                 goto out;
77
78         for (i = 0; i < layout->cnt; i++) {
79                 gf_proc_dump_build_key(key, prefix,"list[%d].err", i);
80                 gf_proc_dump_write(key, "%d", layout->list[i].err);
81                 gf_proc_dump_build_key(key, prefix,"list[%d].start", i);
82                 gf_proc_dump_write(key, "%u", layout->list[i].start);
83                 gf_proc_dump_build_key(key, prefix,"list[%d].stop", i);
84                 gf_proc_dump_write(key, "%u", layout->list[i].stop);
85                 if (layout->list[i].xlator) {
86                         gf_proc_dump_build_key(key, prefix,
87                                                "list[%d].xlator.type", i);
88                         gf_proc_dump_write(key, "%s",
89                                            layout->list[i].xlator->type);
90                         gf_proc_dump_build_key(key, prefix,
91                                                "list[%d].xlator.name", i);
92                         gf_proc_dump_write(key, "%s",
93                                            layout->list[i].xlator->name);
94                 }
95         }
96
97 out:
98         return;
99 }
100
101
102 int32_t
103 dht_priv_dump (xlator_t *this)
104 {
105         char            key_prefix[GF_DUMP_MAX_BUF_LEN];
106         char            key[GF_DUMP_MAX_BUF_LEN];
107         int             i = 0;
108         dht_conf_t      *conf = NULL;
109         int             ret = -1;
110
111         if (!this)
112                 goto out;
113
114         conf = this->private;
115         if (!conf)
116                 goto out;
117
118         ret = TRY_LOCK(&conf->subvolume_lock);
119         if (ret != 0) {
120                 return ret;
121         }
122
123         gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
124         gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv",
125                                this->name);
126         gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt);
127         for (i = 0; i < conf->subvolume_cnt; i++) {
128                 snprintf (key, sizeof (key), "subvolumes[%d]", i);
129                 gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
130                                    conf->subvolumes[i]->name);
131                 if (conf->file_layouts && conf->file_layouts[i]){
132                         snprintf (key, sizeof (key), "file_layouts[%d]", i);
133                         dht_layout_dump(conf->file_layouts[i], key);
134                 }
135                 if (conf->dir_layouts && conf->dir_layouts[i]) {
136                         snprintf (key, sizeof (key), "dir_layouts[%d]", i);
137                         dht_layout_dump(conf->dir_layouts[i], key);
138                 }
139                 if (conf->subvolume_status) {
140
141                         snprintf (key, sizeof (key), "subvolume_status[%d]", i);
142                         gf_proc_dump_write(key, "%d",
143                                            (int)conf->subvolume_status[i]);
144                 }
145
146         }
147
148         gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
149         gf_proc_dump_write("gen", "%d", conf->gen);
150         gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
151         gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
152         gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
153         gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
154         gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
155         gf_proc_dump_write("use-readdirp", "%d", conf->use_readdirp);
156
157         if (conf->du_stats && conf->subvolume_status) {
158                 for (i = 0; i < conf->subvolume_cnt; i++) {
159                         if (!conf->subvolume_status[i])
160                                 continue;
161
162                         snprintf (key, sizeof (key), "subvolumes[%d]", i);
163                         gf_proc_dump_write (key, "%s",
164                                             conf->subvolumes[i]->name);
165
166                         snprintf (key, sizeof (key),
167                                   "du_stats[%d].avail_percent", i);
168                         gf_proc_dump_write (key, "%lf",
169                                             conf->du_stats[i].avail_percent);
170
171                         snprintf (key, sizeof (key), "du_stats[%d].avail_space",
172                                   i);
173                         gf_proc_dump_write (key, "%lu",
174                                             conf->du_stats[i].avail_space);
175
176                         snprintf (key, sizeof (key),
177                                   "du_stats[%d].avail_inodes", i);
178                         gf_proc_dump_write (key, "%lf",
179                                             conf->du_stats[i].avail_inodes);
180
181                         snprintf (key, sizeof (key), "du_stats[%d].log", i);
182                         gf_proc_dump_write (key, "%lu",
183                                             conf->du_stats[i].log);
184                 }
185         }
186
187         if (conf->last_stat_fetch.tv_sec)
188                 gf_proc_dump_write("last_stat_fetch", "%s",
189                                     ctime(&conf->last_stat_fetch.tv_sec));
190
191         UNLOCK(&conf->subvolume_lock);
192
193 out:
194         return ret;
195 }
196
197 int32_t
198 dht_inodectx_dump (xlator_t *this, inode_t *inode)
199 {
200         int             ret = -1;
201         dht_layout_t    *layout = NULL;
202
203         if (!this)
204                 goto out;
205         if (!inode)
206                 goto out;
207
208         ret = dht_inode_ctx_layout_get (inode, this, &layout);
209
210         if ((ret != 0) || !layout)
211                 return ret;
212
213         gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
214         dht_layout_dump(layout, "layout");
215
216 out:
217         return ret;
218 }
219
220 void
221 dht_fini (xlator_t *this)
222 {
223         int         i = 0;
224         dht_conf_t *conf = NULL;
225
226         GF_VALIDATE_OR_GOTO ("dht", this, out);
227
228         conf = this->private;
229         this->private = NULL;
230         if (conf) {
231                 if (conf->file_layouts) {
232                         for (i = 0; i < conf->subvolume_cnt; i++) {
233                                 GF_FREE (conf->file_layouts[i]);
234                         }
235                         GF_FREE (conf->file_layouts);
236                 }
237
238                 dict_destroy(conf->leaf_to_subvol);
239
240                 GF_FREE (conf->subvolumes);
241
242                 GF_FREE (conf->subvolume_status);
243
244                 if (conf->lock_pool)
245                         mem_pool_destroy (conf->lock_pool);
246
247                 GF_FREE (conf);
248         }
249 out:
250         return;
251 }
252
253 int32_t
254 mem_acct_init (xlator_t *this)
255 {
256         int     ret = -1;
257
258         GF_VALIDATE_OR_GOTO ("dht", this, out);
259
260         ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1);
261
262         if (ret != 0) {
263                 gf_msg (this->name, GF_LOG_ERROR, 0,
264                         DHT_MSG_NO_MEMORY,
265                         "Memory accounting init failed");
266                 return ret;
267         }
268 out:
269         return ret;
270 }
271
272
273 int
274 dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf,
275                                  const char *bricks)
276 {
277         int         i  = 0;
278         int         ret  = -1;
279         char       *tmpstr = NULL;
280         char       *dup_brick = NULL;
281         char       *node = NULL;
282
283         if (!conf || !bricks)
284                 goto out;
285
286         dup_brick = gf_strdup (bricks);
287         node = strtok_r (dup_brick, ",", &tmpstr);
288         while (node) {
289                 for (i = 0; i < conf->subvolume_cnt; i++) {
290                         if (!strcmp (conf->subvolumes[i]->name, node)) {
291                                 conf->decommissioned_bricks[i] =
292                                         conf->subvolumes[i];
293                                         conf->decommission_subvols_cnt++;
294                                 gf_msg (this->name, GF_LOG_INFO, 0,
295                                         DHT_MSG_SUBVOL_DECOMMISSION_INFO,
296                                         "decommissioning subvolume %s",
297                                         conf->subvolumes[i]->name);
298                                 break;
299                         }
300                 }
301                 if (i == conf->subvolume_cnt) {
302                         /* Wrong node given. */
303                         goto out;
304                 }
305                 node = strtok_r (NULL, ",", &tmpstr);
306         }
307
308         ret = 0;
309         conf->decommission_in_progress = 1;
310 out:
311         GF_FREE (dup_brick);
312
313         return ret;
314 }
315
316 int
317 dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf)
318 {
319         int         i  = 0;
320         int         ret  = -1;
321
322         if (!conf)
323                 goto out;
324
325         for (i = 0; i < conf->subvolume_cnt; i++) {
326                 if (conf->decommissioned_bricks[i]) {
327                         conf->decommissioned_bricks[i] = NULL;
328                         conf->decommission_subvols_cnt--;
329                 }
330         }
331
332         ret = 0;
333 out:
334
335         return ret;
336 }
337 void
338 dht_init_regex (xlator_t *this, dict_t *odict, char *name,
339                 regex_t *re, gf_boolean_t *re_valid)
340 {
341         char    *temp_str;
342
343         if (dict_get_str (odict, name, &temp_str) != 0) {
344                 if (strcmp(name,"rsync-hash-regex")) {
345                         return;
346                 }
347                 temp_str = "^\\.(.+)\\.[^.]+$";
348         }
349
350         if (*re_valid) {
351                 regfree(re);
352                 *re_valid = _gf_false;
353         }
354
355         if (!strcmp(temp_str,"none")) {
356                 return;
357         }
358
359         if (regcomp(re,temp_str,REG_EXTENDED) == 0) {
360                 gf_msg_debug (this->name, 0,
361                               "using regex %s = %s", name, temp_str);
362                 *re_valid = _gf_true;
363         }
364         else {
365                 gf_msg (this->name, GF_LOG_WARNING, 0,
366                         DHT_MSG_REGEX_INFO,
367                         "compiling regex %s failed", temp_str);
368         }
369 }
370
371 int
372 dht_set_subvol_range(xlator_t *this)
373 {
374         int ret = -1;
375         dht_conf_t *conf = NULL;
376
377         conf = this->private;
378
379         if (!conf)
380                 goto out;
381
382         conf->leaf_to_subvol = dict_new();
383         if (!conf->leaf_to_subvol)
384                 goto out;
385
386         ret = glusterfs_reachable_leaves(this, conf->leaf_to_subvol);
387
388 out:
389         return ret;
390 }
391
392 int
393 dht_reconfigure (xlator_t *this, dict_t *options)
394 {
395         dht_conf_t      *conf = NULL;
396         char            *temp_str = NULL;
397         gf_boolean_t     search_unhashed;
398         int              ret = -1;
399         int              throttle_count = 0;
400
401         GF_VALIDATE_OR_GOTO ("dht", this, out);
402         GF_VALIDATE_OR_GOTO ("dht", options, out);
403
404         conf = this->private;
405         if (!conf)
406                 return 0;
407
408         if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) {
409                 /* If option is not "auto", other options _should_ be boolean*/
410                 if (strcasecmp (temp_str, "auto")) {
411                         if (!gf_string2boolean (temp_str, &search_unhashed)) {
412                                 gf_msg_debug(this->name, 0, "Reconfigure: "
413                                              "lookup-unhashed reconfigured(%s)",
414                                              temp_str);
415                                 conf->search_unhashed = search_unhashed;
416                         } else {
417                                 gf_msg(this->name, GF_LOG_ERROR, 0,
418                                        DHT_MSG_INVALID_OPTION,
419                                        "Invalid option: Reconfigure: "
420                                        "lookup-unhashed should be boolean,"
421                                        " not (%s), defaulting to (%d)",
422                                        temp_str, conf->search_unhashed);
423                                 ret = -1;
424                                 goto out;
425                         }
426                 } else {
427                         gf_msg_debug(this->name, 0, "Reconfigure:"
428                                      " lookup-unhashed reconfigured auto ");
429                         conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
430                 }
431         }
432
433         GF_OPTION_RECONF ("lookup-optimize", conf->lookup_optimize, options,
434                           bool, out);
435
436         GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options,
437                           percent_or_size, out);
438         /* option can be any one of percent or bytes */
439         conf->disk_unit = 0;
440         if (conf->min_free_disk < 100.0)
441                 conf->disk_unit = 'p';
442
443         GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,
444                           percent, out);
445
446         GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt,
447                           options, uint32, out);
448
449         GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options,
450                           bool, out);
451         GF_OPTION_RECONF ("randomize-hash-range-by-gfid",
452                           conf->randomize_by_gfid,
453                           options, bool, out);
454
455         GF_OPTION_RECONF ("rebal-throttle", conf->dthrottle, options,
456                           str, out);
457
458         if (conf->defrag) {
459                 GF_DECIDE_DEFRAG_THROTTLE_COUNT (throttle_count, conf);
460                 gf_msg ("DHT", GF_LOG_INFO, 0,
461                         DHT_MSG_REBAL_THROTTLE_INFO,
462                         "conf->dthrottle: %s, "
463                         "conf->defrag->recon_thread_count: %d",
464                          conf->dthrottle, conf->defrag->recon_thread_count);
465         }
466
467         if (conf->defrag) {
468                 GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats,
469                                   options, bool, out);
470         }
471
472         if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) {
473                 ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
474                 if (ret == -1)
475                         goto out;
476         } else {
477                 ret = dht_decommissioned_remove (this, conf);
478                 if (ret == -1)
479                         goto out;
480         }
481
482         dht_init_regex (this, options, "rsync-hash-regex",
483                         &conf->rsync_regex, &conf->rsync_regex_valid);
484         dht_init_regex (this, options, "extra-hash-regex",
485                         &conf->extra_regex, &conf->extra_regex_valid);
486
487         GF_OPTION_RECONF ("weighted-rebalance", conf->do_weighting, options,
488                           bool, out);
489
490         GF_OPTION_RECONF ("use-readdirp", conf->use_readdirp, options,
491                           bool, out);
492         ret = 0;
493 out:
494         return ret;
495 }
496
497 static int
498 gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data)
499 {
500         int                    ret = -1;
501         char                  *tmp_str = NULL;
502         char                  *tmp_str1 = NULL;
503         char                  *dup_str = NULL;
504         char                  *num = NULL;
505         char                  *pattern_str = NULL;
506         char                  *pattern = NULL;
507         gf_defrag_pattern_list_t *temp_list = NULL;
508         gf_defrag_pattern_list_t *pattern_list = NULL;
509
510         if (!this || !defrag || !data)
511                 goto out;
512
513         /* Get the pattern for pattern list. "pattern:<optional-size>"
514          * eg: *avi, *pdf:10MB, *:1TB
515          */
516         pattern_str = strtok_r (data, ",", &tmp_str);
517         while (pattern_str) {
518                 dup_str = gf_strdup (pattern_str);
519                 pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t),
520                                         1);
521                 if (!pattern_list) {
522                         goto out;
523                 }
524                 pattern = strtok_r (dup_str, ":", &tmp_str1);
525                 num = strtok_r (NULL, ":", &tmp_str1);
526                 if (!pattern)
527                         goto out;
528                 if (!num) {
529                         if (gf_string2bytesize_uint64(pattern, &pattern_list->size)
530                              == 0) {
531                                 pattern = "*";
532                         }
533                 } else if (gf_string2bytesize_uint64 (num, &pattern_list->size) != 0) {
534                         gf_msg (this->name, GF_LOG_ERROR, 0,
535                                 DHT_MSG_INVALID_OPTION,
536                                 "Invalid option. Defrag pattern:"
537                                 " Invalid number format \"%s\"", num);
538                         goto out;
539                 }
540                 memcpy (pattern_list->path_pattern, pattern, strlen (dup_str));
541
542                 if (!defrag->defrag_pattern)
543                         temp_list = NULL;
544                 else
545                         temp_list = defrag->defrag_pattern;
546
547                 pattern_list->next = temp_list;
548
549                 defrag->defrag_pattern = pattern_list;
550                 pattern_list = NULL;
551
552                 GF_FREE (dup_str);
553                 dup_str = NULL;
554
555                 pattern_str = strtok_r (NULL, ",", &tmp_str);
556         }
557
558         ret = 0;
559 out:
560         if (ret)
561                 GF_FREE (pattern_list);
562         GF_FREE (dup_str);
563
564         return ret;
565 }
566
567 int
568 dht_init (xlator_t *this)
569 {
570         dht_conf_t                      *conf           = NULL;
571         char                            *temp_str       = NULL;
572         int                              ret            = -1;
573         int                              i              = 0;
574         gf_defrag_info_t                *defrag         = NULL;
575         int                              cmd            = 0;
576         char                            *node_uuid      = NULL;
577         int                              throttle_count = 0;
578         uint32_t                         commit_hash    = 0;
579
580         GF_VALIDATE_OR_GOTO ("dht", this, err);
581
582         if (!this->children) {
583                 gf_msg (this->name, GF_LOG_CRITICAL, 0,
584                         DHT_MSG_INVALID_CONFIGURATION,
585                         "Distribute needs more than one subvolume");
586                 return -1;
587         }
588
589         if (!this->parents) {
590                 gf_msg (this->name, GF_LOG_WARNING, 0,
591                         DHT_MSG_INVALID_CONFIGURATION,
592                         "dangling volume. check volfile");
593         }
594
595         conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t);
596         if (!conf) {
597                 goto err;
598         }
599
600         /* We get the commit-hash to set only for rebalance process */
601         if (dict_get_uint32 (this->options,
602                              "commit-hash", &commit_hash) == 0) {
603                 gf_msg (this->name, GF_LOG_INFO, 0,
604                         DHT_MSG_COMMIT_HASH_INFO, "%s using commit hash %u",
605                         __func__, commit_hash);
606                 conf->vol_commit_hash = commit_hash;
607                 conf->vch_forced = _gf_true;
608         }
609
610         ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
611
612         if (cmd) {
613                 defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t),
614                                     gf_defrag_info_mt);
615
616                 GF_VALIDATE_OR_GOTO (this->name, defrag, err);
617
618                 LOCK_INIT (&defrag->lock);
619
620                 defrag->is_exiting = 0;
621
622                 conf->defrag = defrag;
623
624                 ret = dict_get_str (this->options, "node-uuid", &node_uuid);
625                 if (ret) {
626                         gf_msg (this->name, GF_LOG_ERROR, 0,
627                                 DHT_MSG_INVALID_CONFIGURATION,
628                                 "Invalid volume configuration: "
629                                 "node-uuid not specified");
630                         goto err;
631                 }
632
633                 if (gf_uuid_parse (node_uuid, defrag->node_uuid)) {
634                         gf_msg (this->name, GF_LOG_ERROR, 0,
635                                 DHT_MSG_INVALID_OPTION, "Invalid option:"
636                                 " Cannot parse glusterd node uuid");
637                         goto err;
638                 }
639
640                 defrag->cmd = cmd;
641
642                 defrag->stats = _gf_false;
643
644                 defrag->queue = NULL;
645
646                 defrag->crawl_done = 0;
647
648                 defrag->global_error = 0;
649
650                 defrag->q_entry_count = 0;
651
652                 defrag->wakeup_crawler = 0;
653
654                 synclock_init (&defrag->link_lock, SYNC_LOCK_DEFAULT);
655                 pthread_mutex_init (&defrag->dfq_mutex, 0);
656                 pthread_cond_init  (&defrag->parallel_migration_cond, 0);
657                 pthread_cond_init  (&defrag->rebalance_crawler_alarm, 0);
658                 pthread_cond_init  (&defrag->df_wakeup_thread, 0);
659
660                 defrag->global_error = 0;
661
662         }
663
664         conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
665         if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
666                 /* If option is not "auto", other options _should_ be boolean */
667                 if (strcasecmp (temp_str, "auto")) {
668                         ret = gf_string2boolean (temp_str,
669                                                  &conf->search_unhashed);
670                         if (ret == -1)
671                                 goto err;
672                 }
673                 else
674                         conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
675         }
676
677         GF_OPTION_INIT ("lookup-optimize", conf->lookup_optimize, bool, err);
678
679         GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool,
680                         err);
681
682         GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);
683
684         GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size,
685                         err);
686
687         GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,
688                         err);
689
690         conf->dir_spread_cnt = conf->subvolume_cnt;
691         GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt,
692                         uint32, err);
693
694         GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down,
695                         bool, err);
696
697         GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err);
698
699         if (defrag) {
700                 GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err);
701                 if (dict_get_str (this->options, "rebalance-filter", &temp_str)
702                     == 0) {
703                         if (gf_defrag_pattern_list_fill (this, defrag, temp_str)
704                             == -1) {
705                                 gf_msg (this->name, GF_LOG_ERROR, 0,
706                                         DHT_MSG_INVALID_OPTION,
707                                         "Invalid option:"
708                                         " Cannot parse rebalance-filter (%s)",
709                                         temp_str);
710
711                                 goto err;
712                         }
713                 }
714         }
715
716         /* option can be any one of percent or bytes */
717         conf->disk_unit = 0;
718         if (conf->min_free_disk < 100)
719                 conf->disk_unit = 'p';
720
721         ret = dht_init_subvolumes (this, conf);
722         if (ret == -1) {
723                 goto err;
724         }
725
726         if (cmd) {
727                 ret = dht_init_local_subvolumes (this, conf);
728                 if (ret) {
729                         gf_msg (this->name, GF_LOG_ERROR, 0,
730                                 DHT_MSG_INIT_LOCAL_SUBVOL_FAILED,
731                                 "dht_init_local_subvolumes failed");
732                         goto err;
733                 }
734         }
735
736         if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) {
737                 ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
738                 if (ret == -1)
739                         goto err;
740         }
741
742         dht_init_regex (this, this->options, "rsync-hash-regex",
743                         &conf->rsync_regex, &conf->rsync_regex_valid);
744         dht_init_regex (this, this->options, "extra-hash-regex",
745                         &conf->extra_regex, &conf->extra_regex_valid);
746
747         ret = dht_layouts_init (this, conf);
748         if (ret == -1) {
749                 goto err;
750         }
751
752         LOCK_INIT (&conf->subvolume_lock);
753         LOCK_INIT (&conf->layout_lock);
754
755         conf->gen = 1;
756
757         this->local_pool = mem_pool_new (dht_local_t, 512);
758         if (!this->local_pool) {
759                 gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
760                         DHT_MSG_NO_MEMORY,
761                         " DHT initialisation failed. "
762                         "failed to create local_t's memory pool");
763                 goto err;
764         }
765
766         GF_OPTION_INIT ("randomize-hash-range-by-gfid",
767                         conf->randomize_by_gfid, bool, err);
768
769         if (defrag) {
770                 GF_OPTION_INIT ("rebal-throttle",
771                                  conf->dthrottle, str, err);
772
773                 GF_DECIDE_DEFRAG_THROTTLE_COUNT(throttle_count, conf);
774
775                 gf_msg_debug ("DHT", 0, "conf->dthrottle: %s, "
776                               "conf->defrag->recon_thread_count: %d",
777                               conf->dthrottle,
778                               conf->defrag->recon_thread_count);
779         }
780
781         GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err);
782         gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR,
783                      conf->xattr_name);
784         gf_asprintf (&conf->commithash_xattr_name, "%s."DHT_COMMITHASH_STR,
785                      conf->xattr_name);
786         gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name);
787         if (!conf->link_xattr_name || !conf->wild_xattr_name) {
788                 goto err;
789         }
790
791         GF_OPTION_INIT ("weighted-rebalance", conf->do_weighting, bool, err);
792
793         conf->lock_pool = mem_pool_new (dht_lock_t, 512);
794         if (!conf->lock_pool) {
795                 gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_INIT_FAILED,
796                         "failed to create lock mem_pool, failing "
797                         "initialization");
798                 goto err;
799         }
800
801         this->private = conf;
802
803         if (dht_set_subvol_range(this))
804                 goto err;
805
806         conf->methods = &dht_methods;
807
808         return 0;
809
810 err:
811         if (conf) {
812                 if (conf->file_layouts) {
813                         for (i = 0; i < conf->subvolume_cnt; i++) {
814                                 GF_FREE (conf->file_layouts[i]);
815                         }
816                         GF_FREE (conf->file_layouts);
817                 }
818
819                 GF_FREE (conf->subvolumes);
820
821                 GF_FREE (conf->subvolume_status);
822
823                 GF_FREE (conf->du_stats);
824
825                 GF_FREE (conf->defrag);
826
827                 GF_FREE (conf->xattr_name);
828                 GF_FREE (conf->link_xattr_name);
829                 GF_FREE (conf->wild_xattr_name);
830
831                 if (conf->lock_pool)
832                         mem_pool_destroy (conf->lock_pool);
833
834                 GF_FREE (conf);
835         }
836
837         return -1;
838 }
839
840
841 struct volume_options options[] = {
842         { .key  = {"lookup-unhashed"},
843           .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
844                     "on", "off"},
845           .type = GF_OPTION_TYPE_STR,
846           .default_value = "on",
847           .description = "This option if set to ON, does a lookup through "
848           "all the sub-volumes, in case a lookup didn't return any result "
849           "from the hash subvolume. If set to OFF, it does not do a lookup "
850           "on the remaining subvolumes."
851         },
852         { .key = {"lookup-optimize"},
853           .type = GF_OPTION_TYPE_BOOL,
854           .default_value = "off",
855           .description = "This option if set to ON enables the optimization "
856           "of -ve lookups, by not doing a lookup on non-hashed subvolumes for "
857           "files, in case the hashed subvolume does not return any result. "
858           "This option disregards the lookup-unhashed setting, when enabled."
859         },
860         { .key  = {"min-free-disk"},
861           .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
862           .default_value = "10%",
863           .description = "Percentage/Size of disk space, after which the "
864           "process starts balancing out the cluster, and logs will appear "
865           "in log files",
866         },
867         { .key  = {"min-free-inodes"},
868           .type = GF_OPTION_TYPE_PERCENT,
869           .default_value = "5%",
870           .description = "after system has only N% of inodes, warnings "
871           "starts to appear in log files",
872         },
873         { .key = {"unhashed-sticky-bit"},
874           .type = GF_OPTION_TYPE_BOOL,
875           .default_value = "off",
876         },
877         { .key = {"use-readdirp"},
878           .type = GF_OPTION_TYPE_BOOL,
879           .default_value = "on",
880           .description = "This option if set to ON, forces the use of "
881           "readdirp, and hence also displays the stats of the files."
882         },
883         { .key = {"assert-no-child-down"},
884           .type = GF_OPTION_TYPE_BOOL,
885           .default_value = "off",
886           .description = "This option if set to ON, in the event of "
887           "CHILD_DOWN, will call exit."
888         },
889         { .key  = {"directory-layout-spread"},
890           .type = GF_OPTION_TYPE_INT,
891           .min  = 1,
892           .validate = GF_OPT_VALIDATE_MIN,
893           .description = "Specifies the directory layout spread. Takes number "
894                          "of subvolumes as default value."
895         },
896         { .key  = {"decommissioned-bricks"},
897           .type = GF_OPTION_TYPE_ANY,
898           .description = "This option if set to ON, decommissions "
899           "the brick, so that no new data is allowed to be created "
900           "on that brick."
901         },
902         { .key  = {"rebalance-cmd"},
903           .type = GF_OPTION_TYPE_INT,
904         },
905         { .key = {"commit-hash"},
906           .type = GF_OPTION_TYPE_INT,
907         },
908         { .key = {"node-uuid"},
909           .type = GF_OPTION_TYPE_STR,
910         },
911         { .key = {"rebalance-stats"},
912           .type = GF_OPTION_TYPE_BOOL,
913           .default_value = "off",
914           .description = "This option if set to ON displays and logs the "
915           " time taken for migration of each file, during the rebalance "
916           "process. If set to OFF, the rebalance logs will only display the "
917           "time spent in each directory."
918         },
919         { .key = {"readdir-optimize"},
920           .type = GF_OPTION_TYPE_BOOL,
921           .default_value = "off",
922           .description = "This option if set to ON enables the optimization "
923           "that allows DHT to requests non-first subvolumes to filter out "
924           "directory entries."
925         },
926         { .key = {"rsync-hash-regex"},
927           .type = GF_OPTION_TYPE_STR,
928           /* Setting a default here doesn't work.  See dht_init_regex. */
929           .description = "Regular expression for stripping temporary-file "
930           "suffix and prefix used by rsync, to prevent relocation when the "
931           "file is renamed."
932         },
933         { .key = {"extra-hash-regex"},
934           .type = GF_OPTION_TYPE_STR,
935           /* Setting a default here doesn't work.  See dht_init_regex. */
936           .description = "Regular expression for stripping temporary-file "
937           "suffix and prefix used by an application, to prevent relocation when "
938           "the file is renamed."
939         },
940         { .key = {"rebalance-filter"},
941           .type = GF_OPTION_TYPE_STR,
942         },
943
944         { .key = {"xattr-name"},
945           .type = GF_OPTION_TYPE_STR,
946           .default_value = "trusted.glusterfs.dht",
947           .description = "Base for extended attributes used by this "
948           "translator instance, to avoid conflicts with others above or "
949           "below it."
950         },
951
952         { .key = {"weighted-rebalance"},
953           .type = GF_OPTION_TYPE_BOOL,
954           .default_value = "on",
955           .description = "When enabled, files will be allocated to bricks "
956           "with a probability proportional to their size.  Otherwise, all "
957           "bricks will have the same probability (legacy behavior)."
958         },
959
960         /* NUFA option */
961         { .key  = {"local-volume-name"},
962           .type = GF_OPTION_TYPE_XLATOR
963         },
964
965         /* tier options */
966         { .key  = {"tier-promote-frequency"},
967           .type = GF_OPTION_TYPE_INT,
968           .default_value = "120",
969           .description = "Frequency to promote files to fast tier"
970         },
971
972         { .key  = {"tier-demote-frequency"},
973           .type = GF_OPTION_TYPE_INT,
974           .default_value = "120",
975           .description = "Frequency to demote files to slow tier"
976         },
977
978         { .key  = {"write-freq-threshold"},
979           .type = GF_OPTION_TYPE_INT,
980           .default_value = "0",
981         },
982
983         { .key  = {"read-freq-threshold"},
984           .type = GF_OPTION_TYPE_INT,
985           .default_value = "0",
986         },
987         { .key         = {"watermark-hi"},
988           .type = GF_OPTION_TYPE_PERCENT,
989           .default_value = "90",
990         },
991         { .key         = {"watermark-low"},
992           .type = GF_OPTION_TYPE_PERCENT,
993           .default_value = "75",
994         },
995         { .key         = {"tier-mode"},
996           .type = GF_OPTION_TYPE_STR,
997           .default_value = "test",
998         },
999         { .key         = {"tier-max-mb"},
1000           .type = GF_OPTION_TYPE_INT,
1001           .default_value = "1000",
1002         },
1003         { .key         = {"tier-max-files"},
1004           .type = GF_OPTION_TYPE_INT,
1005           .default_value = "5000",
1006         },
1007         /* switch option */
1008         { .key  = {"pattern.switch.case"},
1009           .type = GF_OPTION_TYPE_ANY
1010         },
1011
1012         { .key =  {"randomize-hash-range-by-gfid"},
1013           .type = GF_OPTION_TYPE_BOOL,
1014           .default_value = "off",
1015           .description = "Use gfid of directory to determine the subvolume "
1016           "from which hash ranges are allocated starting with 0. "
1017           "Note that we still use a directory/file's name to determine the "
1018           "subvolume to which it hashes"
1019         },
1020
1021         { .key =  {"rebal-throttle"},
1022           .type = GF_OPTION_TYPE_STR,
1023           .default_value = "normal",
1024           .description = " Sets the maximum number of parallel file migrations "
1025                          "allowed on a node during the rebalance operation. The"
1026                          " default value is normal and allows a max of "
1027                          "[($(processing units) - 4) / 2), 2]  files to be "
1028                          "migrated at a time. Lazy will allow only one file to "
1029                          "be migrated at a time and aggressive will allow "
1030                          "max of [($(processing units) - 4) / 2), 4]"
1031         },
1032
1033         { .key  = {NULL} },
1034 };