diff -r f49a0cab20aa include/crm/msg_xml.h --- a/include/crm/msg_xml.h Thu Nov 12 12:18:10 2009 +0100 +++ b/include/crm/msg_xml.h Fri Nov 13 14:08:16 2009 +0800 @@ -130,6 +130,7 @@ #define XML_TAG_ATTRS "attributes" #define XML_TAG_PARAMS "parameters" #define XML_TAG_PARAM "param" +#define XML_TAG_UTILIZATION "utilization" #define XML_TAG_RESOURCE_REF "resource_ref" #define XML_CIB_TAG_RESOURCE "primitive" diff -r f49a0cab20aa include/crm/pengine/status.h --- a/include/crm/pengine/status.h Thu Nov 12 12:18:10 2009 +0100 +++ b/include/crm/pengine/status.h Fri Nov 13 14:08:16 2009 +0800 @@ -68,6 +68,7 @@ char *dc_uuid; node_t *dc_node; const char *stonith_action; + const char *placement_strategy; unsigned long long flags; @@ -116,6 +117,8 @@ GHashTable *attrs; /* char* => char* */ enum node_type type; + + GHashTable *utilization; }; struct node_s { @@ -186,6 +189,7 @@ GHashTable *meta; GHashTable *parameters; + GHashTable *utilization; GListPtr children; /* resource_t* */ }; diff -r f49a0cab20aa lib/pengine/common.c --- a/lib/pengine/common.c Thu Nov 12 12:18:10 2009 +0100 +++ b/lib/pengine/common.c Fri Nov 13 14:08:16 2009 +0800 @@ -80,6 +80,24 @@ return FALSE; } +static gboolean +check_placement_strategy(const char *value) +{ + if(safe_str_eq(value, "default")) { + return TRUE; + + } else if(safe_str_eq(value, "utilization")) { + return TRUE; + + } else if(safe_str_eq(value, "minimal")) { + return TRUE; + + } else if(safe_str_eq(value, "balanced")) { + return TRUE; + } + return FALSE; +} + pe_cluster_option pe_opts[] = { /* name, old-name, validate, default, description */ { "no-quorum-policy", "no_quorum_policy", "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, @@ -147,6 +165,10 @@ { "node-health-red", NULL, "integer", NULL, "-INFINITY", &check_number, "The score 'red' translates to in rsc_location constraints", "Only used when node-health-strategy is set to custom or progressive." }, + + /*Placement Strategy*/ + { "placement-strategy", NULL, "enum", "default, utilization, minimal, balanced", "default", &check_placement_strategy, + "The strategy to determine resource placement", NULL}, }; void diff -r f49a0cab20aa lib/pengine/complex.c --- a/lib/pengine/complex.c Thu Nov 12 12:18:10 2009 +0100 +++ b/lib/pengine/complex.c Fri Nov 13 14:08:16 2009 +0800 @@ -371,6 +371,12 @@ if(safe_str_eq(class, "stonith")) { set_bit_inplace(data_set->flags, pe_flag_have_stonith_resource); } + + (*rsc)->utilization = g_hash_table_new_full( + g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); + + unpack_instance_attributes(data_set->input, (*rsc)->xml, XML_TAG_UTILIZATION, NULL, + (*rsc)->utilization, NULL, FALSE, data_set->now); /* data_set->resources = g_list_append(data_set->resources, (*rsc)); */ return TRUE; @@ -451,6 +457,9 @@ if(rsc->meta != NULL) { g_hash_table_destroy(rsc->meta); } + if(rsc->utilization != NULL) { + g_hash_table_destroy(rsc->utilization); + } if(rsc->parent == NULL && is_set(rsc->flags, pe_rsc_orphan)) { free_xml(rsc->xml); } diff -r f49a0cab20aa lib/pengine/status.c --- a/lib/pengine/status.c Thu Nov 12 12:18:10 2009 +0100 +++ b/lib/pengine/status.c Fri Nov 13 14:08:16 2009 +0800 @@ -159,6 +159,9 @@ if(details->attrs != NULL) { g_hash_table_destroy(details->attrs); } + if(details->utilization != NULL) { + g_hash_table_destroy(details->utilization); + } pe_free_shallow_adv(details->running_rsc, FALSE); pe_free_shallow_adv(details->allocated_rsc, FALSE); crm_free(details); diff -r f49a0cab20aa lib/pengine/unpack.c --- a/lib/pengine/unpack.c Thu Nov 12 12:18:10 2009 +0100 +++ b/lib/pengine/unpack.c Fri Nov 13 14:08:16 2009 +0800 @@ -165,6 +165,9 @@ crm_info("Node scores: 'red' = %s, 'yellow' = %s, 'green' = %s", score2char(node_score_red),score2char(node_score_yellow), score2char(node_score_green)); + + data_set->placement_strategy = pe_pref(data_set->config_hash, "placement-strategy"); + crm_debug_2("Placement strategy: %s", data_set->placement_strategy); return TRUE; } @@ -233,6 +236,9 @@ new_node->details->attrs = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); + new_node->details->utilization = g_hash_table_new_full( + g_str_hash, g_str_equal, + g_hash_destroy_str, g_hash_destroy_str); /* if(data_set->have_quorum == FALSE */ /* && data_set->no_quorum_policy == no_quorum_stop) { */ @@ -258,6 +264,10 @@ } add_node_attrs(xml_obj, new_node, FALSE, data_set); + unpack_instance_attributes( + data_set->input, xml_obj, XML_TAG_UTILIZATION, NULL, + new_node->details->utilization, NULL, FALSE, data_set->now); + data_set->nodes = g_list_append(data_set->nodes, new_node); crm_debug_3("Done with node %s", crm_element_value(xml_obj, XML_ATTR_UNAME)); diff -r f49a0cab20aa pengine/clone.c --- a/pengine/clone.c Thu Nov 12 12:18:10 2009 +0100 +++ b/pengine/clone.c Fri Nov 13 14:08:16 2009 +0800 @@ -26,7 +26,7 @@ #define VARIANT_CLONE 1 #include -gint sort_clone_instance(gconstpointer a, gconstpointer b); +gint sort_clone_instance(gconstpointer a, gconstpointer b, gpointer data_set); void child_stopping_constraints( clone_variant_data_t *clone_data, @@ -65,7 +65,7 @@ } -gint sort_clone_instance(gconstpointer a, gconstpointer b) +gint sort_clone_instance(gconstpointer a, gconstpointer b, gpointer data_set) { int level = LOG_DEBUG_3; node_t *node1 = NULL; @@ -201,8 +201,8 @@ GListPtr list1 = node_list_dup(resource1->allowed_nodes, FALSE, FALSE); GListPtr list2 = node_list_dup(resource2->allowed_nodes, FALSE, FALSE); - list1 = g_list_sort(list1, sort_node_weight); - list2 = g_list_sort(list2, sort_node_weight); + list1 = g_list_sort_with_data(list1, sort_node_weight, data_set); + list2 = g_list_sort_with_data(list2, sort_node_weight, data_set); max = g_list_length(list1); if(max < g_list_length(list2)) { max = g_list_length(list2); @@ -275,8 +275,8 @@ constraint->score/INFINITY, FALSE); ); - list1 = g_list_sort(list1, sort_node_weight); - list2 = g_list_sort(list2, sort_node_weight); + list1 = g_list_sort_with_data(list1, sort_node_weight, data_set); + list2 = g_list_sort_with_data(list2, sort_node_weight, data_set); max = g_list_length(list1); if(max < g_list_length(list2)) { max = g_list_length(list2); @@ -457,15 +457,15 @@ } ); - rsc->children = g_list_sort(rsc->children, sort_clone_instance); + rsc->children = g_list_sort_with_data(rsc->children, sort_clone_instance, data_set); /* count now tracks the number of clones we have allocated */ slist_iter(node, node_t, rsc->allowed_nodes, lpc, node->count = 0; ); - rsc->allowed_nodes = g_list_sort( - rsc->allowed_nodes, sort_node_weight); + rsc->allowed_nodes = g_list_sort_with_data( + rsc->allowed_nodes, sort_node_weight, data_set); slist_iter(node, node_t, rsc->allowed_nodes, lpc, if(can_run_resources(node)) { diff -r f49a0cab20aa pengine/master.c --- a/pengine/master.c Thu Nov 12 12:18:10 2009 +0100 +++ b/pengine/master.c Fri Nov 13 14:08:16 2009 +0800 @@ -26,7 +26,7 @@ #define VARIANT_CLONE 1 #include -extern gint sort_clone_instance(gconstpointer a, gconstpointer b); +extern gint sort_clone_instance(gconstpointer a, gconstpointer b, gpointer data_set); extern int master_score(resource_t *rsc, node_t *node, int not_set_value); @@ -227,7 +227,7 @@ return NULL; } -static gint sort_master_instance(gconstpointer a, gconstpointer b) +static gint sort_master_instance(gconstpointer a, gconstpointer b, gpointer data_set) { int rc; enum rsc_role_e role1 = RSC_ROLE_UNKNOWN; @@ -254,10 +254,10 @@ return 1; } - return sort_clone_instance(a, b); + return sort_clone_instance(a, b, data_set); } -static void master_promotion_order(resource_t *rsc) +static void master_promotion_order(resource_t *rsc, pe_working_set_t *data_set) { node_t *node = NULL; node_t *chosen = NULL; @@ -340,7 +340,7 @@ crm_debug_2("%s: %d", child->id, child->sort_index); ); - rsc->children = g_list_sort(rsc->children, sort_master_instance); + rsc->children = g_list_sort_with_data(rsc->children, sort_master_instance, data_set); } int @@ -591,7 +591,7 @@ ); - master_promotion_order(rsc); + master_promotion_order(rsc, data_set); /* mark the first N as masters */ slist_iter( diff -r f49a0cab20aa pengine/native.c --- a/pengine/native.c Thu Nov 12 12:18:10 2009 +0100 +++ b/pengine/native.c Fri Nov 13 14:08:16 2009 +0800 @@ -75,7 +75,31 @@ static gboolean -native_choose_node(resource_t *rsc) +have_enough_capacity(node_t *node, resource_t *rsc) +{ + GHashTableIter iter; + const char *key = NULL; + const char *value = NULL; + int required = 0; + int remaining = 0; + int rc = TRUE; + + g_hash_table_iter_init(&iter, rsc->utilization); + while (g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) { + required = crm_parse_int(value, "0"); + remaining = crm_parse_int(g_hash_table_lookup(node->details->utilization, key), "0"); + + if (required > remaining) { + crm_debug("Node %s has no enough %s for resource %s: required=%d remaining=%d", + node->details->uname, key, rsc->id, required, remaining); + rc = FALSE; + } + } + return rc; +} + +static gboolean +native_choose_node(resource_t *rsc, pe_working_set_t *data_set) { /* 1. Sort by weight @@ -83,12 +107,28 @@ with the fewest resources 3. remove color.chosen_node from all other colors */ + int alloc_details = scores_log_level+1; + GListPtr nodes = NULL; node_t *chosen = NULL; int lpc = 0; int multiple = 0; - int length = g_list_length(rsc->allowed_nodes); + int length = 0; + + if (safe_str_neq(data_set->placement_strategy, "default")) { + slist_iter( + node, node_t, data_set->nodes, lpc, + if (have_enough_capacity(node, rsc) == FALSE) { + crm_debug("Resource %s cannot be allocated to node %s: none of enough capacity", + rsc->id, node->details->uname); + resource_location(rsc, node, -INFINITY, "__limit_utilization_", data_set); + } + ); + dump_node_scores(alloc_details, rsc, "Post-utilization", rsc->allowed_nodes); + } + + length = g_list_length(rsc->allowed_nodes); if(is_not_set(rsc->flags, pe_rsc_provisional)) { return rsc->allocated_to?TRUE:FALSE; @@ -98,7 +138,7 @@ rsc->id, length); if(rsc->allowed_nodes) { - rsc->allowed_nodes = g_list_sort(rsc->allowed_nodes, sort_node_weight); + rsc->allowed_nodes = g_list_sort_with_data(rsc->allowed_nodes, sort_node_weight, data_set); nodes = rsc->allowed_nodes; chosen = g_list_nth_data(nodes, 0); @@ -327,7 +367,7 @@ native_assign_node(rsc, NULL, NULL, TRUE); } else if(is_set(rsc->flags, pe_rsc_provisional) - && native_choose_node(rsc) ) { + && native_choose_node(rsc, data_set) ) { crm_debug_3("Allocated resource %s to %s", rsc->id, rsc->allocated_to->details->uname); diff -r f49a0cab20aa pengine/utils.c --- a/pengine/utils.c Thu Nov 12 12:18:10 2009 +0100 +++ b/pengine/utils.c Fri Nov 13 14:08:16 2009 +0800 @@ -189,17 +189,65 @@ return TRUE; } + +/* rc < 0 if 'node1' has more capacity remaining + * rc > 0 if 'node1' has less capacity remaining + */ +static int +compare_capacity(const node_t *node1, const node_t *node2) +{ + GHashTableIter iter; + const char *key = NULL; + const char *value = NULL; + int node1_capacity = 0; + int node2_capacity = 0; + int result = 0; + + g_hash_table_iter_init(&iter, node1->details->utilization); + while (g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) { + node1_capacity = crm_parse_int(value, "0"); + node2_capacity = crm_parse_int(g_hash_table_lookup(node2->details->utilization, key), "0"); + + if (node1_capacity > node2_capacity) { + result += -1; + } else if (node1_capacity < node2_capacity) { + result += 1; + } + } + + g_hash_table_iter_init(&iter, node2->details->utilization); + while (g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) { + if (g_hash_table_lookup_extended(node1->details->utilization, key, NULL, NULL)) { + continue; + } + + node1_capacity = 0; + node2_capacity = crm_parse_int(value, "0"); + + if (node1_capacity > node2_capacity) { + result += -1; + } else if (node1_capacity < node2_capacity) { + result += 1; + } + } + + return result; +} + /* return -1 if 'a' is more preferred * return 1 if 'b' is more preferred */ -gint sort_node_weight(gconstpointer a, gconstpointer b) +gint sort_node_weight(gconstpointer a, gconstpointer b, gpointer data) { int level = LOG_DEBUG_3; const node_t *node1 = (const node_t*)a; const node_t *node2 = (const node_t*)b; + const pe_working_set_t *data_set = (const pe_working_set_t*)data; int node1_weight = 0; int node2_weight = 0; + + int result = 0; if(a == NULL) { return 1; } if(b == NULL) { return -1; } @@ -231,6 +279,17 @@ do_crm_log_unlikely(level, "%s (%d) == %s (%d) : weight", node1->details->uname, node1_weight, node2->details->uname, node2_weight); + + if (safe_str_eq(data_set->placement_strategy, "minimal")) { + goto equal; + } + + if (safe_str_eq(data_set->placement_strategy, "balanced")) { + result = compare_capacity(node1, node2); + if (result != 0) { + return result; + } + } /* now try to balance resources across the cluster */ if(node1->details->num_resources @@ -248,10 +307,36 @@ return 1; } +equal: do_crm_log_unlikely(level, "%s = %s", node1->details->uname, node2->details->uname); return 0; } +/* Specify 'allocate' to TRUE when allocating + * Otherwise to FALSE when deallocating + */ +static void +calculate_utilization(node_t *node, resource_t *rsc, gboolean allocate) +{ + GHashTableIter iter; + const char *key = NULL; + const char *value = NULL; + const char *capacity = NULL; + char *remain_capacity = NULL; + + g_hash_table_iter_init(&iter, rsc->utilization); + while (g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) { + capacity = g_hash_table_lookup(node->details->utilization, key); + if (capacity) { + if (allocate) { + remain_capacity = crm_itoa(crm_parse_int(capacity, "0") - crm_parse_int(value, "0")); + } else { + remain_capacity = crm_itoa(crm_parse_int(capacity, "0") + crm_parse_int(value, "0")); + } + g_hash_table_replace(node->details->utilization, crm_strdup(key), remain_capacity); + } + } +} gboolean native_assign_node(resource_t *rsc, GListPtr nodes, node_t *chosen, gboolean force) @@ -284,6 +369,7 @@ old->details->allocated_rsc, rsc); old->details->num_resources--; old->count--; + calculate_utilization(old, rsc, FALSE); } crm_debug("Assigning %s to %s", chosen->details->uname, rsc->id); @@ -293,6 +379,7 @@ chosen->details->allocated_rsc = g_list_append(chosen->details->allocated_rsc, rsc); chosen->details->num_resources++; chosen->count++; + calculate_utilization(chosen, rsc, TRUE); return TRUE; } diff -r f49a0cab20aa pengine/utils.h --- a/pengine/utils.h Thu Nov 12 12:18:10 2009 +0100 +++ b/pengine/utils.h Fri Nov 13 14:08:16 2009 +0800 @@ -47,7 +47,7 @@ extern rsc_to_node_t *generate_location_rule( resource_t *rsc, xmlNode *location_rule, pe_working_set_t *data_set); -extern gint sort_node_weight(gconstpointer a, gconstpointer b); +extern gint sort_node_weight(gconstpointer a, gconstpointer b, gpointer data_set); extern gboolean can_run_resources(const node_t *node); extern gboolean native_assign_node(resource_t *rsc, GListPtr candidates, node_t *chosen, gboolean force); diff -r f49a0cab20aa xml/pacemaker.rng.in --- a/xml/pacemaker.rng.in Thu Nov 12 12:18:10 2009 +0100 +++ b/xml/pacemaker.rng.in Fri Nov 13 14:08:16 2009 +0800 @@ -104,9 +104,14 @@ - - - + + + + + + + + diff -r f49a0cab20aa xml/resources.rng.in --- a/xml/resources.rng.in Thu Nov 12 12:18:10 2009 +0100 +++ b/xml/resources.rng.in Fri Nov 13 14:08:16 2009 +0800 @@ -39,6 +39,11 @@ + + + + +