Project

General

Profile

Statistics
| Branch: | Tag: | Revision:

birq / balance.c @ 4534af0a

History | View | Annotate | Download (9.57 KB)

1
/* balance.c
2
 * Balance IRQs.
3
 */
4

    
5
#include <stdlib.h>
6
#include <stdio.h>
7
#include <string.h>
8
#include <sys/types.h>
9
#include <dirent.h>
10
#include <limits.h>
11
#include <ctype.h>
12
#include <sys/stat.h>
13
#include <fcntl.h>
14
#include <unistd.h> /* open, write */
15

    
16
#include "statistics.h"
17
#include "cpu.h"
18
#include "irq.h"
19
#include "balance.h"
20

    
21
/* Decrease the weight of every IRQ assigned to the specified CPU by
 * "value". A weight of zero marks an IRQ as movable again, so this is
 * how the "don't move this IRQ for a while" mark set by
 * choose_irqs_to_move() eventually expires.
 * Returns 0 on success, -1 on bad arguments (NULL cpu or negative value).
 */
static int dec_weight(cpu_t *cpu, int value)
{
	lub_list_node_t *iter;

	if (!cpu)
		return -1;
	if (value < 0)
		return -1;

	for (iter = lub_list_iterator_init(cpu->irqs); iter;
		iter = lub_list_iterator_next(iter)) {
		irq_t *irq;
		irq = (irq_t *)lub_list_node__get_data(iter);
		/* Never let the weight go negative: only decrement
		   when there is enough weight left. */
		if (irq->weight >= value)
			irq->weight -= value;
	}

	return 0;
}
41

    
42
/* Remove IRQ from specified CPU */
43
int remove_irq_from_cpu(irq_t *irq, cpu_t *cpu)
44
{
45
        lub_list_node_t *node;
46

    
47
        if (!irq || !cpu)
48
                return -1;
49

    
50
        irq->cpu = NULL;
51
        node = lub_list_search(cpu->irqs, irq);
52
        if (!node)
53
                return 0;
54
        lub_list_del(cpu->irqs, node);
55
        lub_list_node_free(node);
56

    
57
        return 0;
58
}
59

    
60
/* Move IRQ to specified CPU. Remove IRQ from the IRQ list
61
 * of old CPU.
62
 */
63
int move_irq_to_cpu(irq_t *irq, cpu_t *cpu)
64
{
65
        if (!irq || !cpu)
66
                return -1;
67

    
68
        if (irq->cpu) {
69
                cpu_t *old_cpu = irq->cpu;
70
                remove_irq_from_cpu(irq, old_cpu);
71
                dec_weight(old_cpu, 1);
72
        }
73
        dec_weight(cpu, 1);
74
        irq->cpu = cpu;
75
        lub_list_add(cpu->irqs, irq);
76

    
77
        return 0;
78
}
79

    
80
/* Search for the best CPU. Best CPU is a CPU with minimal load.
81
   If several CPUs have the same load then the best CPU is a CPU
82
   with minimal number of assigned IRQs */
83
static cpu_t *choose_cpu(lub_list_t *cpus, cpumask_t *cpumask, float load_limit)
84
{
85
        lub_list_node_t *iter;
86
        lub_list_t * min_cpus = NULL;
87
        float min_load = 100.00;
88
        lub_list_node_t *node;
89
        cpu_t *cpu = NULL;
90

    
91
        for (iter = lub_list_iterator_init(cpus); iter;
92
                iter = lub_list_iterator_next(iter)) {
93
                cpu = (cpu_t *)lub_list_node__get_data(iter);
94
                if (!cpu_isset(cpu->id, *cpumask))
95
                        continue;
96
                if (cpu->load >= load_limit)
97
                        continue;
98
                if ((!min_cpus) || (cpu->load < min_load)) {
99
                        min_load = cpu->load;
100
                        if (!min_cpus)
101
                                min_cpus = lub_list_new(cpu_list_compare_len);
102
                        while ((node = lub_list__get_tail(min_cpus))) {
103
                                lub_list_del(min_cpus, node);
104
                                lub_list_node_free(node);
105
                        }
106
                        lub_list_add(min_cpus, cpu);
107
                }
108
                if (cpu->load == min_load)
109
                        lub_list_add(min_cpus, cpu);
110
        }
111
        if (!min_cpus)
112
                return NULL;
113
        node = lub_list__get_head(min_cpus);
114
        cpu = (cpu_t *)lub_list_node__get_data(node);
115
        while ((node = lub_list__get_tail(min_cpus))) {
116
                lub_list_del(min_cpus, node);
117
                lub_list_node_free(node);
118
        }
119
        lub_list_free(min_cpus);
120

    
121
        return cpu;
122
}
123

    
124
/* Write the cpumask to /proc/irq/<irq>/smp_affinity so the kernel
 * actually re-routes the IRQ. If the kernel rejects the write the IRQ
 * is blacklisted and detached from its CPU so the balancer never
 * considers it again.
 * Returns 0 on success or after blacklisting, -1 on NULL irq or when
 * the smp_affinity file cannot be opened.
 */
static int irq_set_affinity(irq_t *irq, cpumask_t *cpumask)
{
	char path[PATH_MAX];
	char buf[NR_CPUS + 1];
	int f;

	if (!irq)
		return -1;

	snprintf(path, sizeof(path),
		"%s/%u/smp_affinity", PROC_IRQ, irq->irq);
	/* Guarantee termination even if snprintf truncated */
	path[sizeof(path) - 1] = '\0';
	/* O_SYNC so the write error (if any) surfaces on write(), not
	   on some later flush we would never see. */
	if ((f = open(path, O_WRONLY | O_SYNC)) < 0)
		return -1;
	cpumask_scnprintf(buf, sizeof(buf), *cpumask);
	buf[sizeof(buf) - 1] = '\0';
	if (write(f, buf, strlen(buf)) < 0) {
		/* The affinity for some IRQ can't be changed. So don't
		   consider such IRQs. The example is IRQ 0 - timer.
		   Blacklist this IRQ. Note fprintf() without fflush()
		   will not return I/O error due to buffers. */
		irq->blacklisted = 1;
		remove_irq_from_cpu(irq, irq->cpu);
		printf("Blacklist IRQ %u\n", irq->irq);
	}
	close(f);

	return 0;
}
153

    
154
/* Find best CPUs for IRQs that need to be balanced.
 * For each IRQ in balance_irqs a target CPU is chosen among the CPUs
 * local to the IRQ's NUMA node, excluding exclude_cpus and any CPU
 * whose load is at or above load_limit. Chosen moves are applied to
 * the in-memory model via move_irq_to_cpu(); the kernel-side affinity
 * write happens later in apply_affinity().
 * Always returns 0.
 */
int balance(lub_list_t *cpus, lub_list_t *balance_irqs,
	float load_limit, cpumask_t *exclude_cpus)
{
	lub_list_node_t *iter;

	for (iter = lub_list_iterator_init(balance_irqs); iter;
		iter = lub_list_iterator_next(iter)) {
		irq_t *irq;
		cpu_t *cpu;
		cpumask_t possible_cpus;

		irq = (irq_t *)lub_list_node__get_data(iter);
		/* Try to find local CPU to move IRQ to.
		   The local CPU is CPU with native NUMA node. */
		/* Possible CPUs is local CPUs minus exclude-CPUs.
		   possible_cpus = local_cpus & ~exclude_cpus */
		cpus_init(possible_cpus);
		cpus_copy(possible_cpus, *exclude_cpus);
		cpus_complement(possible_cpus, possible_cpus);
		cpus_and(possible_cpus, possible_cpus, irq->local_cpus);
		cpu = choose_cpu(cpus, &possible_cpus, load_limit);
		cpus_free(possible_cpus);
		/* If local CPU is not found then try to use
		   CPU from another NUMA node. It's better then
		   overloaded CPUs. */
		/* Non-local CPUs were disabled. It seems there is
		   no advantages to use them. The all interactions will
		   be held by QPI-like interfaces through local CPUs. */
/*                if (!cpu) {
                        cpumask_t complement;
                        cpus_init(complement);
                        cpus_complement(complement, irq->local_cpus);
                        cpu = choose_cpu(cpus, &complement, load_limit);
                        cpus_free(complement);
                }
*/
		if (cpu) {
			if (irq->cpu)
				printf("Move IRQ %u from CPU%u to CPU%u\n",
					irq->irq, irq->cpu->id, cpu->id);
			else
				printf("Move IRQ %u to CPU%u\n", irq->irq, cpu->id);
			move_irq_to_cpu(irq, cpu);
		}
	}

	return 0;
}
203

    
204
/* Push the chosen CPU assignment of every IRQ in the list down to the
 * kernel by writing its CPU's cpumask to /proc (via irq_set_affinity).
 * IRQs without an assigned CPU are skipped. Always returns 0.
 */
int apply_affinity(lub_list_t *balance_irqs)
{
	lub_list_node_t *node;

	for (node = lub_list_iterator_init(balance_irqs); node;
		node = lub_list_iterator_next(node)) {
		irq_t *irq = (irq_t *)lub_list_node__get_data(node);
		/* Nothing to apply for an unassigned IRQ */
		if (irq->cpu)
			irq_set_affinity(irq, &(irq->cpu->cpumask));
	}

	return 0;
}
218

    
219

    
220
/* Count the number of intr-not-null IRQs and minimal IRQ weight */
221
static int irq_list_info(lub_list_t *irqs, int *min_weight,
222
        unsigned int *irq_num, unsigned int *candidates_num)
223
{
224
        lub_list_node_t *iter;
225

    
226
        if (!irqs)
227
                return -1;
228

    
229
        if (min_weight)
230
                *min_weight = -1;
231
        if (irq_num)
232
                *irq_num = 0;
233
        if (candidates_num)
234
                *candidates_num = 0;
235
        for (iter = lub_list_iterator_init(irqs); iter;
236
                iter = lub_list_iterator_next(iter)) {
237
                irq_t *irq = (irq_t *)lub_list_node__get_data(iter);
238
                if (irq->intr == 0)
239
                        continue;
240
                if (min_weight) {
241
                        if ((*min_weight < 0) || (irq->weight < *min_weight))
242
                                *min_weight = irq->weight;
243
                }
244
                if (irq_num)
245
                        *irq_num += 1;
246
                if (irq->weight)
247
                        continue;
248
                if (candidates_num)
249
                        *candidates_num += 1;
250
        }
251

    
252
        return 0;
253
}
254

    
255
/* Search for the most overloaded CPU: the highest-loaded CPU whose
 * load exceeds the threshold and that has movable IRQs to give away.
 * NOTE(review): this function has a side effect — for every CPU that
 * passes the load checks it calls dec_weight() with that CPU's minimal
 * IRQ weight, normalizing weights so at least one IRQ becomes a
 * zero-weight move candidate. This happens even for CPUs that are not
 * ultimately returned.
 * Returns the chosen CPU or NULL when no CPU qualifies.
 */
static cpu_t * most_overloaded_cpu(lub_list_t *cpus, float threshold)
{
	lub_list_node_t *iter;
	cpu_t *overloaded_cpu = NULL;
	float max_load = 0.0;

	/* Search for the most overloaded CPU.
	   The load must be greater than threshold. */
	for (iter = lub_list_iterator_init(cpus); iter;
		iter = lub_list_iterator_next(iter)) {
		cpu_t *cpu = (cpu_t *)lub_list_node__get_data(iter);
		int min_weight = -1;
		unsigned int irq_num = 0;

		if (cpu->load < threshold)
			continue;
		/* Keep only the highest load seen so far */
		if (cpu->load <= max_load)
			continue;

		/* Don't move last IRQ */
		if (lub_list_len(cpu->irqs) <= 1)
			continue;

		irq_list_info(cpu->irqs, &min_weight, &irq_num, NULL);
		/* All IRQs has intr=0 */
		if (irq_num == 0)
			continue;
		/* Normalize weights so the lightest IRQ reaches zero
		   and becomes movable. */
		if (min_weight > 0)
			dec_weight(cpu, min_weight);

		/* Ok, it's good CPU to try to free it */
		max_load = cpu->load;
		overloaded_cpu = cpu;
	}

	return overloaded_cpu;
}
293

    
294
/* Search for the overloaded CPUs and then choose best IRQ for moving to
   another CPU. The best IRQ is IRQ with maximum number of interrupts.
   The IRQs with small number of interrupts have very low load or very
   high load (in a case of NAPI).
   Stage 1 queues every active IRQ currently sitting on an excluded CPU.
   Stage 2 picks at most ONE IRQ from the single most overloaded CPU,
   selected by strategy: max-intr, min-intr, or a random zero-weight
   candidate. Selected IRQs are appended to balance_irqs; the chosen
   IRQ gets weight=1 so it is not moved again on the next iteration.
   Always returns 0. */
int choose_irqs_to_move(lub_list_t *cpus, lub_list_t *balance_irqs,
	float threshold, birq_choose_strategy_e strategy,
	cpumask_t *exclude_cpus)
{
	lub_list_node_t *iter;
	cpu_t *overloaded_cpu = NULL;
	irq_t *irq_to_move = NULL;
	unsigned long long max_intr = 0;
	unsigned long long min_intr = (unsigned long long)(-1);
	unsigned int choose = 0;
	unsigned int current = 0;

	/* Stage 1: Try to move active IRQs from excluded-CPUs */

	if (!cpus_empty(*exclude_cpus)) {
		/* Iterate CPU list and find excluded ones */
		for (iter = lub_list_iterator_init(cpus); iter;
			iter = lub_list_iterator_next(iter)) {
			lub_list_node_t *iter2;
			cpu_t *cpu = (cpu_t *)lub_list_node__get_data(iter);
			if (!cpu_isset(cpu->id, *exclude_cpus))
				continue;
			/* Move all active IRQs to another CPUs */
			for (iter2 = lub_list_iterator_init(cpu->irqs); iter2;
				iter2 = lub_list_iterator_next(iter2)) {
				irq_t *irq = (irq_t *)lub_list_node__get_data(iter2);
				if (irq->intr == 0)
					continue;
				lub_list_add(balance_irqs, irq);
			}
		}
	}

	/* Stage 2: Move IRQs from overloaded CPUs */

	/* Search for overloaded CPUs */
	if (!(overloaded_cpu = most_overloaded_cpu(cpus, threshold)))
		return 0;

	if (strategy == BIRQ_CHOOSE_RND) {
		/* Pick a uniformly random index among the zero-weight,
		   active IRQs of the overloaded CPU. */
		unsigned int candidates = 0;
		irq_list_info(overloaded_cpu->irqs, NULL, NULL, &candidates);
		if (candidates == 0)
			return 0;
		choose = rand() % candidates;
	}

	/* Search for the IRQ (owned by overloaded CPU) with
	   maximum/minimum number of interrupts. */
	for (iter = lub_list_iterator_init(overloaded_cpu->irqs); iter;
		iter = lub_list_iterator_next(iter)) {
		irq_t *irq = (irq_t *)lub_list_node__get_data(iter);
		/* Don't move any IRQs with intr=0. It can be unused IRQ. In
		   this case the moving is not needed. It can be overloaded
		   (by NAPI) IRQs. In this case it will be not moved anyway. */
		if (irq->intr == 0)
			continue;
		/* Non-zero weight = recently moved; leave it alone */
		if (irq->weight)
			continue;
		if (strategy == BIRQ_CHOOSE_MAX) {
			/* Get IRQ with max intr */
			if (irq->intr > max_intr) {
				max_intr = irq->intr;
				irq_to_move = irq;
			}
		} else if (strategy == BIRQ_CHOOSE_MIN) {
			/* Get IRQ with min intr */
			if (irq->intr < min_intr) {
				min_intr = irq->intr;
				irq_to_move = irq;
			}
		} else if (strategy == BIRQ_CHOOSE_RND) {
			if (current == choose) {
				irq_to_move = irq;
				break;
			}
		}
		current++;
	}

	if (irq_to_move) {
		/* Don't move this IRQ while next iteration. */
		irq_to_move->weight = 1;
		lub_list_add(balance_irqs, irq_to_move);
	}

	return 0;
}