diff options
-rw-r--r-- | drivers/char/random.c | 1515 | ||||
-rw-r--r-- | include/linux/sysctl.h | 217 | ||||
-rw-r--r-- | include/trace/events/random.h | 134 | ||||
-rw-r--r-- | kernel/sysctl.c | 2671 |
4 files changed, 4537 insertions, 0 deletions
diff --git a/drivers/char/random.c b/drivers/char/random.c new file mode 100644 index 0000000..0d91fe5 --- /dev/null +++ b/drivers/char/random.c @@ -0,0 +1,1515 @@ +/* + * random.c -- A strong random number generator + * + * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005 + * + * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All + * rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * (now, with legal B.S. out of the way.....) + * + * This routine gathers environmental noise from device drivers, etc., + * and returns good random numbers, suitable for cryptographic use. + * Besides the obvious cryptographic uses, these numbers are also good + * for seeding TCP sequence numbers, and other places where it is + * desirable to have numbers which are not only random, but hard to + * predict by an attacker. + * + * Theory of operation + * =================== + * + * Computers are very predictable devices. Hence it is extremely hard + * to produce truly random numbers on a computer --- as opposed to + * pseudo-random numbers, which can easily generated by using a + * algorithm. Unfortunately, it is very easy for attackers to guess + * the sequence of pseudo-random number generators, and for some + * applications this is not acceptable. So instead, we must try to + * gather "environmental noise" from the computer's environment, which + * must be hard for outside attackers to observe, and use that to + * generate random numbers. In a Unix environment, this is best done + * from inside the kernel. + * + * Sources of randomness from the environment include inter-keyboard + * timings, inter-interrupt timings from some interrupts, and other + * events which are both (a) non-deterministic and (b) hard for an + * outside observer to measure. Randomness from these sources are + * added to an "entropy pool", which is mixed using a CRC-like function. + * This is not cryptographically strong, but it is adequate assuming + * the randomness is not chosen maliciously, and it is fast enough that + * the overhead of doing it on every interrupt is very reasonable. + * As random bytes are mixed into the entropy pool, the routines keep + * an *estimate* of how many bits of randomness have been stored into + * the random number generator's internal state. + * + * When random bytes are desired, they are obtained by taking the SHA + * hash of the contents of the "entropy pool". The SHA hash avoids + * exposing the internal state of the entropy pool. It is believed to + * be computationally infeasible to derive any useful information + * about the input of SHA from its output. Even if it is possible to + * analyze SHA in some clever way, as long as the amount of data + * returned from the generator is less than the inherent entropy in + * the pool, the output data is totally unpredictable. For this + * reason, the routine decreases its internal estimate of how many + * bits of "true randomness" are contained in the entropy pool as it + * outputs random numbers. + * + * If this estimate goes to zero, the routine can still generate + * random numbers; however, an attacker may (at least in theory) be + * able to infer the future output of the generator from prior + * outputs. This requires successful cryptanalysis of SHA, which is + * not believed to be feasible, but there is a remote possibility. + * Nonetheless, these numbers should be useful for the vast majority + * of purposes. + * + * Exported interfaces ---- output + * =============================== + * + * There are three exported interfaces; the first is one designed to + * be used from within the kernel: + * + * void get_random_bytes(void *buf, int nbytes); + * + * This interface will return the requested number of random bytes, + * and place it in the requested buffer. + * + * The two other interfaces are two character devices /dev/random and + * /dev/urandom. /dev/random is suitable for use when very high + * quality randomness is desired (for example, for key generation or + * one-time pads), as it will only return a maximum of the number of + * bits of randomness (as estimated by the random number generator) + * contained in the entropy pool. + * + * The /dev/urandom device does not have this limit, and will return + * as many bytes as are requested. As more and more random bytes are + * requested without giving time for the entropy pool to recharge, + * this will result in random numbers that are merely cryptographically + * strong. For many applications, however, this is acceptable. + * + * Exported interfaces ---- input + * ============================== + * + * The current exported interfaces for gathering environmental noise + * from the devices are: + * + * void add_device_randomness(const void *buf, unsigned int size); + * void add_input_randomness(unsigned int type, unsigned int code, + * unsigned int value); + * void add_interrupt_randomness(int irq, int irq_flags); + * void add_disk_randomness(struct gendisk *disk); + * + * add_device_randomness() is for adding data to the random pool that + * is likely to differ between two devices (or possibly even per boot). + * This would be things like MAC addresses or serial numbers, or the + * read-out of the RTC. This does *not* add any actual entropy to the + * pool, but it initializes the pool to different values for devices + * that might otherwise be identical and have very little entropy + * available to them (particularly common in the embedded world). + * + * add_input_randomness() uses the input layer interrupt timing, as well as + * the event type information from the hardware. + * + * add_interrupt_randomness() uses the interrupt timing as random + * inputs to the entropy pool. Using the cycle counters and the irq source + * as inputs, it feeds the randomness roughly once a second. + * + * add_disk_randomness() uses what amounts to the seek time of block + * layer request events, on a per-disk_devt basis, as input to the + * entropy pool. Note that high-speed solid state drives with very low + * seek times do not make for good sources of entropy, as their seek + * times are usually fairly consistent. + * + * All of these routines try to estimate how many bits of randomness a + * particular randomness source. They do this by keeping track of the + * first and second order deltas of the event timings. + * + * Ensuring unpredictability at system startup + * ============================================ + * + * When any operating system starts up, it will go through a sequence + * of actions that are fairly predictable by an adversary, especially + * if the start-up does not involve interaction with a human operator. + * This reduces the actual number of bits of unpredictability in the + * entropy pool below the value in entropy_count. In order to + * counteract this effect, it helps to carry information in the + * entropy pool across shut-downs and start-ups. To do this, put the + * following lines an appropriate script which is run during the boot + * sequence: + * + * echo "Initializing random number generator..." + * random_seed=/var/run/random-seed + * # Carry a random seed from start-up to start-up + * # Load and then save the whole entropy pool + * if [ -f $random_seed ]; then + * cat $random_seed >/dev/urandom + * else + * touch $random_seed + * fi + * chmod 600 $random_seed + * dd if=/dev/urandom of=$random_seed count=1 bs=512 + * + * and the following lines in an appropriate script which is run as + * the system is shutdown: + * + * # Carry a random seed from shut-down to start-up + * # Save the whole entropy pool + * echo "Saving random seed..." + * random_seed=/var/run/random-seed + * touch $random_seed + * chmod 600 $random_seed + * dd if=/dev/urandom of=$random_seed count=1 bs=512 + * + * For example, on most modern systems using the System V init + * scripts, such code fragments would be found in + * /etc/rc.d/init.d/random. On older Linux systems, the correct script + * location might be in /etc/rcb.d/rc.local or /etc/rc.d/rc.0. + * + * Effectively, these commands cause the contents of the entropy pool + * to be saved at shut-down time and reloaded into the entropy pool at + * start-up. (The 'dd' in the addition to the bootup script is to + * make sure that /etc/random-seed is different for every start-up, + * even if the system crashes without executing rc.0.) Even with + * complete knowledge of the start-up activities, predicting the state + * of the entropy pool requires knowledge of the previous history of + * the system. + * + * Configuring the /dev/random driver under Linux + * ============================================== + * + * The /dev/random driver under Linux uses minor numbers 8 and 9 of + * the /dev/mem major number (#1). So if your system does not have + * /dev/random and /dev/urandom created already, they can be created + * by using the commands: + * + * mknod /dev/random c 1 8 + * mknod /dev/urandom c 1 9 + * + * Acknowledgements: + * ================= + * + * Ideas for constructing this random number generator were derived + * from Pretty Good Privacy's random number generator, and from private + * discussions with Phil Karn. Colin Plumb provided a faster random + * number generator, which speed up the mixing function of the entropy + * pool, taken from PGPfone. Dale Worley has also contributed many + * useful ideas and suggestions to improve this driver. + * + * Any flaws in the design are solely my responsibility, and should + * not be attributed to the Phil, Colin, or any of authors of PGP. + * + * Further background information on this topic may be obtained from + * RFC 1750, "Randomness Recommendations for Security", by Donald + * Eastlake, Steve Crocker, and Jeff Schiller. + */ + +#include <linux/utsname.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/string.h> +#include <linux/fcntl.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/poll.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/interrupt.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/percpu.h> +#include <linux/cryptohash.h> +#include <linux/fips.h> +#include <linux/ptrace.h> +#include <linux/kmemcheck.h> + +#ifdef CONFIG_GENERIC_HARDIRQS +# include <linux/irq.h> +#endif + +#include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/irq.h> +#include <asm/irq_regs.h> +#include <asm/io.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/random.h> + +/* + * Configuration information + */ +#define INPUT_POOL_WORDS 128 +#define OUTPUT_POOL_WORDS 32 +#define SEC_XFER_SIZE 512 +#define EXTRACT_SIZE 10 + +#define LONGS(x) (((x) + sizeof(unsigned long) - 1)/sizeof(unsigned long)) + +/* + * The minimum number of bits of entropy before we wake up a read on + * /dev/random. Should be enough to do a significant reseed. + */ +static int random_read_wakeup_thresh = 64; + +/* + * If the entropy count falls under this number of bits, then we + * should wake up processes which are selecting or polling on write + * access to /dev/random. + */ +static int random_write_wakeup_thresh = 128; + +/* + * When the input pool goes over trickle_thresh, start dropping most + * samples to avoid wasting CPU time and reduce lock contention. + */ + +static int trickle_thresh __read_mostly = INPUT_POOL_WORDS * 28; + +static DEFINE_PER_CPU(int, trickle_count); + +/* + * A pool of size .poolwords is stirred with a primitive polynomial + * of degree .poolwords over GF(2). The taps for various sizes are + * defined below. They are chosen to be evenly spaced (minimum RMS + * distance from evenly spaced; the numbers in the comments are a + * scaled squared error sum) except for the last tap, which is 1 to + * get the twisting happening as fast as possible. + */ +static struct poolinfo { + int poolwords; + int tap1, tap2, tap3, tap4, tap5; +} poolinfo_table[] = { + /* x^128 + x^103 + x^76 + x^51 +x^25 + x + 1 -- 105 */ + { 128, 103, 76, 51, 25, 1 }, + /* x^32 + x^26 + x^20 + x^14 + x^7 + x + 1 -- 15 */ + { 32, 26, 20, 14, 7, 1 }, +#if 0 + /* x^2048 + x^1638 + x^1231 + x^819 + x^411 + x + 1 -- 115 */ + { 2048, 1638, 1231, 819, 411, 1 }, + + /* x^1024 + x^817 + x^615 + x^412 + x^204 + x + 1 -- 290 */ + { 1024, 817, 615, 412, 204, 1 }, + + /* x^1024 + x^819 + x^616 + x^410 + x^207 + x^2 + 1 -- 115 */ + { 1024, 819, 616, 410, 207, 2 }, + + /* x^512 + x^411 + x^308 + x^208 + x^104 + x + 1 -- 225 */ + { 512, 411, 308, 208, 104, 1 }, + + /* x^512 + x^409 + x^307 + x^206 + x^102 + x^2 + 1 -- 95 */ + { 512, 409, 307, 206, 102, 2 }, + /* x^512 + x^409 + x^309 + x^205 + x^103 + x^2 + 1 -- 95 */ + { 512, 409, 309, 205, 103, 2 }, + + /* x^256 + x^205 + x^155 + x^101 + x^52 + x + 1 -- 125 */ + { 256, 205, 155, 101, 52, 1 }, + + /* x^128 + x^103 + x^78 + x^51 + x^27 + x^2 + 1 -- 70 */ + { 128, 103, 78, 51, 27, 2 }, + + /* x^64 + x^52 + x^39 + x^26 + x^14 + x + 1 -- 15 */ + { 64, 52, 39, 26, 14, 1 }, +#endif +}; + +#define POOLBITS poolwords*32 +#define POOLBYTES poolwords*4 + +/* + * For the purposes of better mixing, we use the CRC-32 polynomial as + * well to make a twisted Generalized Feedback Shift Reigster + * + * (See M. Matsumoto & Y. Kurita, 1992. Twisted GFSR generators. ACM + * Transactions on Modeling and Computer Simulation 2(3):179-194. + * Also see M. Matsumoto & Y. Kurita, 1994. Twisted GFSR generators + * II. ACM Transactions on Mdeling and Computer Simulation 4:254-266) + * + * Thanks to Colin Plumb for suggesting this. + * + * We have not analyzed the resultant polynomial to prove it primitive; + * in fact it almost certainly isn't. Nonetheless, the irreducible factors + * of a random large-degree polynomial over GF(2) are more than large enough + * that periodicity is not a concern. + * + * The input hash is much less sensitive than the output hash. All + * that we want of it is that it be a good non-cryptographic hash; + * i.e. it not produce collisions when fed "random" data of the sort + * we expect to see. As long as the pool state differs for different + * inputs, we have preserved the input entropy and done a good job. + * The fact that an intelligent attacker can construct inputs that + * will produce controlled alterations to the pool's state is not + * important because we don't consider such inputs to contribute any + * randomness. The only property we need with respect to them is that + * the attacker can't increase his/her knowledge of the pool's state. + * Since all additions are reversible (knowing the final state and the + * input, you can reconstruct the initial state), if an attacker has + * any uncertainty about the initial state, he/she can only shuffle + * that uncertainty about, but never cause any collisions (which would + * decrease the uncertainty). + * + * The chosen system lets the state of the pool be (essentially) the input + * modulo the generator polymnomial. Now, for random primitive polynomials, + * this is a universal class of hash functions, meaning that the chance + * of a collision is limited by the attacker's knowledge of the generator + * polynomail, so if it is chosen at random, an attacker can never force + * a collision. Here, we use a fixed polynomial, but we *can* assume that + * ###--> it is unknown to the processes generating the input entropy. <-### + * Because of this important property, this is a good, collision-resistant + * hash; hash collisions will occur no more often than chance. + */ + +/* + * Static global variables + */ +static DECLARE_WAIT_QUEUE_HEAD(random_read_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); +static struct fasync_struct *fasync; + +static bool debug; +module_param(debug, bool, 0644); +#define DEBUG_ENT(fmt, arg...) do { \ + if (debug) \ + printk(KERN_DEBUG "random %04d %04d %04d: " \ + fmt,\ + input_pool.entropy_count,\ + blocking_pool.entropy_count,\ + nonblocking_pool.entropy_count,\ + ## arg); } while (0) + +/********************************************************************** + * + * OS independent entropy store. Here are the functions which handle + * storing entropy in an entropy pool. + * + **********************************************************************/ + +struct entropy_store; +struct entropy_store { + /* read-only data: */ + struct poolinfo *poolinfo; + __u32 *pool; + const char *name; + struct entropy_store *pull; + int limit; + + /* read-write data: */ + spinlock_t lock; + unsigned add_ptr; + unsigned input_rotate; + int entropy_count; + int entropy_total; + unsigned int initialized:1; + bool last_data_init; + __u8 last_data[EXTRACT_SIZE]; +}; + +static __u32 input_pool_data[INPUT_POOL_WORDS]; +static __u32 blocking_pool_data[OUTPUT_POOL_WORDS]; +static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS]; + +static struct entropy_store input_pool = { + .poolinfo = &poolinfo_table[0], + .name = "input", + .limit = 1, + .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock), + .pool = input_pool_data +}; + +static struct entropy_store blocking_pool = { + .poolinfo = &poolinfo_table[1], + .name = "blocking", + .limit = 1, + .pull = &input_pool, + .lock = __SPIN_LOCK_UNLOCKED(blocking_pool.lock), + .pool = blocking_pool_data +}; + +static struct entropy_store nonblocking_pool = { + .poolinfo = &poolinfo_table[1], + .name = "nonblocking", + .pull = &input_pool, + .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock), + .pool = nonblocking_pool_data +}; + +static __u32 const twist_table[8] = { + 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, + 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 }; + +/* + * This function adds bytes into the entropy "pool". It does not + * update the entropy estimate. The caller should call + * credit_entropy_bits if this is appropriate. + * + * The pool is stirred with a primitive polynomial of the appropriate + * degree, and then twisted. We twist by three bits at a time because + * it's cheap to do so and helps slightly in the expected case where + * the entropy is concentrated in the low-order bits. + */ +static void _mix_pool_bytes(struct entropy_store *r, const void *in, + int nbytes, __u8 out[64]) +{ + unsigned long i, j, tap1, tap2, tap3, tap4, tap5; + int input_rotate; + int wordmask = r->poolinfo->poolwords - 1; + const char *bytes = in; + __u32 w; + + tap1 = r->poolinfo->tap1; + tap2 = r->poolinfo->tap2; + tap3 = r->poolinfo->tap3; + tap4 = r->poolinfo->tap4; + tap5 = r->poolinfo->tap5; + + smp_rmb(); + input_rotate = ACCESS_ONCE(r->input_rotate); + i = ACCESS_ONCE(r->add_ptr); + + /* mix one byte at a time to simplify size handling and churn faster */ + while (nbytes--) { + w = rol32(*bytes++, input_rotate & 31); + i = (i - 1) & wordmask; + + /* XOR in the various taps */ + w ^= r->pool[i]; + w ^= r->pool[(i + tap1) & wordmask]; + w ^= r->pool[(i + tap2) & wordmask]; + w ^= r->pool[(i + tap3) & wordmask]; + w ^= r->pool[(i + tap4) & wordmask]; + w ^= r->pool[(i + tap5) & wordmask]; + + /* Mix the result back in with a twist */ + r->pool[i] = (w >> 3) ^ twist_table[w & 7]; + + /* + * Normally, we add 7 bits of rotation to the pool. + * At the beginning of the pool, add an extra 7 bits + * rotation, so that successive passes spread the + * input bits across the pool evenly. + */ + input_rotate += i ? 7 : 14; + } + + ACCESS_ONCE(r->input_rotate) = input_rotate; + ACCESS_ONCE(r->add_ptr) = i; + smp_wmb(); + + if (out) + for (j = 0; j < 16; j++) + ((__u32 *)out)[j] = r->pool[(i - j) & wordmask]; +} + +static void __mix_pool_bytes(struct entropy_store *r, const void *in, + int nbytes, __u8 out[64]) +{ + trace_mix_pool_bytes_nolock(r->name, nbytes, _RET_IP_); + _mix_pool_bytes(r, in, nbytes, out); +} + +static void mix_pool_bytes(struct entropy_store *r, const void *in, + int nbytes, __u8 out[64]) +{ + unsigned long flags; + + trace_mix_pool_bytes(r->name, nbytes, _RET_IP_); + spin_lock_irqsave(&r->lock, flags); + _mix_pool_bytes(r, in, nbytes, out); + spin_unlock_irqrestore(&r->lock, flags); +} + +struct fast_pool { + __u32 pool[4]; + unsigned long last; + unsigned short count; + unsigned char rotate; + unsigned char last_timer_intr; +}; + +/* + * This is a fast mixing routine used by the interrupt randomness + * collector. It's hardcoded for an 128 bit pool and assumes that any + * locks that might be needed are taken by the caller. + */ +static void fast_mix(struct fast_pool *f, const void *in, int nbytes) +{ + const char *bytes = in; + __u32 w; + unsigned i = f->count; + unsigned input_rotate = f->rotate; + + while (nbytes--) { + w = rol32(*bytes++, input_rotate & 31) ^ f->pool[i & 3] ^ + f->pool[(i + 1) & 3]; + f->pool[i & 3] = (w >> 3) ^ twist_table[w & 7]; + input_rotate += (i++ & 3) ? 7 : 14; + } + f->count = i; + f->rotate = input_rotate; +} + +/* + * Credit (or debit) the entropy store with n bits of entropy + */ +static void credit_entropy_bits(struct entropy_store *r, int nbits) +{ + int entropy_count, orig; + + if (!nbits) + return; + + DEBUG_ENT("added %d entropy credits to %s\n", nbits, r->name); +retry: + entropy_count = orig = ACCESS_ONCE(r->entropy_count); + entropy_count += nbits; + + if (entropy_count < 0) { + DEBUG_ENT("negative entropy/overflow\n"); + entropy_count = 0; + } else if (entropy_count > r->poolinfo->POOLBITS) + entropy_count = r->poolinfo->POOLBITS; + if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig) + goto retry; + + if (!r->initialized && nbits > 0) { + r->entropy_total += nbits; + if (r->entropy_total > 128) + r->initialized = 1; + } + + trace_credit_entropy_bits(r->name, nbits, entropy_count, + r->entropy_total, _RET_IP_); + + /* should we wake readers? */ + if (r == &input_pool && entropy_count >= random_read_wakeup_thresh) { + wake_up_interruptible(&random_read_wait); + kill_fasync(&fasync, SIGIO, POLL_IN); + } +} + +/********************************************************************* + * + * Entropy input management + * + *********************************************************************/ + +/* There is one of these per entropy source */ +struct timer_rand_state { + cycles_t last_time; + long last_delta, last_delta2; + unsigned dont_count_entropy:1; +}; + +/* + * Add device- or boot-specific data to the input and nonblocking + * pools to help initialize them to unique values. + * + * None of this adds any entropy, it is meant to avoid the + * problem of the nonblocking pool having similar initial state + * across largely identical devices. + */ +void add_device_randomness(const void *buf, unsigned int size) +{ + unsigned long time = get_cycles() ^ jiffies; + + mix_pool_bytes(&input_pool, buf, size, NULL); + mix_pool_bytes(&input_pool, &time, sizeof(time), NULL); + mix_pool_bytes(&nonblocking_pool, buf, size, NULL); + mix_pool_bytes(&nonblocking_pool, &time, sizeof(time), NULL); +} +EXPORT_SYMBOL(add_device_randomness); + +static struct timer_rand_state input_timer_state; + +/* + * This function adds entropy to the entropy "pool" by using timing + * delays. It uses the timer_rand_state structure to make an estimate + * of how many bits of entropy this call has added to the pool. + * + * The number "num" is also added to the pool - it should somehow describe + * the type of event which just happened. This is currently 0-255 for + * keyboard scan codes, and 256 upwards for interrupts. + * + */ +static void add_timer_randomness(struct timer_rand_state *state, unsigned num) +{ + struct { + long jiffies; + unsigned cycles; + unsigned num; + } sample; + long delta, delta2, delta3; + + preempt_disable(); + /* if over the trickle threshold, use only 1 in 4096 samples */ + if (input_pool.entropy_count > trickle_thresh && + ((__this_cpu_inc_return(trickle_count) - 1) & 0xfff)) + goto out; + + sample.jiffies = jiffies; + sample.cycles = get_cycles(); + sample.num = num; + mix_pool_bytes(&input_pool, &sample, sizeof(sample), NULL); + + /* + * Calculate number of bits of randomness we probably added. + * We take into account the first, second and third-order deltas + * in order to make our estimate. + */ + + if (!state->dont_count_entropy) { + delta = sample.jiffies - state->last_time; + state->last_time = sample.jiffies; + + delta2 = delta - state->last_delta; + state->last_delta = delta; + + delta3 = delta2 - state->last_delta2; + state->last_delta2 = delta2; + + if (delta < 0) + delta = -delta; + if (delta2 < 0) + delta2 = -delta2; + if (delta3 < 0) + delta3 = -delta3; + if (delta > delta2) + delta = delta2; + if (delta > delta3) + delta = delta3; + + /* + * delta is now minimum absolute delta. + * Round down by 1 bit on general principles, + * and limit entropy entimate to 12 bits. + */ + credit_entropy_bits(&input_pool, + min_t(int, fls(delta>>1), 11)); + } +out: + preempt_enable(); +} + +void add_input_randomness(unsigned int type, unsigned int code, + unsigned int value) +{ + static unsigned char last_value; + + /* ignore autorepeat and the like */ + if (value == last_value) + return; + + DEBUG_ENT("input event\n"); + last_value = value; + add_timer_randomness(&input_timer_state, + (type << 4) ^ code ^ (code >> 4) ^ value); +} +EXPORT_SYMBOL_GPL(add_input_randomness); + +static DEFINE_PER_CPU(struct fast_pool, irq_randomness); + +void add_interrupt_randomness(int irq, int irq_flags) +{ + struct entropy_store *r; + struct fast_pool *fast_pool = &__get_cpu_var(irq_randomness); + struct pt_regs *regs = get_irq_regs(); + unsigned long now = jiffies; + __u32 input[4], cycles = get_cycles(); + + input[0] = cycles ^ jiffies; + input[1] = irq; + if (regs) { + __u64 ip = instruction_pointer(regs); + input[2] = ip; + input[3] = ip >> 32; + } + + fast_mix(fast_pool, input, sizeof(input)); + + if ((fast_pool->count & 1023) && + !time_after(now, fast_pool->last + HZ)) + return; + + fast_pool->last = now; + + r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool; + __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool), NULL); + /* + * If we don't have a valid cycle counter, and we see + * back-to-back timer interrupts, then skip giving credit for + * any entropy. + */ + if (cycles == 0) { + if (irq_flags & __IRQF_TIMER) { + if (fast_pool->last_timer_intr) + return; + fast_pool->last_timer_intr = 1; + } else + fast_pool->last_timer_intr = 0; + } + credit_entropy_bits(r, 1); +} + +#ifdef CONFIG_BLOCK +void add_disk_randomness(struct gendisk *disk) +{ + if (!disk || !disk->random) + return; + /* first major is 1, so we get >= 0x200 here */ + DEBUG_ENT("disk event %d:%d\n", + MAJOR(disk_devt(disk)), MINOR(disk_devt(disk))); + + add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); +} +#endif + +/********************************************************************* + * + * Entropy extraction routines + * + *********************************************************************/ + +static ssize_t extract_entropy(struct entropy_store *r, void *buf, + size_t nbytes, int min, int rsvd); + +/* + * This utility inline function is responsible for transferring entropy + * from the primary pool to the secondary extraction pool. We make + * sure we pull enough for a 'catastrophic reseed'. + */ +static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) +{ + __u32 tmp[OUTPUT_POOL_WORDS]; + + if (r->pull && r->entropy_count < nbytes * 8 && + r->entropy_count < r->poolinfo->POOLBITS) { + /* If we're limited, always leave two wakeup worth's BITS */ + int rsvd = r->limit ? 0 : random_read_wakeup_thresh/4; + int bytes = nbytes; + + /* pull at least as many as BYTES as wakeup BITS */ + bytes = max_t(int, bytes, random_read_wakeup_thresh / 8); + /* but never more than the buffer size */ + bytes = min_t(int, bytes, sizeof(tmp)); + + DEBUG_ENT("going to reseed %s with %d bits " + "(%zu of %d requested)\n", + r->name, bytes * 8, nbytes * 8, r->entropy_count); + + bytes = extract_entropy(r->pull, tmp, bytes, + random_read_wakeup_thresh / 8, rsvd); + mix_pool_bytes(r, tmp, bytes, NULL); + credit_entropy_bits(r, bytes*8); + } +} + +/* + * These functions extracts randomness from the "entropy pool", and + * returns it in a buffer. + * + * The min parameter specifies the minimum amount we can pull before + * failing to avoid races that defeat catastrophic reseeding while the + * reserved parameter indicates how much entropy we must leave in the + * pool after each pull to avoid starving other readers. + * + * Note: extract_entropy() assumes that .poolwords is a multiple of 16 words. + */ + +static size_t account(struct entropy_store *r, size_t nbytes, int min, + int reserved) +{ + unsigned long flags; + int wakeup_write = 0; + + /* Hold lock while accounting */ + spin_lock_irqsave(&r->lock, flags); + + BUG_ON(r->entropy_count > r->poolinfo->POOLBITS); + DEBUG_ENT("trying to extract %zu bits from %s\n", + nbytes * 8, r->name); + + /* Can we pull enough? */ + if (r->entropy_count / 8 < min + reserved) { + nbytes = 0; + } else { + int entropy_count, orig; +retry: + entropy_count = orig = ACCESS_ONCE(r->entropy_count); + /* If limited, never pull more than available */ + if (r->limit && nbytes + reserved >= entropy_count / 8) + nbytes = entropy_count/8 - reserved; + + if (entropy_count / 8 >= nbytes + reserved) { + entropy_count -= nbytes*8; + if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig) + goto retry; + } else { + entropy_count = reserved; + if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig) + goto retry; + } + + if (entropy_count < random_write_wakeup_thresh) + wakeup_write = 1; + } + + DEBUG_ENT("debiting %zu entropy credits from %s%s\n", + nbytes * 8, r->name, r->limit ? "" : " (unlimited)"); + + spin_unlock_irqrestore(&r->lock, flags); + + if (wakeup_write) { + wake_up_interruptible(&random_write_wait); + kill_fasync(&fasync, SIGIO, POLL_OUT); + } + + return nbytes; +} + +static void extract_buf(struct entropy_store *r, __u8 *out) +{ + int i; + union { + __u32 w[5]; + unsigned long l[LONGS(EXTRACT_SIZE)]; + } hash; + __u32 workspace[SHA_WORKSPACE_WORDS]; + __u8 extract[64]; + unsigned long flags; + + /* Generate a hash across the pool, 16 words (512 bits) at a time */ + sha_init(hash.w); + spin_lock_irqsave(&r->lock, flags); + for (i = 0; i < r->poolinfo->poolwords; i += 16) + sha_transform(hash.w, (__u8 *)(r->pool + i), workspace); + + /* + * We mix the hash back into the pool to prevent backtracking + * attacks (where the attacker knows the state of the pool + * plus the current outputs, and attempts to find previous + * ouputs), unless the hash function can be inverted. By + * mixing at least a SHA1 worth of hash data back, we make + * brute-forcing the feedback as hard as brute-forcing the + * hash. + */ + __mix_pool_bytes(r, hash.w, sizeof(hash.w), extract); + spin_unlock_irqrestore(&r->lock, flags); + + /* + * To avoid duplicates, we atomically extract a portion of the + * pool while mixing, and hash one final time. + */ + sha_transform(hash.w, extract, workspace); + memset(extract, 0, sizeof(extract)); + memset(workspace, 0, sizeof(workspace)); + + /* + * In case the hash function has some recognizable output + * pattern, we fold it in half. Thus, we always feed back + * twice as much data as we output. + */ + hash.w[0] ^= hash.w[3]; + hash.w[1] ^= hash.w[4]; + hash.w[2] ^= rol32(hash.w[2], 16); + + /* + * If we have a architectural hardware random number + * generator, mix that in, too. + */ + for (i = 0; i < LONGS(EXTRACT_SIZE); i++) { + unsigned long v; + if (!arch_get_random_long(&v)) + break; + hash.l[i] ^= v; + } + + memcpy(out, &hash, EXTRACT_SIZE); + memset(&hash, 0, sizeof(hash)); +} + +static ssize_t extract_entropy(struct entropy_store *r, void *buf, + size_t nbytes, int min, int reserved) +{ + ssize_t ret = 0, i; + __u8 tmp[EXTRACT_SIZE]; + unsigned long flags; + + /* if last_data isn't primed, we need EXTRACT_SIZE extra bytes */ + if (fips_enabled) { + spin_lock_irqsave(&r->lock, flags); + if (!r->last_data_init) { + r->last_data_init = true; + spin_unlock_irqrestore(&r->lock, flags); + trace_extract_entropy(r->name, EXTRACT_SIZE, + r->entropy_count, _RET_IP_); + xfer_secondary_pool(r, EXTRACT_SIZE); + extract_buf(r, tmp); + spin_lock_irqsave(&r->lock, flags); + memcpy(r->last_data, tmp, EXTRACT_SIZE); + } + spin_unlock_irqrestore(&r->lock, flags); + } + + trace_extract_entropy(r->name, nbytes, r->entropy_count, _RET_IP_); + xfer_secondary_pool(r, nbytes); + nbytes = account(r, nbytes, min, reserved); + + while (nbytes) { + extract_buf(r, tmp); + + if (fips_enabled) { + spin_lock_irqsave(&r->lock, flags); + if (!memcmp(tmp, r->last_data, EXTRACT_SIZE)) + panic("Hardware RNG duplicated output!\n"); + memcpy(r->last_data, tmp, EXTRACT_SIZE); + spin_unlock_irqrestore(&r->lock, flags); + } + i = min_t(int, nbytes, EXTRACT_SIZE); + memcpy(buf, tmp, i); + nbytes -= i; + buf += i; + ret += i; + } + + /* Wipe data just returned from memory */ + memset(tmp, 0, sizeof(tmp)); + + return ret; +} + +static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, + size_t nbytes) +{ + ssize_t ret = 0, i; + __u8 tmp[EXTRACT_SIZE]; + + trace_extract_entropy_user(r->name, nbytes, r->entropy_count, _RET_IP_); + xfer_secondary_pool(r, nbytes); + nbytes = account(r, nbytes, 0, 0); + + while (nbytes) { + if (need_resched()) { + if (signal_pending(current)) { + if (ret == 0) + ret = -ERESTARTSYS; + break; + } + schedule(); + } + + extract_buf(r, tmp); + i = min_t(int, nbytes, EXTRACT_SIZE); + if (copy_to_user(buf, tmp, i)) { + ret = -EFAULT; + break; + } + + nbytes -= i; + buf += i; + ret += i; + } + + /* Wipe data just returned from memory */ + memset(tmp, 0, sizeof(tmp)); + + return ret; +} + +/* + * This function is the exported kernel interface. It returns some + * number of good random numbers, suitable for key generation, seeding + * TCP sequence numbers, etc. It does not use the hw random number + * generator, if available; use get_random_bytes_arch() for that. + */ +void get_random_bytes(void *buf, int nbytes) +{ + extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0); +} +EXPORT_SYMBOL(get_random_bytes); + +/* + * This function will use the architecture-specific hardware random + * number generator if it is available. The arch-specific hw RNG will + * almost certainly be faster than what we can do in software, but it + * is impossible to verify that it is implemented securely (as + * opposed, to, say, the AES encryption of a sequence number using a + * key known by the NSA). So it's useful if we need the speed, but + * only if we're willing to trust the hardware manufacturer not to + * have put in a back door. + */ +void get_random_bytes_arch(void *buf, int nbytes) +{ + char *p = buf; + + trace_get_random_bytes(nbytes, _RET_IP_); + while (nbytes) { + unsigned long v; + int chunk = min(nbytes, (int)sizeof(unsigned long)); + + if (!arch_get_random_long(&v)) + break; + + memcpy(p, &v, chunk); + p += chunk; + nbytes -= chunk; + } + + if (nbytes) + extract_entropy(&nonblocking_pool, p, nbytes, 0, 0); +} +EXPORT_SYMBOL(get_random_bytes_arch); + + +/* + * init_std_data - initialize pool with system data + * + * @r: pool to initialize + * + * This function clears the pool's entropy count and mixes some system + * data into the pool to prepare it for use. The pool is not cleared + * as that can only decrease the entropy in the pool. + */ +static void init_std_data(struct entropy_store *r) +{ + int i; + ktime_t now = ktime_get_real(); + unsigned long rv; + + r->entropy_count = 0; + r->entropy_total = 0; + r->last_data_init = false; + mix_pool_bytes(r, &now, sizeof(now), NULL); + for (i = r->poolinfo->POOLBYTES; i > 0; i -= sizeof(rv)) { + if (!arch_get_random_long(&rv)) + break; + mix_pool_bytes(r, &rv, sizeof(rv), NULL); + } + mix_pool_bytes(r, utsname(), sizeof(*(utsname())), NULL); +} + +/* + * Note that setup_arch() may call add_device_randomness() + * long before we get here. This allows seeding of the pools + * with some platform dependent data very early in the boot + * process. But it limits our options here. We must use + * statically allocated structures that already have all + * initializations complete at compile time. We should also + * take care not to overwrite the precious per platform data + * we were given. + */ +static int rand_initialize(void) +{ + init_std_data(&input_pool); + init_std_data(&blocking_pool); + init_std_data(&nonblocking_pool); + return 0; +} +module_init(rand_initialize); + +#ifdef CONFIG_BLOCK +void rand_initialize_disk(struct gendisk *disk) +{ + struct timer_rand_state *state; + + /* + * If kzalloc returns null, we just won't use that entropy + * source. + */ + state = kzalloc(sizeof(struct timer_rand_state), GFP_KERNEL); + if (state) + disk->random = state; +} +#endif + +static ssize_t +random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + ssize_t n, retval = 0, count = 0; + + if (nbytes == 0) + return 0; + + while (nbytes > 0) { + n = nbytes; + if (n > SEC_XFER_SIZE) + n = SEC_XFER_SIZE; + + DEBUG_ENT("reading %zu bits\n", n*8); + + n = extract_entropy_user(&blocking_pool, buf, n); + + if (n < 0) { + retval = n; + break; + } + + DEBUG_ENT("read got %zd bits (%zd still needed)\n", + n*8, (nbytes-n)*8); + + if (n == 0) { + if (file->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + break; + } + + DEBUG_ENT("sleeping?\n"); + + wait_event_interruptible(random_read_wait, + input_pool.entropy_count >= + random_read_wakeup_thresh); + + DEBUG_ENT("awake\n"); + + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + + continue; + } + + count += n; + buf += n; + nbytes -= n; + break; /* This break makes the device work */ + /* like a named pipe */ + } + + return (count ? count : retval); +} + +static ssize_t +urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + return extract_entropy_user(&nonblocking_pool, buf, nbytes); +} + +static unsigned int +random_poll(struct file *file, poll_table * wait) +{ + unsigned int mask; + + poll_wait(file, &random_read_wait, wait); + poll_wait(file, &random_write_wait, wait); + mask = 0; + if (input_pool.entropy_count >= random_read_wakeup_thresh) + mask |= POLLIN | POLLRDNORM; + if (input_pool.entropy_count < random_write_wakeup_thresh) + mask |= POLLOUT | POLLWRNORM; + return mask; +} + +static int +write_pool(struct entropy_store *r, const char __user *buffer, size_t count) +{ + size_t bytes; + __u32 buf[16]; + const char __user *p = buffer; + + while (count > 0) { + bytes = min(count, sizeof(buf)); + if (copy_from_user(&buf, p, bytes)) + return -EFAULT; + + count -= bytes; + p += bytes; + + mix_pool_bytes(r, buf, bytes, NULL); + cond_resched(); + } + + return 0; +} + +static ssize_t random_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + size_t ret; + + ret = write_pool(&blocking_pool, buffer, count); + if (ret) + return ret; + ret = write_pool(&nonblocking_pool, buffer, count); + if (ret) + return ret; + + return (ssize_t)count; +} + +static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + int size, ent_count; + int __user *p = (int __user *)arg; + int retval; + + switch (cmd) { + case RNDGETENTCNT: + /* inherently racy, no point locking */ + if (put_user(input_pool.entropy_count, p)) + return -EFAULT; + return 0; + case RNDADDTOENTCNT: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ent_count, p)) + return -EFAULT; + credit_entropy_bits(&input_pool, ent_count); + return 0; + case RNDADDENTROPY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ent_count, p++)) + return -EFAULT; + if (ent_count < 0) + return -EINVAL; + if (get_user(size, p++)) + return -EFAULT; + retval = write_pool(&input_pool, (const char __user *)p, + size); + if (retval < 0) + return retval; + credit_entropy_bits(&input_pool, ent_count); + return 0; + case RNDZAPENTCNT: + case RNDCLEARPOOL: + /* Clear the entropy pool counters. */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + rand_initialize(); + return 0; + default: + return -EINVAL; + } +} + +static int random_fasync(int fd, struct file *filp, int on) +{ + return fasync_helper(fd, filp, on, &fasync); +} + +const struct file_operations random_fops = { + .read = random_read, + .write = random_write, + .poll = random_poll, + .unlocked_ioctl = random_ioctl, + .fasync = random_fasync, + .llseek = noop_llseek, +}; + +const struct file_operations urandom_fops = { + .read = urandom_read, + .write = random_write, + .unlocked_ioctl = random_ioctl, + .fasync = random_fasync, + .llseek = noop_llseek, +}; + +/*************************************************************** + * Random UUID interface + * + * Used here for a Boot ID, but can be useful for other kernel + * drivers. + ***************************************************************/ + +/* + * Generate random UUID + */ +void generate_random_uuid(unsigned char uuid_out[16]) +{ + get_random_bytes(uuid_out, 16); + /* Set UUID version to 4 --- truly random generation */ + uuid_out[6] = (uuid_out[6] & 0x0F) | 0x40; + /* Set the UUID variant to DCE */ + uuid_out[8] = (uuid_out[8] & 0x3F) | 0x80; +} +EXPORT_SYMBOL(generate_random_uuid); + +/******************************************************************** + * + * Sysctl interface + * + ********************************************************************/ + +#ifdef CONFIG_SYSCTL + +#include <linux/sysctl.h> + +static int min_read_thresh = 8, min_write_thresh; +static int max_read_thresh = INPUT_POOL_WORDS * 32; +static int max_write_thresh = INPUT_POOL_WORDS * 32; +static char sysctl_bootid[16]; + +/* + * These functions is used to return both the bootid UUID, and random + * UUID. The difference is in whether table->data is NULL; if it is, + * then a new UUID is generated and returned to the user. + * + * If the user accesses this via the proc interface, it will be returned + * as an ASCII string in the standard UUID format. If accesses via the + * sysctl system call, it is returned as 16 bytes of binary data. + */ +static int proc_do_uuid(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table fake_table; + unsigned char buf[64], tmp_uuid[16], *uuid; + + uuid = table->data; + if (!uuid) { + uuid = tmp_uuid; + generate_random_uuid(uuid); + } else { + static DEFINE_SPINLOCK(bootid_spinlock); + + spin_lock(&bootid_spinlock); + if (!uuid[8]) + generate_random_uuid(uuid); + spin_unlock(&bootid_spinlock); + } + + sprintf(buf, "%pU", uuid); + + fake_table.data = buf; + fake_table.maxlen = sizeof(buf); + + return proc_dostring(&fake_table, write, buffer, lenp, ppos); +} + +static int sysctl_poolsize = INPUT_POOL_WORDS * 32; +extern struct ctl_table random_table[]; +struct ctl_table random_table[] = { + { + .procname = "poolsize", + .data = &sysctl_poolsize, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "entropy_avail", + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + .data = &input_pool.entropy_count, + }, + { + .procname = "read_wakeup_threshold", + .data = &random_read_wakeup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_read_thresh, + .extra2 = &max_read_thresh, + }, + { + .procname = "write_wakeup_threshold", + .data = &random_write_wakeup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_write_thresh, + .extra2 = &max_write_thresh, + }, + { + .procname = "boot_id", + .data = &sysctl_bootid, + .maxlen = 16, + .mode = 0444, + .proc_handler = proc_do_uuid, + }, + { + .procname = "uuid", + .maxlen = 16, + .mode = 0444, + .proc_handler = proc_do_uuid, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static u32 random_int_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; + +static int __init random_int_secret_init(void) +{ + get_random_bytes(random_int_secret, sizeof(random_int_secret)); + return 0; +} +late_initcall(random_int_secret_init); + +/* + * Get a random word for internal kernel use only. Similar to urandom but + * with the goal of minimal entropy pool depletion. As a result, the random + * value is not cryptographically secure but for several uses the cost of + * depleting entropy is too high + */ +static DEFINE_PER_CPU(__u32 [MD5_DIGEST_WORDS], get_random_int_hash); +unsigned int get_random_int(void) +{ + __u32 *hash; + unsigned int ret; + + if (arch_get_random_int(&ret)) + return ret; + + hash = get_cpu_var(get_random_int_hash); + + hash[0] += current->pid + jiffies + get_cycles(); + md5_transform(hash, random_int_secret); + ret = hash[0]; + put_cpu_var(get_random_int_hash); + + return ret; +} +EXPORT_SYMBOL(get_random_int); + +/* + * randomize_range() returns a start address such that + * + * [...... <range> .....] + * start end + * + * a <range> with size "len" starting at the return value is inside in the + * area defined by [start, end], but is otherwise randomized. + */ +unsigned long +randomize_range(unsigned long start, unsigned long end, unsigned long len) +{ + unsigned long range = end - len - start; + + if (end <= start + len) + return 0; + return PAGE_ALIGN(get_random_int() % range + start); +} diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h new file mode 100644 index 0000000..14a8ff2 --- /dev/null +++ b/include/linux/sysctl.h @@ -0,0 +1,217 @@ +/* + * sysctl.h: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + * + **************************************************************** + **************************************************************** + ** + ** WARNING: + ** The values in this file are exported to user space via + ** the sysctl() binary interface. Do *NOT* change the + ** numbering of any existing values here, and do not change + ** any numbers within any one set of values. If you have to + ** redefine an existing interface, use a new number for it. + ** The kernel will then return -ENOTDIR to any application using + ** the old binary interface. + ** + **************************************************************** + **************************************************************** + */ +#ifndef _LINUX_SYSCTL_H +#define _LINUX_SYSCTL_H + +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/wait.h> +#include <linux/rbtree.h> +#include <uapi/linux/sysctl.h> + +/* For the /proc/sys support */ +struct ctl_table; +struct nsproxy; +struct ctl_table_root; +struct ctl_table_header; +struct ctl_dir; + +typedef struct ctl_table ctl_table; + +typedef int proc_handler (struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + +extern int proc_dostring(struct ctl_table *, int, + void __user *, size_t *, loff_t *); +extern int proc_dointvec(struct ctl_table *, int, + void __user *, size_t *, loff_t *); +extern int proc_dointvec_minmax(struct ctl_table *, int, + void __user *, size_t *, loff_t *); +extern int proc_dointvec_jiffies(struct ctl_table *, int, + void __user *, size_t *, loff_t *); +extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, + void __user *, size_t *, loff_t *); +extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, + void __user *, size_t *, loff_t *); +extern int proc_doulongvec_minmax(struct ctl_table *, int, + void __user *, size_t *, loff_t *); +extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, + void __user *, size_t *, loff_t *); +extern int proc_do_large_bitmap(struct ctl_table *, int, + void __user *, size_t *, loff_t *); + +/* + * Register a set of sysctl names by calling register_sysctl_table + * with an initialised array of struct ctl_table's. An entry with + * NULL procname terminates the table. table->de will be + * set up by the registration and need not be initialised in advance. + * + * sysctl names can be mirrored automatically under /proc/sys. The + * procname supplied controls /proc naming. + * + * The table's mode will be honoured both for sys_sysctl(2) and + * proc-fs access. + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. A + * null procname disables /proc mirroring at this node. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases. + */ + +/* Support for userspace poll() to watch for changes */ +struct ctl_table_poll { + atomic_t event; + wait_queue_head_t wait; +}; + +static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) +{ + return (void *)(unsigned long)atomic_read(&poll->event); +} + +#define __CTL_TABLE_POLL_INITIALIZER(name) { \ + .event = ATOMIC_INIT(0), \ + .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) } + +#define DEFINE_CTL_TABLE_POLL(name) \ + struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name) + +/* A sysctl table is an array of struct ctl_table: */ +struct ctl_table +{ + const char *procname; /* Text ID for /proc/sys, or zero */ + void *data; + int maxlen; + umode_t mode; + struct ctl_table *child; /* Deprecated */ + proc_handler *proc_handler; /* Callback for text formatting */ + struct ctl_table_poll *poll; + void *extra1; + void *extra2; +}; + +struct ctl_node { + struct rb_node node; + struct ctl_table_header *header; +}; + +/* struct ctl_table_header is used to maintain dynamic lists of + struct ctl_table trees. */ +struct ctl_table_header +{ + union { + struct { + struct ctl_table *ctl_table; + int used; + int count; + int nreg; + }; + struct rcu_head rcu; + }; + struct completion *unregistering; + struct ctl_table *ctl_table_arg; + struct ctl_table_root *root; + struct ctl_table_set *set; + struct ctl_dir *parent; + struct ctl_node *node; +}; + +struct ctl_dir { + /* Header must be at the start of ctl_dir */ + struct ctl_table_header header; + struct rb_root root; +}; + +struct ctl_table_set { + int (*is_seen)(struct ctl_table_set *); + struct ctl_dir dir; +}; + +struct ctl_table_root { + struct ctl_table_set default_set; + struct ctl_table_set *(*lookup)(struct ctl_table_root *root, + struct nsproxy *namespaces); + int (*permissions)(struct ctl_table_header *head, struct ctl_table *table); +}; + +/* struct ctl_path describes where in the hierarchy a table is added */ +struct ctl_path { + const char *procname; +}; + +#ifdef CONFIG_SYSCTL + +void proc_sys_poll_notify(struct ctl_table_poll *poll); + +extern void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_root *root, + int (*is_seen)(struct ctl_table_set *)); +extern void retire_sysctl_set(struct ctl_table_set *set); + +void register_sysctl_root(struct ctl_table_root *root); +struct ctl_table_header *__register_sysctl_table( + struct ctl_table_set *set, + const char *path, struct ctl_table *table); +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_set *set, + const struct ctl_path *path, struct ctl_table *table); +struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table); +struct ctl_table_header *register_sysctl_table(struct ctl_table * table); +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table); + +void unregister_sysctl_table(struct ctl_table_header * table); + +extern int sysctl_init(void); +#else /* CONFIG_SYSCTL */ +static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) +{ + return NULL; +} + +static inline struct ctl_table_header *register_sysctl_paths( + const struct ctl_path *path, struct ctl_table *table) +{ + return NULL; +} + +static inline void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +static inline void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_root *root, + int (*is_seen)(struct ctl_table_set *)) +{ +} + +#endif /* CONFIG_SYSCTL */ + +#endif /* _LINUX_SYSCTL_H */ diff --git a/include/trace/events/random.h b/include/trace/events/random.h new file mode 100644 index 0000000..422df19 --- /dev/null +++ b/include/trace/events/random.h @@ -0,0 +1,134 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM random + +#if !defined(_TRACE_RANDOM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RANDOM_H + +#include <linux/writeback.h> +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(random__mix_pool_bytes, + TP_PROTO(const char *pool_name, int bytes, unsigned long IP), + + TP_ARGS(pool_name, bytes, IP), + + TP_STRUCT__entry( + __field( const char *, pool_name ) + __field( int, bytes ) + __field(unsigned long, IP ) + ), + + TP_fast_assign( + __entry->pool_name = pool_name; + __entry->bytes = bytes; + __entry->IP = IP; + ), + + TP_printk("%s pool: bytes %d caller %pF", + __entry->pool_name, __entry->bytes, (void *)__entry->IP) +); + +DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes, + TP_PROTO(const char *pool_name, int bytes, unsigned long IP), + + TP_ARGS(pool_name, bytes, IP) +); + +DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes_nolock, + TP_PROTO(const char *pool_name, int bytes, unsigned long IP), + + TP_ARGS(pool_name, bytes, IP) +); + +TRACE_EVENT(credit_entropy_bits, + TP_PROTO(const char *pool_name, int bits, int entropy_count, + int entropy_total, unsigned long IP), + + TP_ARGS(pool_name, bits, entropy_count, entropy_total, IP), + + TP_STRUCT__entry( + __field( const char *, pool_name ) + __field( int, bits ) + __field( int, entropy_count ) + __field( int, entropy_total ) + __field(unsigned long, IP ) + ), + + TP_fast_assign( + __entry->pool_name = pool_name; + __entry->bits = bits; + __entry->entropy_count = entropy_count; + __entry->entropy_total = entropy_total; + __entry->IP = IP; + ), + + TP_printk("%s pool: bits %d entropy_count %d entropy_total %d " + "caller %pF", __entry->pool_name, __entry->bits, + __entry->entropy_count, __entry->entropy_total, + (void *)__entry->IP) +); + +TRACE_EVENT(get_random_bytes, + TP_PROTO(int nbytes, unsigned long IP), + + TP_ARGS(nbytes, IP), + + TP_STRUCT__entry( + __field( int, nbytes ) + __field(unsigned long, IP ) + ), + + TP_fast_assign( + __entry->nbytes = nbytes; + __entry->IP = IP; + ), + + TP_printk("nbytes %d caller %pF", __entry->nbytes, (void *)__entry->IP) +); + +DECLARE_EVENT_CLASS(random__extract_entropy, + TP_PROTO(const char *pool_name, int nbytes, int entropy_count, + unsigned long IP), + + TP_ARGS(pool_name, nbytes, entropy_count, IP), + + TP_STRUCT__entry( + __field( const char *, pool_name ) + __field( int, nbytes ) + __field( int, entropy_count ) + __field(unsigned long, IP ) + ), + + TP_fast_assign( + __entry->pool_name = pool_name; + __entry->nbytes = nbytes; + __entry->entropy_count = entropy_count; + __entry->IP = IP; + ), + + TP_printk("%s pool: nbytes %d entropy_count %d caller %pF", + __entry->pool_name, __entry->nbytes, __entry->entropy_count, + (void *)__entry->IP) +); + + +DEFINE_EVENT(random__extract_entropy, extract_entropy, + TP_PROTO(const char *pool_name, int nbytes, int entropy_count, + unsigned long IP), + + TP_ARGS(pool_name, nbytes, entropy_count, IP) +); + +DEFINE_EVENT(random__extract_entropy, extract_entropy_user, + TP_PROTO(const char *pool_name, int nbytes, int entropy_count, + unsigned long IP), + + TP_ARGS(pool_name, nbytes, entropy_count, IP) +); + + + +#endif /* _TRACE_RANDOM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/sysctl.c b/kernel/sysctl.c new file mode 100644 index 0000000..07f6fc4 --- /dev/null +++ b/kernel/sysctl.c @@ -0,0 +1,2671 @@ +/* + * sysctl.c: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + * Added /proc support, Dec 1995 + * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. + * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. + * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. + * Dynamic registration fixes, Stephen Tweedie. + * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. + * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris + * Horn. + * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. + * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. + * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill + * Wendling. + * The list_for_each() macro wasn't appropriate for the sysctl loop. + * Removed it and replaced it with older style, 03/23/00, Bill Wendling + */ + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <linux/sysctl.h> +#include <linux/bitmap.h> +#include <linux/signal.h> +#include <linux/printk.h> +#include <linux/proc_fs.h> +#include <linux/security.h> +#include <linux/ctype.h> +#include <linux/kmemcheck.h> +#include <linux/kmemleak.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/kobject.h> +#include <linux/net.h> +#include <linux/sysrq.h> +#include <linux/highuid.h> +#include <linux/writeback.h> +#include <linux/ratelimit.h> +#include <linux/compaction.h> +#include <linux/hugetlb.h> +#include <linux/initrd.h> +#include <linux/key.h> +#include <linux/times.h> +#include <linux/limits.h> +#include <linux/dcache.h> +#include <linux/dnotify.h> +#include <linux/syscalls.h> +#include <linux/vmstat.h> +#include <linux/nfs_fs.h> +#include <linux/acpi.h> +#include <linux/reboot.h> +#include <linux/ftrace.h> +#include <linux/perf_event.h> +#include <linux/kprobes.h> +#include <linux/pipe_fs_i.h> +#include <linux/oom.h> +#include <linux/kmod.h> +#include <linux/capability.h> +#include <linux/binfmts.h> +#include <linux/sched/sysctl.h> + +#include <asm/uaccess.h> +#include <asm/processor.h> + +#ifdef CONFIG_X86 +#include <asm/nmi.h> +#include <asm/stacktrace.h> +#include <asm/io.h> +#endif +#ifdef CONFIG_SPARC +#include <asm/setup.h> +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT +#include <linux/acct.h> +#endif +#ifdef CONFIG_RT_MUTEXES +#include <linux/rtmutex.h> +#endif +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) +#include <linux/lockdep.h> +#endif +#ifdef CONFIG_CHR_DEV_SG +#include <scsi/sg.h> +#endif + +#ifdef CONFIG_LOCKUP_DETECTOR +#include <linux/nmi.h> +#endif + + +#if defined(CONFIG_SYSCTL) + +/* External variables not in a header file. */ +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; +extern int max_threads; +extern int suid_dumpable; +#ifdef CONFIG_COREDUMP +extern int core_uses_pid; +extern char core_pattern[]; +extern unsigned int core_pipe_limit; +#endif +extern int pid_max; +extern int pid_max_min, pid_max_max; +extern int percpu_pagelist_fraction; +extern int compat_log; +extern int latencytop_enabled; +extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifndef CONFIG_MMU +extern int sysctl_nr_trim_pages; +#endif +#ifdef CONFIG_BLOCK +extern int blk_iopoll_enabled; +#endif + +/* Constants used for minimum and maximum */ +#ifdef CONFIG_LOCKUP_DETECTOR +static int sixty = 60; +#endif + +static int zero; +static int __maybe_unused one = 1; +static int __maybe_unused two = 2; +static int __maybe_unused three = 3; +static unsigned long one_ul = 1; +static int one_hundred = 100; +#ifdef CONFIG_PRINTK +static int ten_thousand = 10000; +#endif + +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; + +/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +static int maxolduid = 65535; +static int minolduid; +static int min_percpu_pagelist_fract = 8; + +static int ngroups_max = NGROUPS_MAX; +static const int cap_last_cap = CAP_LAST_CAP; + +#ifdef CONFIG_INOTIFY_USER +#include <linux/inotify.h> +#endif +#ifdef CONFIG_SPARC +#endif + +#ifdef CONFIG_SPARC64 +extern int sysctl_tsb_ratio; +#endif + +#ifdef __hppa__ +extern int pwrsw_enabled; +#endif + +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW +extern int unaligned_enabled; +#endif + +#ifdef CONFIG_IA64 +extern int unaligned_dump_stack; +#endif + +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN +extern int no_unaligned_warning; +#endif + +#ifdef CONFIG_PROC_SYSCTL +static int proc_do_cad_pid(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_taint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + +#ifdef CONFIG_PRINTK +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_COREDUMP +static int proc_dostring_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +/* Note: sysrq code uses it's own private copy */ +static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; + +static int sysrq_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int error; + + error = proc_dointvec(table, write, buffer, lenp, ppos); + if (error) + return error; + + if (write) + sysrq_toggle_support(__sysrq_enabled); + + return 0; +} + +#endif + +static struct ctl_table kern_table[]; +static struct ctl_table vm_table[]; +static struct ctl_table fs_table[]; +static struct ctl_table debug_table[]; +static struct ctl_table dev_table[]; +extern struct ctl_table random_table[]; +#ifdef CONFIG_EPOLL +extern struct ctl_table epoll_table[]; +#endif + +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +int sysctl_legacy_va_layout; +#endif + +/* The default sysctl tables: */ + +static struct ctl_table sysctl_base_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + { } +}; + +#ifdef CONFIG_SCHED_DEBUG +static int min_sched_granularity_ns = 100000; /* 100 usecs */ +static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_wakeup_granularity_ns; /* 0 usecs */ +static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */ + +#ifdef CONFIG_COMPACTION +static int min_extfrag_threshold; +static int max_extfrag_threshold = 1000; +#endif + +static struct ctl_table kern_table[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SCHED_DEBUG + { + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_wakeup_granularity_ns", + .data = &sysctl_sched_wakeup_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = &max_wakeup_granularity_ns, + }, +#ifdef CONFIG_SMP + { + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, + }, + { + .procname = "sched_migration_cost_ns", + .data = &sysctl_sched_migration_cost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_nr_migrate", + .data = &sysctl_sched_nr_migrate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_time_avg_ms", + .data = &sysctl_sched_time_avg, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_shares_window_ns", + .data = &sysctl_sched_shares_window, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_reset", + .data = &sysctl_numa_balancing_scan_period_reset, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, +#ifdef CONFIG_SCHED_AUTOGROUP + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, +#endif +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_COREDUMP + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_taint, + }, +#endif +#ifdef CONFIG_LATENCYTOP + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_BLK_DEV_INITRD + { + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SPARC + { + .procname = "reboot-cmd", + .data = reboot_command, + .maxlen = 256, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "stop-a", + .data = &stop_a_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "scons-poweroff", + .data = &scons_pwroff, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SPARC64 + { + .procname = "tsb-ratio", + .data = &sysctl_tsb_ratio, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef __hppa__ + { + .procname = "soft-power", + .data = &pwrsw_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW + { + .procname = "unaligned-trap", + .data = &unaligned_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_FUNCTION_TRACER + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, +#endif +#ifdef CONFIG_STACK_TRACER + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +#endif +#ifdef CONFIG_TRACING + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MODULES + { + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, +#endif + + { + .procname = "hotplug", + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + +#ifdef CONFIG_CHR_DEV_SG + { + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT + { + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .procname = "sysrq", + .data = &__sysrq_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = sysrq_sysctl_handler, + }, +#endif +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "cad_pid", + .data = NULL, + .maxlen = sizeof (int), + .mode = 0600, + .proc_handler = proc_do_cad_pid, + }, +#endif + { + .procname = "threads-max", + .data = &max_threads, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "random", + .mode = 0555, + .child = random_table, + }, + { + .procname = "usermodehelper", + .mode = 0555, + .child = usermodehelper_table, + }, + { + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_S390 +#ifdef CONFIG_MATHEMU + { + .procname = "ieee_emulation_warnings", + .data = &sysctl_ieee_emulation_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "pid_max", + .data = &pid_max, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, + { + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#if defined CONFIG_PRINTK + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &ten_thousand, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = &zero, + .extra2 = &two, + }, +#endif + { + .procname = "ngroups_max", + .data = &ngroups_max, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "cap_last_cap", + .data = (void *)&cap_last_cap, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#if defined(CONFIG_LOCKUP_DETECTOR) + { + .procname = "watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &sixty, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "nmi_watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, + }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_X86) + { + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "kstack_depth_to_print", + .data = &kstack_depth_to_print, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_MMU) + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) + { + .procname = "acpi_video_flags", + .data = &acpi_realmode_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN + { + .procname = "ignore-unaligned-usertrap", + .data = &no_unaligned_warning, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_IA64 + { + .procname = "unaligned-dump-stack", + .data = &unaligned_dump_stack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_COMPAT + { + .procname = "compat-log", + .data = &compat_log, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_RT_MUTEXES + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, +#ifdef CONFIG_KEYS + { + .procname = "keys", + .mode = 0555, + .child = key_sysctls, + }, +#endif +#ifdef CONFIG_RCU_TORTURE_TEST + { + .procname = "rcutorture_runnable", + .data = &rcutorture_runnable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_PERF_EVENTS + /* + * User-space scripts rely on the existence of this file + * as a feature check for perf_events being enabled. + * + * So it's an ABI, do not remove! + */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_proc_update_handler, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#endif +#ifdef CONFIG_KMEMCHECK + { + .procname = "kmemcheck", + .data = &kmemcheck_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_BLOCK + { + .procname = "blk_iopoll", + .data = &blk_iopoll_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { } +}; + +static struct ctl_table vm_table[] = { + { + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { + .procname = "oom_kill_allocating_task", + .data = &sysctl_oom_kill_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = &one_ul, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = &dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "nr_pdflush_threads", + .mode = 0444 /* read-only */, + .proc_handler = pdflush_proc_obsolete, + }, + { + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#ifdef CONFIG_HUGETLB_PAGE + { + .procname = "nr_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#endif + { + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "hugepages_treat_as_movable", + .data = &hugepages_treat_as_movable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = hugetlb_treat_movable_handler, + }, + { + .procname = "nr_overcommit_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_overcommit_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#endif + { + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, + { + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = drop_caches_sysctl_handler, + .extra1 = &one, + .extra2 = &three, + }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_extfrag_handler, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + +#endif /* CONFIG_COMPACTION */ + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = &zero, + }, + { + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_fraction_sysctl_handler, + .extra1 = &min_percpu_pagelist_fract, + }, +#ifdef CONFIG_MMU + { + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, +#else + { + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "block_dump", + .data = &block_dump, + .maxlen = sizeof(block_dump), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT + { + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &zone_reclaim_mode, + .maxlen = sizeof(zone_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#endif +#ifdef CONFIG_SMP + { + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif +#ifdef CONFIG_MMU + { + .procname = "mmap_min_addr", + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = mmap_min_addr_handler, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, +#endif +#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ + (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) + { + .procname = "vdso_enabled", + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, +#endif +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + { + .procname = "scan_unevictable_pages", + .data = &scan_unevictable_pages, + .maxlen = sizeof(scan_unevictable_pages), + .mode = 0644, + .proc_handler = scan_unevictable_handler, + }, +#ifdef CONFIG_MEMORY_FAILURE + { + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { } +}; + +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) +static struct ctl_table binfmt_misc_table[] = { + { } +}; +#endif + +static struct ctl_table fs_table[] = { + { + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(int), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(int), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "file-nr", + .data = &files_stat, + .maxlen = sizeof(files_stat), + .mode = 0444, + .proc_handler = proc_nr_files, + }, + { + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(files_stat.max_files), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, + }, + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(int), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_FILE_LOCKING + { + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DNOTIFY + { + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING + { + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_AIO + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif /* CONFIG_AIO */ +#ifdef CONFIG_INOTIFY_USER + { + .procname = "inotify", + .mode = 0555, + .child = inotify_table, + }, +#endif +#ifdef CONFIG_EPOLL + { + .procname = "epoll", + .mode = 0555, + .child = epoll_table, + }, +#endif +#endif + { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_coredump, + .extra1 = &zero, + .extra2 = &two, + }, +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) + { + .procname = "binfmt_misc", + .mode = 0555, + .child = binfmt_misc_table, + }, +#endif + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, + }, + { } +}; + +static struct ctl_table debug_table[] = { +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE + { + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + { } +}; + +static struct ctl_table dev_table[] = { + { } +}; + +int __init sysctl_init(void) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_table(sysctl_base_table); + kmemleak_not_leak(hdr); + return 0; +} + +#endif /* CONFIG_SYSCTL */ + +/* + * /proc/sys support + */ + +#ifdef CONFIG_PROC_SYSCTL + +static int _proc_do_string(void* data, int maxlen, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + size_t len; + char __user *p; + char c; + + if (!data || !maxlen || !*lenp) { + *lenp = 0; + return 0; + } + + if (write) { + len = 0; + p = buffer; + while (len < *lenp) { + if (get_user(c, p++)) + return -EFAULT; + if (c == 0 || c == '\n') + break; + len++; + } + if (len >= maxlen) + len = maxlen-1; + if(copy_from_user(data, buffer, len)) + return -EFAULT; + ((char *) data)[len] = 0; + *ppos += *lenp; + } else { + len = strlen(data); + if (len > maxlen) + len = maxlen; + + if (*ppos > len) { + *lenp = 0; + return 0; + } + + data += *ppos; + len -= *ppos; + + if (len > *lenp) + len = *lenp; + if (len) + if(copy_to_user(buffer, data, len)) + return -EFAULT; + if (len < *lenp) { + if(put_user('\n', ((char __user *) buffer) + len)) + return -EFAULT; + len++; + } + *lenp = len; + *ppos += len; + } + return 0; +} + +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return _proc_do_string(table->data, table->maxlen, write, + buffer, lenp, ppos); +} + +static size_t proc_skip_spaces(char **buf) +{ + size_t ret; + char *tmp = skip_spaces(*buf); + ret = tmp - *buf; + *buf = tmp; + return ret; +} + +static void proc_skip_char(char **buf, size_t *size, const char v) +{ + while (*size) { + if (**buf != v) + break; + (*size)--; + (*buf)++; + } +} + +#define TMPBUFLEN 22 +/** + * proc_get_long - reads an ASCII formatted integer from a user buffer + * + * @buf: a kernel buffer + * @size: size of the kernel buffer + * @val: this is where the number will be stored + * @neg: set to %TRUE if number is negative + * @perm_tr: a vector which contains the allowed trailers + * @perm_tr_len: size of the perm_tr vector + * @tr: pointer to store the trailer character + * + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes read. If @tr is non-NULL and a trailing + * character exists (size is non-zero after returning from this + * function), @tr is updated with the trailing character. + */ +static int proc_get_long(char **buf, size_t *size, + unsigned long *val, bool *neg, + const char *perm_tr, unsigned perm_tr_len, char *tr) +{ + int len; + char *p, tmp[TMPBUFLEN]; + + if (!*size) + return -EINVAL; + + len = *size; + if (len > TMPBUFLEN - 1) + len = TMPBUFLEN - 1; + + memcpy(tmp, *buf, len); + + tmp[len] = 0; + p = tmp; + if (*p == '-' && *size > 1) { + *neg = true; + p++; + } else + *neg = false; + if (!isdigit(*p)) + return -EINVAL; + + *val = simple_strtoul(p, &p, 0); + + len = p - tmp; + + /* We don't know if the next char is whitespace thus we may accept + * invalid integers (e.g. 1234...a) or two integers instead of one + * (e.g. 123...1). So lets not allow such large numbers. */ + if (len == TMPBUFLEN - 1) + return -EINVAL; + + if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) + return -EINVAL; + + if (tr && (len < *size)) + *tr = *p; + + *buf += len; + *size -= len; + + return 0; +} + +/** + * proc_put_long - converts an integer to a decimal ASCII formatted string + * + * @buf: the user buffer + * @size: the size of the user buffer + * @val: the integer to be converted + * @neg: sign of the number, %TRUE for negative + * + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes written. + */ +static int proc_put_long(void __user **buf, size_t *size, unsigned long val, + bool neg) +{ + int len; + char tmp[TMPBUFLEN], *p = tmp; + + sprintf(p, "%s%lu", neg ? "-" : "", val); + len = strlen(tmp); + if (len > *size) + len = *size; + if (copy_to_user(*buf, tmp, len)) + return -EFAULT; + *size -= len; + *buf += len; + return 0; +} +#undef TMPBUFLEN + +static int proc_put_char(void __user **buf, size_t *size, char c) +{ + if (*size) { + char __user **buffer = (char __user **)buf; + if (put_user(c, *buffer)) + return -EFAULT; + (*size)--, (*buffer)++; + *buf = *buffer; + } + return 0; +} + +static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = *negp ? -*lvalp : *lvalp; + } else { + int val = *valp; + if (val < 0) { + *negp = true; + *lvalp = (unsigned long)-val; + } else { + *negp = false; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; + +static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + int *i, vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (int *) tbl_data; + vleft = table->maxlen / sizeof(*i); + left = *lenp; + + if (!conv) + conv = do_proc_dointvec_conv; + + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + + for (; left && vleft--; i++, first=0) { + unsigned long lval; + bool neg; + + if (write) { + left -= proc_skip_spaces(&kbuf); + + if (!left) + break; + err = proc_get_long(&kbuf, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) + break; + if (conv(&neg, &lval, i, 1, data)) { + err = -EINVAL; + break; + } + } else { + if (conv(&neg, &lval, i, 0, data)) { + err = -EINVAL; + break; + } + if (!first) + err = proc_put_char(&buffer, &left, '\t'); + if (err) + break; + err = proc_put_long(&buffer, &left, lval, neg); + if (err) + break; + } + } + + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err && left) + left -= proc_skip_spaces(&kbuf); +free: + if (write) { + free_page(page); + if (first) + return err ? : -EINVAL; + } + *lenp -= left; + *ppos += *lenp; + return err; +} + +static int do_proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + +/** + * proc_dointvec - read a vector of integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,buffer,lenp,ppos, + NULL,NULL); +} + +/* + * Taint values can only be increased + * This means we can safely use a temporary. + */ +static int proc_taint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i, LOCKDEP_STILL_OK); + } + } + + return err; +} + +#ifdef CONFIG_PRINTK +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +struct do_proc_dointvec_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + struct do_proc_dointvec_minmax_conv_param *param = data; + if (write) { + int val = *negp ? -*lvalp : *lvalp; + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -EINVAL; + *valp = val; + } else { + int val = *valp; + if (val < 0) { + *negp = true; + *lvalp = (unsigned long)-val; + } else { + *negp = false; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +/** + * proc_dointvec_minmax - read a vector of integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_dointvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_minmax_conv, ¶m); +} + +static void validate_coredump_safety(void) +{ +#ifdef CONFIG_COREDUMP + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + printk(KERN_WARNING "Unsafe core_pattern used with "\ + "suid_dumpable=2. Pipe handler or fully qualified "\ + "core dump path required.\n"); + } +#endif +} + +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} + +#ifdef CONFIG_COREDUMP +static int proc_dostring_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} +#endif + +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ + unsigned long *i, *min, *max; + int vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; + + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned long *) data; + min = (unsigned long *) table->extra1; + max = (unsigned long *) table->extra2; + vleft = table->maxlen / sizeof(unsigned long); + left = *lenp; + + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + + for (; left && vleft--; i++, first = 0) { + unsigned long val; + + if (write) { + bool neg; + + left -= proc_skip_spaces(&kbuf); + + err = proc_get_long(&kbuf, &left, &val, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) + break; + if (neg) + continue; + if ((min && val < *min) || (max && val > *max)) + continue; + *i = val; + } else { + val = convdiv * (*i) / convmul; + if (!first) + err = proc_put_char(&buffer, &left, '\t'); + err = proc_put_long(&buffer, &left, val, false); + if (err) + break; + } + } + + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&kbuf); +free: + if (write) { + free_page(page); + if (first) + return err ? : -EINVAL; + } + *lenp -= left; + *ppos += *lenp; + return err; +} + +static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ + return __do_proc_doulongvec_minmax(table->data, table, write, + buffer, lenp, ppos, convmul, convdiv); +} + +/** + * proc_doulongvec_minmax - read a vector of long integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, buffer, + lenp, ppos, HZ, 1000l); +} + + +static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*lvalp > LONG_MAX / HZ) + return 1; + *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = (unsigned long)-val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + return 0; +} + +static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) + return 1; + *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = (unsigned long)-val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_clock_t(lval); + } + return 0; +} + +static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); + + if (jif > INT_MAX) + return 1; + *valp = (int)jif; + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = (unsigned long)-val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_msecs(lval); + } + return 0; +} + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,buffer,lenp,ppos, + do_proc_dointvec_jiffies_conv,NULL); +} + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: pointer to the file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,buffer,lenp,ppos, + do_proc_dointvec_userhz_jiffies_conv,NULL); +} + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * @ppos: the current position in the file + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_ms_jiffies_conv, NULL); +} + +static int proc_do_cad_pid(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp; + int r; + + tmp = pid_vnr(cad_pid); + + r = __do_proc_dointvec(&tmp, table, write, buffer, + lenp, ppos, NULL, NULL); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + +/** + * proc_do_large_bitmap - read/write from/to a large bitmap + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * The bitmap is stored at table->data and the bitmap length (in bits) + * in table->maxlen. + * + * We use a range comma separated format (e.g. 1,3-4,10-10) so that + * large bitmaps may be represented in a compact manner. Writing into + * the file will clear the bitmap then update it with the given input. + * + * Returns 0 on success. + */ +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + bool first = 1; + size_t left = *lenp; + unsigned long bitmap_len = table->maxlen; + unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *tmp_bitmap = NULL; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + + if (!bitmap_len || !left || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + unsigned long page = 0; + char *kbuf; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + free_page(page); + return -EFAULT; + } + kbuf[left] = 0; + + tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), + GFP_KERNEL); + if (!tmp_bitmap) { + free_page(page); + return -ENOMEM; + } + proc_skip_char(&kbuf, &left, '\n'); + while (!err && left) { + unsigned long val_a, val_b; + bool neg; + + err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + sizeof(tr_a), &c); + if (err) + break; + if (val_a >= bitmap_len || neg) { + err = -EINVAL; + break; + } + + val_b = val_a; + if (left) { + kbuf++; + left--; + } + + if (c == '-') { + err = proc_get_long(&kbuf, &left, &val_b, + &neg, tr_b, sizeof(tr_b), + &c); + if (err) + break; + if (val_b >= bitmap_len || neg || + val_a > val_b) { + err = -EINVAL; + break; + } + if (left) { + kbuf++; + left--; + } + } + + bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); + first = 0; + proc_skip_char(&kbuf, &left, '\n'); + } + free_page(page); + } else { + unsigned long bit_a, bit_b = 0; + + while (left) { + bit_a = find_next_bit(bitmap, bitmap_len, bit_b); + if (bit_a >= bitmap_len) + break; + bit_b = find_next_zero_bit(bitmap, bitmap_len, + bit_a + 1) - 1; + + if (!first) { + err = proc_put_char(&buffer, &left, ','); + if (err) + break; + } + err = proc_put_long(&buffer, &left, bit_a, false); + if (err) + break; + if (bit_a != bit_b) { + err = proc_put_char(&buffer, &left, '-'); + if (err) + break; + err = proc_put_long(&buffer, &left, bit_b, false); + if (err) + break; + } + + first = 0; bit_b++; + } + if (!err) + err = proc_put_char(&buffer, &left, '\n'); + } + + if (!err) { + if (write) { + if (*ppos) + bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); + else + bitmap_copy(bitmap, tmp_bitmap, bitmap_len); + } + kfree(tmp_bitmap); + *lenp -= left; + *ppos += *lenp; + return 0; + } else { + kfree(tmp_bitmap); + return err; + } +} + +#else /* CONFIG_PROC_SYSCTL */ + +int proc_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + + +#endif /* CONFIG_PROC_SYSCTL */ + +/* + * No sense putting this after each symbol definition, twice, + * exception granted :-) + */ +EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_dointvec_jiffies); +EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); +EXPORT_SYMBOL(proc_dointvec_ms_jiffies); +EXPORT_SYMBOL(proc_dostring); +EXPORT_SYMBOL(proc_doulongvec_minmax); +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); |