diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bf07b6f..a8d902f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -594,6 +594,23 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_COOPREALTIME + bool "UBC fairshare coop scheduler support" + default y + depends on HIGH_RES_TIMERS + help + Introduces a new scheduling class faircoop, which uses + a explicit deadline information received from the user application + to reduce scheduling latency to almost zero + +config SCHED_COOP_NANOSLEEP + bool "UBC fairshare super accurate nanosleep" + default y + depends on SCHED_COOPREALTIME + help + Converts nanosleep calls to partial coop poll calls, reducing + scheduling latency to almost zero + source "kernel/Kconfig.preempt" config X86_UP_APIC diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index adff556..ad5a20f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -284,7 +284,7 @@ ENTRY(sys_call_table) .long sys_mq_getsetattr .long sys_kexec_load .long sys_waitid - .long sys_ni_syscall /* 285 */ /* available */ + .long sys_coop_poll /* 285 */ .long sys_add_key .long sys_request_key .long sys_keyctl diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b..96f73e8 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -109,6 +109,8 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); } +EXPORT_SYMBOL(cpa_flush_all); + static void __cpa_flush_range(void *arg) { /* diff --git a/fs/proc/array.c b/fs/proc/array.c index 797d775..45d1c3d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -335,7 +335,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, /* * Use precise platform statistics if available: */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#if defined (CONFIG_VIRT_CPU_ACCOUNTING) static cputime_t task_utime(struct task_struct *p) { return p->utime; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 7e277f2..0bde186 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -904,4 +904,11 @@ void __init proc_misc_init(void) #ifdef CONFIG_MAGIC_SYSRQ proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); #endif + +#if defined (CONFIG_SCHED_COOPREALTIME) + proc_create("bvtstat",S_IRUGO , NULL, &proc_bvtstat_operations); + proc_create("coopstat",S_IRUGO, NULL, &proc_coopstat_operations); +#endif + + } diff --git a/fs/select.c b/fs/select.c index da0e882..6201459 100644 --- a/fs/select.c +++ b/fs/select.c @@ -25,6 +25,10 @@ #include #include +#if defined(CONFIG_SCHED_COOPREALTIME) +#include +#endif + #include struct poll_table_page { @@ -261,6 +265,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) wait = NULL; if (retval || !*timeout || signal_pending(current)) break; + if (table.error) { retval = table.error; break; @@ -288,6 +293,107 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) return retval; } +#if defined(CONFIG_SCHED_COOPREALTIME) +int high_res_select(int n, fd_set_bits *fds, ktime_t *timeout) +{ + struct poll_wqueues table; + poll_table *wait; + int retval, i; + struct timespec remaining; + int ret = 1; + rcu_read_lock(); + retval = max_select_fd(n, fds); + rcu_read_unlock(); + + if (retval < 0) + return retval; + n = retval; + + poll_initwait(&table); + wait = &table.pt; + if (timeout && !timeout->tv64) + wait = NULL; + retval = 0; + for (;;) { + unsigned long 
*rinp, *routp, *rexp, *inp, *outp, *exp; + + set_current_state(TASK_INTERRUPTIBLE); + + inp = fds->in; outp = fds->out; exp = fds->ex; + rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; + + for (i = 0; i < n; ++rinp, ++routp, ++rexp) { + unsigned long in, out, ex, all_bits, bit = 1, mask, j; + unsigned long res_in = 0, res_out = 0, res_ex = 0; + const struct file_operations *f_op = NULL; + struct file *file = NULL; + + in = *inp++; out = *outp++; ex = *exp++; + all_bits = in | out | ex; + if (all_bits == 0) { + i += __NFDBITS; + continue; + } + + for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) { + int fput_needed; + if (i >= n) + break; + if (!(bit & all_bits)) + continue; + file = fget_light(i, &fput_needed); + if (file) { + f_op = file->f_op; + mask = DEFAULT_POLLMASK; + if (f_op && f_op->poll) + mask = (*f_op->poll)(file, retval ? NULL : wait); + fput_light(file, fput_needed); + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + } + } + cond_resched(); + } + if (res_in) + *rinp = res_in; + if (res_out) + *routp = res_out; + if (res_ex) + *rexp = res_ex; + } + wait = NULL; + if (retval || (timeout && !timeout->tv64) || signal_pending(current) || + (current->cf.bvt_t.pseudo_sleep == 2)) { + break; + } + + if(table.error) { + retval = table.error; + break; + } + + if (!timeout || timeout->tv64 < 0) + schedule(); + else + *timeout = schedule_timeout_hr(*timeout); + } + __set_current_state(TASK_RUNNING); + + poll_freewait(&table); + + return retval; +} +#endif + /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. 
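For reference, a minimal sketch of how a caller might drive high_res_select(); the wrapper below and the timespec-to-ktime conversion are illustrative assumptions, not part of this patch:

/* Illustrative only: hand high_res_select() a relative high-resolution
 * timeout; on return the ktime_t holds the remaining time, much as
 * do_select() writes back its s64 timeout.
 */
static int example_hr_select(int n, fd_set_bits *fds, struct timespec *ts)
{
	ktime_t timeout = timespec_to_ktime(*ts);

	return high_res_select(n, fds, &timeout);
}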
So I return @@ -345,8 +451,9 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || - (ret = get_fd_set(n, exp, fds.ex))) + (ret = get_fd_set(n, exp, fds.ex))) { goto out; + } zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); @@ -364,9 +471,10 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || - set_fd_set(n, exp, fds.res_ex)) + set_fd_set(n, exp, fds.res_ex)) { + printk("%d Error copying out fds\n",current->pid); ret = -EFAULT; - + } out: if (bits != stack_fds) kfree(bits); @@ -398,7 +506,7 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, } ret = core_sys_select(n, inp, outp, exp, &timeout); - + if (tvp) { struct timeval rtv; diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index 8317d94..a54b77c 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -290,6 +290,9 @@ #define __NR_mq_getsetattr (__NR_mq_open+5) #define __NR_kexec_load 283 #define __NR_waitid 284 +#if defined(CONFIG_SCHED_COOPREALTIME) +#define __NR_coop_poll 285 +#endif /* #define __NR_sys_setaltroot 285 */ #define __NR_add_key 286 #define __NR_request_key 287 diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h index fe26e36..6a67d9a 100644 --- a/include/asm-x86/unistd_64.h +++ b/include/asm-x86/unistd_64.h @@ -639,6 +639,8 @@ __SYSCALL(__NR_fallocate, sys_fallocate) __SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) #define __NR_timerfd_gettime 287 __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) +#define __NR_coop_poll 288 +__SYSCALL(__NR_coop_poll, sys_coop_poll) #ifndef __NO_STUBS diff --git a/include/linux/coop_fairshare_sched.h b/include/linux/coop_fairshare_sched.h new file mode 100644 index 0000000..db20f23 --- /dev/null +++ b/include/linux/coop_fairshare_sched.h @@ -0,0 +1,32 @@ +#ifndef _LINUX_COOP_FAIRSHARE_SCHED_H +#define _LINUX_COOP_FAIRSHARE_SCHED_H + +#ifdef __KERNEL__ + +/* This header defines all the structures that are + * common for implementing both coop_poll and fairshare + * scheduling. Specific declarations for coop_poll are in + * linux/coop_poll.h and specific declarations for fairshare + * are in linux/bvt_schedule.h + */ + +#include +#include +#include + +/* this is the per task structure embedded in task_struct */ +struct coop_fairshare_struct +{ + struct bvt_domain *bvt_dom; + struct fairshare_sched_param *task_sched_param; + struct coop_struct coop_t; + struct bvt_struct bvt_t; + int dom_id; + struct list_head bvt_procs; +}; + +extern struct bvt_domain bvt_domains[]; + +#endif /* __KERNEL__ */ + +#endif diff --git a/include/linux/coop_poll.h b/include/linux/coop_poll.h new file mode 100644 index 0000000..94161bc --- /dev/null +++ b/include/linux/coop_poll.h @@ -0,0 +1,249 @@ +#ifndef _LINUX_COOP_POLL_H +#define _LINUX_COOP_POLL_H + +#include +#include +#include + +/* The following structures define the param struct for the coop_poll + * system call. This definition must also be exported in the + * userspace. 
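Since the comment above notes that these definitions must also be exported to userspace, it is worth illustrating that the syscall slot differs by architecture in this patch (285 on x86-32, 288 on x86-64). A hedged userspace-side selection might look like:

/* Illustrative userspace snippet, not part of the patch. */
#if defined(__i386__)
# define __NR_coop_poll 285	/* slot reused from the old sys_ni_syscall entry */
#elif defined(__x86_64__)
# define __NR_coop_poll 288
#endif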
+ */
+
+enum COOP_POLL_STATUS
+	{
+		/* Task was well behaved */
+		COOP_POLL_OK,
+		/* Task either was best-effort, or became best effort by specifying DOM_LEAVE */
+		COOP_POLL_BEST_EFFORT,
+		/* Task was demoted for sleeping */
+		COOP_POLL_SLEEP,
+		/* Task was demoted for running too far beyond t_deadline */
+		COOP_POLL_LATE_YIELD,
+		/* Task was demoted because its virtual time was too far
+		 * ahead of others. This should only happen if a task is
+		 * burning too much time within coop timeouts (too many
+		 * deadlines and/or too much time per deadline).
+		 */
+		COOP_POLL_VIRTUAL_TIME
+	};
+
+struct coop_param_t
+{
+	/* deadline for the timeout event */
+	struct timeval t_deadline;
+
+	/* deadline for the asap event */
+	struct timeval t_asap;
+
+	/* asap priority */
+	int p_asap;
+
+	/* whether asap exists */
+	int have_asap;
+};
+
+struct coop_fds_t
+{
+	fd_set *inp;
+	fd_set *outp;
+	fd_set *exp;
+};
+
+struct coop_syscall_t
+{
+	int dom_id;
+	struct coop_param_t __user *i_param;
+	struct coop_param_t __user *o_param;
+	int n; /* max fd in fds */
+	struct coop_fds_t __user *fds;
+	enum COOP_POLL_STATUS *status;
+};
+
+/* called from kernel/exit.c */
+void kill_coop_task(struct task_struct*);
+
+/* called from init/main.c:start_kernel() */
+void coop_init(int);
+
+/* called from kernel/fork.c:copy_process() */
+void coop_proc_init(struct task_struct*);
+
+/* Declarations for the coop_poll syscall.
+ * This file specifies common declarations and
+ * the function call for the coop_poll syscall.
+ *
+ * Anirban Sinha, Charles Krasic {anirbans,krasic}@cs.ubc.ca
+ * Ashvin Goel {ashvin@eece.utoronto.edu}
+ * Nov 2006 - Aug 2007.
+ * Dec 22 2006: We use completions to achieve
+ * the sleeping and waiting semantics for non-policing coop_poll.
+ * Jun 1st, 2007: Organizational changes to separate out the
+ * policing and non-policing versions of coop_poll.
+ * *************************************************************
+ * This header contains declarations that are used
+ * in the other parts of the kernel code for incorporating
+ * coop_poll related changes. It also includes declarations
+ * that must be ported to the user space for using the
+ * coop_poll syscall.
+ * A deliberate attempt has been made to touch the
+ * other parts of the kernel as minimally as possible.
+ * Comments specify which declarations are used in which
+ * other parts of the kernel.
+ * *************************************************************
+ */
+
+/* The following data structures define the data
+ * field of the heap nodes. They are specific to
+ * the different kinds of events we want to keep
+ * track of.
+ */
+
+struct __deadline_node {
+	struct timespec rank;      /* to establish FIFO order */
+	struct timeval t_deadline; /* deadline time,
+				    * this is also the key of the heap node */
+};
+
+struct __asap_node {
+	struct timespec rank;   /* for FIFO order */
+	struct timeval t_asap;  /* asap deadline */
+	int priority;           /* asap priority */
+};
+
+typedef struct __deadline_node deadline_info;
+typedef struct __asap_node asap_info;
+
+/* following definitions are relevant to the heap insertion and
+ * deletion functions */
+#define COOP_TIMEOUT_HEAP 1
+#define COOP_ASAP_HEAP 2
+#define COOP_SLEEP_HEAP 3
+
+typedef heap_t* coop_heap_asap;
+typedef heap_t* coop_heap_deadline;
+
+/* This is the main per CPU coop_poll data structure.
+ * The most important data structure is the
+ * priority queue for the asap and the deadline events.
+ * This is embedded in the struct bvtqueue which is
+ * in itself embedded in the per CPU runqueue.
+ * Note that in this implementation, it is no longer + * in itself a per CPU variable. + */ + +struct _coop_queue +{ + + /* the priority queues for the two kinds of + * events, asap and deadline + */ + + coop_heap_asap heap_asap; + coop_heap_deadline heap_deadline; + + /* This is the heap of all the processes sleeping on the coop + * sleep. */ + + coop_heap_deadline heap_coop_sleep; + + /* The following is used not only for + * stats but also to determine the rank + * of a asap and timeout node in the heap. + * It is incremented per coop_poll syscall + * and is never decremented. + */ + + unsigned long num_yields; + unsigned long num_coop_calls; + unsigned long num_rendezvous0; + unsigned long num_rendezvous1; + unsigned long num_rendezvous2; + + unsigned long num_io_wakeups[8]; + + unsigned long num_rendezvous_wakee; + unsigned long num_rendezvous_waker; + +}; + +typedef struct _coop_queue coop_queue; + +/* coop_struct: This is the per process coop data structure */ + +struct coop_struct +{ + /* These are backpointers to its corresponding + * heap nodes of the ASAP and deadline heaps to + * which this process was a member. + */ + + heap_node *coop_asap_heap_node; + heap_node *coop_deadline_heap_node; + heap_node *coop_sleep_deadline_node; + heap_node *coop_deadline_global_heap_node; + heap_node *coop_sleep_deadline_global_heap_node; + + /* The following boolean value is used to distinguish a coop_poll + * task from a non-coop poll task. + * This flag should never be set by any non-coop poll task for + * its own purposes.' + */ + + int is_coop_task; + + /* The following data structures keep track of the deadline and + * asap parameters + */ + + deadline_info dead_p; + asap_info asap_p; + /* Save the task's current cpu mask onto this*/ + cpumask_t prev_mask; + /* is the task well behaved ? */ + int is_well_behaved; + /* deadline of the coop task saved here */ + struct timespec deadline; + +}; + +/* These two macros set and get the status of the asap event + * They could also be exported to the userspace for a cleaner interface + * though not strictly necessary. 
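To make the exported interface concrete, here is a minimal userspace sketch of yielding through the new syscall; the wrapper name, header availability, and the earlier __NR_coop_poll definition are assumptions, and error handling is omitted:

/* Illustrative only: yield to the kernel with this task's next events. */
#include <sys/syscall.h>
#include <unistd.h>

static int coop_poll_yield(int dom_id, struct coop_param_t *in,
			   struct coop_param_t *out,
			   enum COOP_POLL_STATUS *status)
{
	struct coop_syscall_t args = {
		.dom_id  = dom_id,	/* e.g. DOM_REALTIME_COOP0 */
		.i_param = in,		/* this task's next deadline/asap (input) */
		.o_param = out,		/* parameters reported back by the kernel (output) */
		.n       = 0,		/* no fds being monitored here */
		.fds     = NULL,
		.status  = status,	/* policing verdict, see enum COOP_POLL_STATUS */
	};

	return syscall(__NR_coop_poll, &args);
}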
+ */ + +#define SET_HAVE_NO_ASAP(coop_param) (coop_param).have_asap = 0; +#define SET_HAVE_ASAP(coop_param) (coop_param).have_asap = 1; +#define GET_HAVE_ASAP(coop_param) (coop_param).have_asap +#define GET_HAVE_ASAP_YIELD(coop_param) ((coop_param).have_asap == 2) + +#define is_coop_realtime(tsk) (tsk)->cf.coop_t.is_coop_task +#define set_coop_task(tsk) (tsk)->cf.coop_t.is_coop_task = 1; +#define clear_coop_task(tsk) (tsk)->cf.coop_t.is_coop_task = 0; + +/* used in fs/proc/proc_misc.c */ +extern struct file_operations proc_coopstat_operations; +void find_nearest_global_deadline(struct task_struct **w_dead); +void find_nearest_global_asap(struct task_struct **w_asap); +void find_nearest_global_deadline_overall(struct task_struct **overall); +void remove_task_from_coop_queue(struct task_struct*,coop_queue*,int); +coop_queue* cpu_cq(int,int); + +int find_coop_period(struct task_struct *next, + struct task_struct **next_coop, + struct timespec* coop_prd); +void choose_next_coop(struct task_struct** target_task, int dom_id); +void test_remove_task_from_coop_bvt_queues(struct task_struct *tsk, + coop_queue *cq); +void find_next_nearest_global_deadlines(struct task_struct *next_earliest_deadline_task,struct task_struct **next_to_next_earliest_deadline_task); +void find_second_nearest_global_deadline_overall(struct task_struct **overall); +void set_normalized_timeval(struct timeval *tv, time_t sec, long usec); +gboolean coop_poll_timeout_gt(heap_key_t a, heap_key_t b); +long insert_task_into_timeout_queue(struct timeval*,coop_queue*, + struct task_struct*,int,unsigned int); +long insert_task_into_sleep_queue(struct timeval*,coop_queue*, + struct task_struct*,int); +void set_tsk_as_temp_coop(struct task_struct*); +void pin_coop_task(struct task_struct*); +void unpin_coop_task(struct task_struct*); +#endif /* _LINUX_COOP_POLL_H */ diff --git a/include/linux/coop_sched_domains.h b/include/linux/coop_sched_domains.h new file mode 100644 index 0000000..2261b43 --- /dev/null +++ b/include/linux/coop_sched_domains.h @@ -0,0 +1,101 @@ +#ifndef _LINUX_COOP_SCHED_DOMAINS_H +#define _LINUX_COOP_SCHED_DOMAINS_H + +/* This header defines the scheduling domains related to the + * cooperative scheduling regime. + * Also defined the sched_param struct. + * In general, we have two scheduling domains - one which + * incorporates all the cooperative real time processes. The + * other incorporates all the best effort processes. All best + * effort processes are scheduled by the fair share scheduler + * based on their individual *virtual time* whereas all the real + * time processes are scheduled as a group and the group as a whole + * has a single virtual time. + * + * This is a joint work of all the following authors: + * Anirban Sinha, anirbans@cs.ubc.ca + * Charles Krasic, krasic@cs.ubc.ca + * Ashvin Goel, ashvin@eecg.toronto.edu + */ + +#include +#include +#include + +enum domain_type +{ + DOM_REALTIME_COOP0, + DOM_REALTIME_COOP1, + DOM_REALTIME_COOP2, + DOM_REALTIME_COOP3, + DOM_REALTIME_COOP4, + DOM_REALTIME_COOP5, + DOM_REALTIME_COOP6, + DOM_REALTIME_COOP7, + DOM_REALTIME_COOP8, + DOM_REALTIME_COOP9, + DOM_REALTIME_COOP10, + DOM_REALTIME_COOP11, + DOM_REALTIME_COOP12, + DOM_REALTIME_COOP13, + DOM_REALTIME_COOP14, + DOM_REALTIME_TEMP, + DOM_BEST_EFFORT, + DOM_MAX_TYPE, + DOM_LEAVE, + INFORM_ABT_SLEEP, +}; + +#define NR_COOP_DOMAINS (DOM_MAX_TYPE - 1) + +/* the following structure keeps track of the scheduling parameters + * in our scheduling regime. 
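As a small usage sketch of the per-domain bookkeeping in this header: the iterator macro is defined at the end of this file, while the per-cpu struct bvtqueue it walks lives in linux/sched_fairshare.h; the function name is illustrative only.

/* Illustrative only: count the tasks in every real-time coop domain on
 * one cpu, using for_each_available_coop_domain() and struct bvt_domain.
 */
static long example_count_coop_tasks(struct bvtqueue *bq)
{
	long total = 0;
	int dom_id;

	for_each_available_coop_domain(dom_id)
		total += bq->bvt_domains[dom_id].num_tasks;

	return total;
}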
+ */ + +struct fairshare_sched_param { + enum domain_type dom_type; + struct timespec bvt_virtual_time; + struct timespec bvt_actual_time; + struct timespec insertion_ts; + heap_node* bheap_ptr; + struct timespec fudge; /* Amount of fudge it can burn at a go before waiting for replenishment */ + int weight; /* Task/ domain weight */ +}; + + +/* every task in our scheduling regime belongs to one + * of the domains specified above + * Each of the domains will have some per cpu data and some + * global data. The current implementation is kept open + * so that we can enhance it in the future. + * This structure is a member of each per cpu runqueue. + */ +struct bvt_domain { + + enum domain_type dom_type; + /* number of kernel tasks in this domain. + * in SMP, this figure is per cpu basis. + */ + long num_tasks; + /* Sum of weights of all the tasks in this domain */ + unsigned long num_weights; + /* This is per domain scheduling parameters currently, this is + * only relevant for real coop domains as best effort tasks + * use their private per task sched parameters + */ + struct fairshare_sched_param dom_sched_param; + + /* ... + other per domain stuff that can be added in future */ +}; + +#define for_each_available_coop_domain(dom_id) \ + for((dom_id)=0; (dom_id) < (NR_COOP_DOMAINS); (dom_id)++) + +#define task_domain(tsk) (tsk)->cf.dom_id + +#endif + + + + + diff --git a/include/linux/glib.h b/include/linux/glib.h new file mode 100644 index 0000000..ac419f5 --- /dev/null +++ b/include/linux/glib.h @@ -0,0 +1,119 @@ +#ifndef _LINUX_GLIB_H +#define _LINUX_GLIB_H + +/* This is a set of workarounds that can be + * used to port glib type definitions into kernel for + * codes already using glib definitions + */ + + +#include +#include +#include + +#if !defined(gboolean) + +#define gboolean bool + +#if !defined(boolean) +#define boolean bool +#endif + + +#endif + +typedef void* gpointer; +typedef int gint; +typedef const void *gconstpointer; +typedef char gchar; +typedef unsigned char guchar; +typedef unsigned int guint; +typedef short gshort; +typedef unsigned short gushort; +typedef long glong; +typedef unsigned long gulong; +typedef signed char gint8; +typedef unsigned char guint8; +typedef signed short gint16; +typedef unsigned short guint16; +typedef signed int gint32; +typedef unsigned int guint32; +typedef float gfloat; +typedef double gdouble; +typedef unsigned int gsize; +typedef signed int gssize; + + +#define g_assert(expr) \ + if(unlikely(!(expr))) { \ + printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \ + #expr,__FILE__,__FUNCTION__,__LINE__); \ + dump_stack(); \ + BUG(); \ + } + + +#define g_free(pointer) kfree(pointer); + +#define g_new0_atomic(struct_type, n_structs) \ + kcalloc((n_structs), sizeof(struct_type), GFP_ATOMIC); + +#define g_new_atomic(n_bytes) \ + kmalloc(n_bytes, GFP_ATOMIC); + +/* g_realloc: + * This function allocates a contigious chunk of kernel memory + * pages of given size. Then copies the contents of old memory to the new location. + * @mem: pointer to old memory location; if null, the call is equivalent to + * a call to kmalloc with memory chunk = n_bytes. It then returns the pointer + * to the newly allocated location + * @old_n_bytes: number of bytes allocated to the old pointer. Is there a way to get + * this information from within the kernel? + * @n_bytes: new number of bytes to allocate. if this is zero, it frees the + * memory location pointed to by mem and returns NULL. 
+ * After all this is done, it frees the old reserved memory chunk + * Returns pointer to the newly allocated memory location + * on all other cases. + * =================================================================================== + * Warning: For reallocating a large chunk of memory, use of this function + * is not adviced. + * + */ + +static inline gpointer g_realloc (gpointer mem, gulong old_n_bytes, gulong n_bytes) +{ + void *new_mem; + + /* if mem is null, allocate new chunk using kmalloc */ + + if (!mem && n_bytes) { + new_mem=g_new_atomic(n_bytes); + return new_mem; + } + + if(mem && (!n_bytes)) { + kfree(mem); + return 0; + } + + if(!mem && !n_bytes) return NULL; + + new_mem = g_new_atomic(n_bytes); + + if (unlikely(!new_mem)) { + printk(KERN_ERR "%s, %s, line:%d : unable to alloc buffer\n",__FILE__, __FUNCTION__, __LINE__); + dump_stack(); + return NULL; + } + + if (old_n_bytes < n_bytes ) + memcpy(new_mem,mem,(size_t) old_n_bytes); + else + memcpy(new_mem,mem,(size_t) n_bytes); + + kfree(mem); + return new_mem; +} + +#endif + diff --git a/include/linux/heap.h b/include/linux/heap.h new file mode 100644 index 0000000..85e4d12 --- /dev/null +++ b/include/linux/heap.h @@ -0,0 +1,63 @@ +#ifndef _LINUX_HEAP_H +#define _LINUX_HEAP_H + + +#include + +typedef gpointer heap_key_t; +typedef gpointer heap_data_t; + +/* Note, it is required that these only be called on a non-empty + * heap. + */ + +#define heap_is_empty(heap) ((heap)->size == 0) +#define heap_min_data(heap) ((heap)->nodes[1]->data) +#define heap_min_key(heap) ((heap)->nodes[1]->key) + +/* The function that compares the key values of the heap nodes + * The actual implementation and the polarity + * is left for the implementor of the function + */ + +typedef gboolean (* heap_key_comp_func)(heap_key_t a, heap_key_t b); +/* structs are private, but we need them for the macros above */ + +struct _heap_node { + heap_key_t key; + heap_data_t data; + gint index; /* backpointer for delete */ +}; + +typedef struct _heap_node heap_node; + +/* The main heap data structure + * Note that we do not use any locks to protect heap. + * The reason is that if needed, we can embed the heap in + * a larger data structure and use locks to protect the + * larger data structure as a whole. 
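A hedged sketch of the heap interface whose operations are declared just below; the comparison callback here is a stand-in for the real coop_poll_timeout_gt()/coop_poll_asap_gt() comparators, and the keys used are illustrative:

/* Illustrative only: a key comparator and basic heap usage. */
static gboolean example_timeval_gt(heap_key_t a, heap_key_t b)
{
	struct timeval *ta = a, *tb = b;

	return timeval_compare(ta, tb) > 0;
}

static void example_heap_usage(struct timeval *tv)
{
	heap_t *h = create_heap(example_timeval_gt, 16);
	heap_node *n;

	if (!h)
		return;

	n = heap_insertt(h, tv, tv);	/* key and payload are both tv here */
	if (n && !heap_is_empty(h))
		heap_delete(h, n);	/* delete via the node backpointer */

	destroy_heap(h);
}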
+ */ + +struct _heap_s { + heap_key_comp_func key_comp; + gint size; + gint capacity; + heap_node **nodes; /* an array of pointers to heap nodes */ +}; + +typedef struct _heap_s heap_t; + +/* All the operations possible on heap */ + +extern heap_t *create_heap (heap_key_comp_func key_gt, int initial_capacity); + +extern int heap_ensure_capacity(heap_t *heap, int capacity); + +extern heap_node *heap_insertt (heap_t *heap, heap_key_t key, heap_data_t data); +extern heap_node *heap_insert_nogrow (heap_t *heap, heap_key_t key, heap_data_t data); + +extern heap_data_t heap_delete_min (heap_t *heap); +extern void heap_delete (heap_t *heap, heap_node *node); +extern void destroy_heap (heap_t *heap); +extern void print_heap (heap_t *heap); +#endif /* _LINUX_HEAP_H */ diff --git a/include/linux/poll.h b/include/linux/poll.h index ef45382..af754b4 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -115,6 +115,7 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset) #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) extern int do_select(int n, fd_set_bits *fds, s64 *timeout); +extern int high_res_select(int n,fd_set_bits *fds, ktime_t *timeout); extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds, s64 *timeout); extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, diff --git a/include/linux/print_debug.h b/include/linux/print_debug.h new file mode 100644 index 0000000..0060ce2 --- /dev/null +++ b/include/linux/print_debug.h @@ -0,0 +1,162 @@ + +/* ************************************************************* + * This file contains declarations and function definitions + * that are extracted from some + * other part of the kernel and hence needs cleanup + * ************************************************************* + */ + +#ifndef __LINUX_PRINT_DEBUG_H +#define __LINUX_PRINT_DEBUG_H + +#include +#include +#include +#include +#include + +extern struct timezone sys_tz; + +/* The following declarations are shamelessly taken from + * the declaration of the structure kernel_timestamp in fs/udf/ecma_167.h + * There has to be a global standard definition of this data type for + * use by several other subsystems in the kernel. + */ + +typedef struct +{ + uint16_t typeAndTimezone; + uint16_t year; + uint8_t month; + uint8_t day; + uint8_t hour; + uint8_t minute; + uint8_t second; + uint8_t milliseconds; + uint8_t microseconds; +} __attribute__ ((packed)) tm; + + +/* The following declarations and macros are shamelessly copied from + * fs/udf/udftime.c. + * In future, there has to be one single definition + * for a global use somewhere in the kernel. + */ + +#define EPOCH_YEAR 1970 +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +#ifndef __isleap +/* Nonzero if YEAR is a leap year (every 4 years, + except every 100th isn't, and every 400th is). */ +#define __isleap(year) \ + ((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0)) +#endif + + +/* How many days come before each month (0-12). */ +static const unsigned short int __mon_yday[2][13] = +{ + /* Normal years. */ + { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 }, + /* Leap years. 
*/ + { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 } +}; + + +static void gmtime(tm *dest, struct timespec *ts) +{ + + long int days, rem, y; + const unsigned short int *ip; + int16_t offset; + + offset = -sys_tz.tz_minuteswest; + + if (!dest) + return; + + dest->typeAndTimezone = 0x1000 | (offset & 0x0FFF); + + ts->tv_sec += offset * 60; + days = ts->tv_sec / SECS_PER_DAY; + rem = ts->tv_sec % SECS_PER_DAY; + dest->hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + dest->minute = rem / 60; + dest->second = rem % 60; + y = EPOCH_YEAR; + +#define DIV(a,b) ((a) / (b) - ((a) % (b) < 0)) +#define LEAPS_THRU_END_OF(y) (DIV (y, 4) - DIV (y, 100) + DIV (y, 400)) + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) + { + long int yg = y + days / 365 - (days % 365 < 0); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= ((yg - y) * 365 + + LEAPS_THRU_END_OF (yg - 1) + - LEAPS_THRU_END_OF (y - 1)); + y = yg; + } + dest->year = y; + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < (long int) ip[y]; --y) + continue; + days -= ip[y]; + dest->month = y + 1; + dest->day = days + 1; + + dest->milliseconds = ts->tv_nsec / 1000000; + dest->microseconds = (ts->tv_nsec / 1000 - dest->milliseconds * 1000); +} + +#define print_macro(flag, fmt,arg...) do { \ + printk(KERN_NOTICE "%s %u " fmt "\n",flag, \ + current->pid, ##arg); \ + }while(0); + +/* print_debug: the function to print debugging messages onto the + * kernel ring buffer . + * note that this function is critical-section and interrupt handler safe. + * as it allocates memory using GFP_ATOMIC flag. + */ + + +static void print_debug(char* id, const char* fmt, ...) +{ + struct timeval tv_now; + struct timespec ts_now; + tm t; + va_list args; + + char *buff = kmalloc(128, GFP_ATOMIC); + + va_start(args, fmt); + vsnprintf(buff, 128, fmt, args); + va_end(args); + + preempt_disable(); + + do_gettimeofday(&tv_now); + + set_normalized_timespec(&ts_now,tv_now.tv_sec, tv_now.tv_usec * NSEC_PER_USEC); + + gmtime(&t, &ts_now); + + print_macro(id, "%02u:%02u:%02u.%06u.%06u %s", + t.hour, + t.minute, + t.second, + t.milliseconds, + t.microseconds, + buff); + + preempt_enable_no_resched(); + + kfree(buff); +} + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f84..5291298 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1,6 +1,10 @@ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H +#if defined(CONFIG_SCHED_COOPREALTIME) +#include +#endif + /* * cloning flags: */ @@ -324,6 +328,7 @@ extern char __sched_text_start[], __sched_text_end[]; extern int in_sched_functions(unsigned long addr); #define MAX_SCHEDULE_TIMEOUT LONG_MAX +extern ktime_t schedule_timeout_hr(ktime_t timeout); extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); @@ -1005,6 +1010,8 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif + + }; struct sched_rt_entity { @@ -1303,6 +1310,11 @@ struct task_struct { int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif +#if defined (CONFIG_SCHED_COOPREALTIME) + /* The fairshare struct to store pvt info for this scheduler*/ + struct coop_fairshare_struct cf; +#endif + }; /* diff --git a/include/linux/sched_fairshare.h b/include/linux/sched_fairshare.h new file mode 100644 index 0000000..b391e2e --- /dev/null +++ b/include/linux/sched_fairshare.h @@ -0,0 +1,202 @@ 
+#ifndef _LINUX_BVT_SCHEDULE_H +#define _LINUX_BVT_SCHEDULE_H + +#include +#include +#include + +#ifdef CONFIG_HIGH_RES_TIMERS +#include +#else +#include +#endif + +#include +#include + +#define BVT_MIN_TIMESLICE 100 /* in usec */ +extern struct timespec ts_bvt_min_timeslice; +extern volatile suseconds_t bvt_sched_granularity; +extern volatile unsigned int bvt_sched_tracing; +extern volatile suseconds_t bvt_sched_unfthreshold; +#define BVT_LOG_BUF_SIZE 1024 +#define CONFIG_BVTPRD 20000 + +/* Enable debug messages */ +//#define DEBUG_FAIRCOOP + +struct debug_entry +{ +int isTargetSet; +struct timespec target_virtual_time; +struct timespec curr_timeslice; +struct timespec curr_time; +pid_t pid; +}; + + +/* Fudge constants */ +#define COOP_DEAD_FUDGE_DEFAULT 20000000 /* in nsecs, set at 20 ms */ +#define COOP_DEAD_FUDGE_INC 100000 /* in nsecs, 100 microsecs */ +#define COOP_DEAD_FUDGE_MAX 500000000 /* in nsecs, 500 ms */ +/* Slack constants */ +#define COOP_DEAD_SLACK 1000000 /* in nsecs , set 1 ms */ + +#define COOP_EMERGENCY_TIMESLICE 500000 /*in nsecs, 500 microsecs */ +#define COOP_IO_LAT 100000 /*in nsecs, set 250 usecs */ + +/* This is the main per cpu bvt queue structure + * This is a member of struct runqueue + * needed by sched.c + * Synchronization issues: The spinlock that protects the + * runqueue also protects the following data structure. + * Any reference modification to bvtqueue must be done within + * the corresponding runqueue spinlocks held. + * There are two pairs of inline functions defined in linux/sched.h + * that locks the runqueue and returns a reference to the corresponding bvtqueue. + * They are: + * 1. inline bvtqueue* get_task_bq_locked(struct task_struct *tsk, + * unsigned long *flags); + * 2. inline void put_task_bq_locked(struct bvt_queue *bq, unsigned long flags); + * 3. inline bvtqueue* get_cpu_bq_locked(int cpu, unsigned long *flags); + * 4. inline void put_cpu_bq_locked(int cpu, unsigned long flags); + * 5. inline void get_task_bq(struct task_struct *p); + * All accesses must be done through these functions. + */ +struct bvtqueue +{ + heap_t* bvt_heap; + heap_t* global_coop_deadline_heap; /* Per cpu global heap for storing all the deadline events */ + heap_t* global_coop_sleep_heap; /* Per cpu global heap for storing deadlines for all sleeping coop tasks*/ + /* The pointer to the running bvt task */ + struct task_struct *running_bvt_task; + + /* the current scheduling timeslice + * in timespec + */ + struct timespec curr_bvt_period; + +#ifdef CONFIG_HIGH_RES_TIMERS + /* The bvt high resolution timer */ + struct hrtimer bvt_timer; +#else + struct timer_list bvt_timer; +#endif + + /* Per cpu debug log */ + struct debug_entry bvt_debug_buffer[BVT_LOG_BUF_SIZE]; + unsigned int bvtLogCount; + + /* this is bad design: we only have global coop domains + * The best effort domain parameters are per task basis. + * yet, we have one slot for bvt domain. 
+ */ + struct bvt_domain bvt_domains[DOM_MAX_TYPE]; + + /* the coop queues also are now in the runqueue + * instead of being a per CPU variable themselves + */ + coop_queue cq[NR_COOP_DOMAINS]; + + /* This is used to keep track of the last coop deadline and + * calculate the coop period + */ + struct timeval last_coop_deadline; + struct timespec max_virtual_time; + int isTargetSet; + bool fudged_flag; + struct timespec ts_slack; /* Slack given to coop tasks before they get policed*/ + struct timespec ts_io_lat; /* Maximum tolerable io latency for coop tasks*/ + struct timespec ts_emergency_timeslice; /* Timeslice for executing in emergency mode */ + struct timespec ts_now; /* Our scheduler's view of monotonic time */ + + gboolean rendezvous; + enum COOP_POLL_STATUS reason; + + /* Stat Variables */ + unsigned long adj; + unsigned long noadj; + unsigned long fudge; + unsigned long nofudge; + unsigned int count; + struct timespec tot_time; /* Total time = user time + system time */ + unsigned long nr_running; + unsigned long sum_weights; + + int bvt_timer_active; + #ifdef CONFIG_SMP + struct list_head bvt_list; /* Helps in iterating through all of them*/ + struct list_head *list_pos; /* Current position for the iterator */ + #endif + +}; + + +/* bvt_struct is the bvt specific declarations that + * must be incorporated into the task_struct + * used by include/linux/sched.h + */ +struct bvt_struct +{ + struct timespec bvt_timeslice_start; + struct timespec bvt_timeslice_end; + + struct fairshare_sched_param private_sched_param; + + struct task_struct *me; + int about_to_sleep; + int pseudo_sleep; + #ifdef CONFIG_SMP + struct list_head bvt_procs; + #endif + +}; + +/* Forward declaration*/ +extern const struct sched_class faircoop_sched_class; + +/* This macro returns true if a task is + * under our scheduling regime.*/ +#define is_bvt(p) (p->sched_class == &faircoop_sched_class) + +/* global bvt init function; called from init/main.c:start_kernel()*/ +void bvt_global_init(int); + +/* bvt proc initialization function called from + * kernel/fork.c:copy_process() + */ +void bvt_proc_init(struct task_struct *p); + +/* bvt proc destroy, called from kernel/exit.c: do_exit() + */ +void detach_coop_fairshare_sched(struct task_struct* tsk); + +extern int is_best_effort(struct task_struct* tsk); + +extern struct file_operations proc_bvtstat_operations; + +void insert_task_into_bvt_queue(struct bvtqueue *bq, + struct task_struct *t); +void __do_set_bvt(struct task_struct *p, int need_lock); +#if 0 +void remove_task_from_bvt_queue(struct bvtqueue *bq, + struct task_struct *p); +#endif +void do_policing(struct bvtqueue *bq, + struct task_struct *tsk); +void demote_task(struct bvtqueue *bq, + struct task_struct *tsk, + enum COOP_POLL_STATUS reason); +void init_bvt_domain(struct bvtqueue*,struct task_struct*); + +void tv_fairshare_now_adjusted(struct timeval*); +void fairshare_now(struct timespec*); + +struct bvtqueue* get_task_bq_locked(struct task_struct *tsk, unsigned long *flags); +void put_task_bq_locked(struct bvtqueue *bq, unsigned long *flags); +struct bvtqueue* get_cpu_bq_locked(int cpu, unsigned long *flags); +void put_cpu_bq_locked(int cpu, unsigned long *flags); +struct bvtqueue* cpu_bq(int cpu); +struct bvtqueue* get_task_bq(struct task_struct *tsk); +#endif /* _LINUX_BVT_SCHEDULE_H */ + diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0522f36..5cbfe1f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,6 +54,9 @@ struct compat_stat; struct 
compat_timeval; struct robust_list_head; struct getcpu_cache; +#if defined(CONFIG_SCHED_COOPREALTIME) +struct coop_syscall_t; +#endif #include #include @@ -66,6 +69,9 @@ struct getcpu_cache; #include asmlinkage long sys_time(time_t __user *tloc); +#if defined(CONFIG_SCHED_COOPREALTIME) +asmlinkage long sys_coop_poll(struct coop_syscall_t __user *param); +#endif asmlinkage long sys_stime(time_t __user *tptr); asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 24141b4..aeff271 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -163,6 +163,9 @@ enum KERN_MAX_LOCK_DEPTH=74, KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ +#if defined(CONFIG_SCHED_COOPREALTIME) + KERN_SCHED_TRACING=77, /*int: to control faircoop sched tracing */ +#endif }; diff --git a/init/main.c b/init/main.c index f7fb200..a7f3fba 100644 --- a/init/main.c +++ b/init/main.c @@ -534,6 +534,7 @@ void __init __weak thread_info_cache_init(void) asmlinkage void __init start_kernel(void) { char * command_line; + unsigned int cpu; extern struct kernel_param __start___param[], __stop___param[]; smp_setup_processor_id(); @@ -675,9 +676,16 @@ asmlinkage void __init start_kernel(void) cpuset_init(); taskstats_init_early(); delayacct_init(); + /* If smp is configured,our scheduler heaps will be initialized in + * smp_init*/ + #if defined(CONFIG_SCHED_COOPREALTIME) + for_each_possible_cpu(cpu) { + bvt_global_init(cpu); + coop_init(cpu); + } + #endif check_bugs(); - acpi_early_init(); /* before LAPIC and SMP init */ /* Do the rest non-__init'ed, we're now alive */ diff --git a/kernel/coop_poll.c b/kernel/coop_poll.c new file mode 100644 index 0000000..7982ae5 --- /dev/null +++ b/kernel/coop_poll.c @@ -0,0 +1,1389 @@ + /* The coop_poll() system call interface for cooparative + * soft real time user processes. + * This file contains the coop_poll code that implements policing + * as a part of the combined coop_poll, fairshare heuristics. + * The system call number is 285 + * The original entry in #285 was sys_ni_syscall which is + * a placeholder for non-implemented system calls. + * + * Current Author:Mayukh Saubhasik mayukh@cs.ubc.ca + * Original Code by Anirban Sinha, anirbans@cs.ubc.ca + * Other contributors: Charles Krasic, Ashvin Goel + * krasic@cs.ubc.ca, ashvin@eecg.toronto.edu + * + * May 28, 2007: New non-sleeping version + * introduced. It is hoped that this version of coop_poll() will be + * more conducive towards the policing mechanism through fairshare scheduling. + * + * TODO: integrate coop_poll() with kernel side of epoll(). + * + * A Note about timekeeping: + * kernel uses all different forms and notions about what monotonic "now" is. + * Please check kernel/bvt_schedule.c: fairshare_now() function + * for explanations + * However, our user level coop processes uses do_gettimeofday() + * system call to register their "now" value. This is actually the wall + * clock time. Thus, all coop related + * time comparisons (and while reporting time to userspace) + * uses do_gettimeofday() call to get the value of + * "now". Unfortunately, this value is not the same as the monotonic clock + * value (see wall_to_monotonic offset timespec) that is used by the + * timer code (highres timers or otherwise). + * Hence, we are forced to use two different notions of time, + * do_gettimeofday() and fairshare_now(). 
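To make the two-clock note above concrete, a hedged sketch of how a wall-clock deadline supplied by userspace is compared against the scheduler's adjusted notion of "now"; this mirrors the comparisons done later in find_coop_period() and choose_next_coop(), and the helper name is illustrative:

/* Illustrative only: user-supplied deadlines live in the
 * do_gettimeofday() (wall clock) domain, so they are compared against
 * tv_fairshare_now_adjusted(), not against fairshare_now() directly.
 */
static int example_deadline_expired(struct timeval *user_deadline)
{
	struct timeval tv_now;

	tv_fairshare_now_adjusted(&tv_now);
	return timeval_compare(user_deadline, &tv_now) < 0;
}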
The former is used to report + * and compare time values in the coop heap. The latter is used by the + * fairshare timer scheduling code. + */ + +#if defined(CONFIG_SCHED_COOPREALTIME) + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static unsigned long total_deadlines = 0; +static unsigned long earlier_deadlines = 0; + +#define __FUNC__ __func__ +#define HEAP_INIT_CAP 1024 /* initial capacity for the coop poll heaps */ + +gboolean coop_poll_asap_gt(heap_key_t a, heap_key_t b); +static inline void find_nearest_deadline(coop_queue *cq, struct task_struct **w_dead); + +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_SMP) +static __u32 curr_cpu; +#endif + +#if defined(CONFIG_SMP) +/* Has to be called with runQ lock held */ +void pin_coop_task(struct task_struct *tsk) +{ + cpumask_t mask; + cpus_clear(mask); + cpus_clear(tsk->cf.coop_t.prev_mask); + cpu_set(task_cpu(tsk), mask); + tsk->cf.coop_t.prev_mask = tsk->cpus_allowed; + tsk->cpus_allowed = mask; + tsk->rt.nr_cpus_allowed = cpus_weight(mask); +} + +/* Has to be called with runQ lock held */ +void unpin_coop_task(struct task_struct *tsk) +{ + tsk->cpus_allowed = tsk->cf.coop_t.prev_mask; + tsk->rt.nr_cpus_allowed = cpus_weight(tsk->cf.coop_t.prev_mask); +} +#endif + +inline coop_queue* cpu_cq(int cpu, int dom_id) +{ + struct bvtqueue *bq = cpu_bq(cpu); + coop_queue *cq = &(bq->cq[dom_id]); + return cq; +} + +/* task_cq_lock - lock a given coop_queue and return + * a reference to the structure.. + * Locks the corresponding runqueue of which this coop_queue + * is a member. + */ +static inline coop_queue *task_cq_lock(struct task_struct *tsk, + int dom_id, + unsigned long *flags, + struct bvtqueue **bq) +{ + coop_queue *cq; + *bq = get_task_bq_locked(tsk,flags); + cq = &((*bq)->cq[dom_id]); + return cq; +} + +/* cq_unlock: + * unlock the coop queue and enable preemption + * where applicable + */ +static inline void cq_unlock(struct bvtqueue *bq, + unsigned long *flags) +{ + put_task_bq_locked(bq, flags); +} +#define COOP_DEBUG "[COOP_POLL]" + +#define coop_print_debug(fmt,arg...) do { \ + print_debug(COOP_DEBUG,fmt,##arg); \ + }while(0); + +#if 0 +inline int is_coop_realtime(struct task_struct* p) { + + if (!p->cf.bvt_dom) return 0; + + return ((p->cf.dom_id >= DOM_REALTIME_COOP0) && + (p->cf.dom_id <= DOM_REALTIME_COOP14)); +} +#endif + +static int show_coopstat(struct seq_file *seq, void *v) +{ + int cpu, dom_id; + coop_queue *cq; + struct task_struct *node; + + seq_printf(seq,"timestamp %lu\n",jiffies); + seq_printf(seq,"total deadlines\t%lu\n",total_deadlines); + seq_printf(seq,"earlier deadlines\t%lu\n",earlier_deadlines); + seq_printf(seq,"cpu#\tdom_id\tcoop_poll#\tyields#\trendezvous#\twaker/wakee#\t\tio wakeups#\n"); + node = NULL; + /* XXX Can you call this without getting the runQ lock ?? 
The coop queues might be in an inconsistent state !!*/ + //find_nearest_global_deadline(&node); + //if(node != NULL) + // seq_printf(seq,"Global earliest deadline %d, %u,%u\n",node->pid,node->cf.coop_t.dead_p.t_deadline.tv_sec,node->cf.coop_t.dead_p.t_deadline.tv_usec); + for_each_online_cpu(cpu) { + for_each_available_coop_domain(dom_id) { + cq = cpu_cq(cpu,dom_id); + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d:\t%d\t%lu\t\t%lu\t\t" + "%lu/%lu/%lu\t\t" + "%lu/%lu\t\t" + "%lu/%lu/%lu/%lu/%lu/%lu/%lu/%lu", + cpu, dom_id,cq->num_coop_calls, cq->num_yields, + cq->num_rendezvous0, cq->num_rendezvous1, cq->num_rendezvous2, + cq->num_rendezvous_waker, cq->num_rendezvous_wakee, + cq->num_io_wakeups[0], + cq->num_io_wakeups[1], + cq->num_io_wakeups[2], + cq->num_io_wakeups[3], + cq->num_io_wakeups[4], + cq->num_io_wakeups[5], + cq->num_io_wakeups[6], + cq->num_io_wakeups[7] + ); + seq_printf(seq, "\n"); + node = NULL; + /* XXX Can you call this without getting the runQ lock ?? The coop queues might be in an inconsistent state !!*/ + //find_nearest_deadline(cq, &node); + //if(node != NULL) + // seq_printf("Domain id = %d, pid = %d, Deadline = %u.%u\n",dom_id,node->pid,node->cf.coop_t.dead_p.t_deadline.tv_sec,node->cf.coop_t.dead_p.t_deadline.tv_usec); + } + } + + return 0; +} + +static int coopstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + + char *buf; + + struct seq_file *m; + int res; + + /* don't ask for more than the kmalloc() max size, currently 128 KB */ + if (size > 128 * 1024) + size = 128 * 1024; + + buf = kmalloc(size, GFP_KERNEL); + + if (!buf) + return -ENOMEM; + res = single_open(file, show_coopstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +struct file_operations proc_coopstat_operations = { + .open = coopstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +# define coopstat_inc(cq, field) do { (cq)->field++; } while (0) +# define coopstat_add(cq, field, amt) do { (cq)->field += (amt); } while (0) + +static void set_tsk_asap_info(struct task_struct *p, + struct timeval *t_asap_deadline, + int asap_prio, + struct timespec rank) +{ + p->cf.coop_t.asap_p.priority = asap_prio; + p->cf.coop_t.asap_p.t_asap = *t_asap_deadline; + p->cf.coop_t.asap_p.rank = rank; +} + +static void set_tsk_deadline_info(struct task_struct *p, + struct timeval *t_deadline, + struct timespec rank) +{ + p->cf.coop_t.dead_p.rank = rank; + p->cf.coop_t.dead_p.t_deadline = *t_deadline; +} + +/* Lock: Expects run Q lock to be held */ +static long __insert_into_asap_heap(coop_queue *cq,struct task_struct *p) +{ + p->cf.coop_t.coop_asap_heap_node = heap_insertt(cq->heap_asap, p, p); + + if(unlikely (!p->cf.coop_t.coop_asap_heap_node)) { + coop_print_debug("%s, %s:%d: Unable to allocate new memory for the new asap heap node!", + __FILE__, __FUNC__, __LINE__); + return -EFAULT; + } else return 0; +} /* __insert_into_asap_heap */ + +/* Expects run Q lock to be held */ +static long __insert_into_timeout_heap(coop_queue *cq,struct task_struct *p) +{ + + p->cf.coop_t.coop_deadline_heap_node = heap_insertt(cq->heap_deadline, p, p); + + /* Insert onto the global deadline heap */ + p->cf.coop_t.coop_deadline_global_heap_node = heap_insertt((get_task_bq(p)->global_coop_deadline_heap),p,p); + + if(unlikely ( (!p->cf.coop_t.coop_deadline_heap_node) || (!p->cf.coop_t.coop_deadline_global_heap_node))) { + coop_print_debug("%s, 
%s:%d: Unable to allocate new memory for the new deadline heap node!", + __FILE__, __FUNC__, __LINE__); + return -EFAULT; + } else return 0; + +} /* __insert_into_timeout_heap */ + +static long __insert_into_sleep_heap(coop_queue *cq,struct task_struct *p) +{ + p->cf.coop_t.coop_sleep_deadline_node = heap_insertt(cq->heap_coop_sleep, p, p); + /* Insert onto the global sleep heap */ + p->cf.coop_t.coop_sleep_deadline_global_heap_node = heap_insertt((get_task_bq(p)->global_coop_sleep_heap),p,p); + + if(unlikely ( (!p->cf.coop_t.coop_sleep_deadline_node) || (!p->cf.coop_t.coop_sleep_deadline_global_heap_node) )) { + coop_print_debug("%s, %s:%d: Unable to allocate new memory for the new deadline_sleep heap node!", + + __FILE__, __FUNC__, __LINE__); + return -EFAULT; + } else return 0; + +} /* __insert_into_coop_sleep_heap */ + +/* insert_task_into_asap_queue - insert any task into + * specified cpu coop asap queue. + * @t_asap_deadline: the deadline of the asap event + * @asap_prio: the priority of the asap event + * cq: the specified coop queue of any processor + * p: the specified task struct of the process to insert. + * Note: coop queue lock must be acquired before calling this function + */ + +static long insert_task_into_asap_queue(struct timeval *t_asap_deadline, + int asap_prio, coop_queue *cq, struct task_struct *p) +{ + struct timespec rank; + + g_assert(p); + g_assert(t_asap_deadline); + getnstimeofday(&rank); + + set_tsk_asap_info(p, t_asap_deadline, asap_prio,rank); + + return __insert_into_asap_heap(cq,p); +} +/* insert_task_into_asap_queue */ + +/* insert any task into any specified coop timeout queue + * @t_deadline: the deadline of the timeout event + * @cq: the coop queue of any processor + * @p: the pointer to the task struct of any process + * This function must be called with coop_queue lock held + */ +long insert_task_into_timeout_queue(struct timeval *t_deadline, + coop_queue *cq, struct task_struct *p,int fil_dead, unsigned int debug_reason) +{ + struct timespec rank; + + g_assert(p); + + if(fil_dead) { + g_assert(t_deadline); + getnstimeofday(&rank); + set_tsk_deadline_info(p, t_deadline,rank); + } + /* For debugging context switching and coop_poll + select */ + if(debug_reason < 8) { + cq->num_io_wakeups[debug_reason]++; + } /* if */ + return __insert_into_timeout_heap(cq,p); +} +/* insert_task_into_timeout_queue*/ + +long insert_task_into_sleep_queue(struct timeval *t_deadline, + coop_queue *cq, struct task_struct *p,int fil_dead) +{ + struct timespec rank; + + g_assert(p); + + if(fil_dead) { + g_assert(t_deadline); + getnstimeofday(&rank); + set_tsk_deadline_info(p, t_deadline,rank); + } + return __insert_into_sleep_heap(cq,p); +} +/* insert_task_into_timeout_queue*/ + +/* get_user_arg: + * copies the data from user space to the kernel space + * Note: copy_from_user actually reruns the number of bytes that was + * unable to be copied and 0 otherwise + * hence, in case it retuns a non-zero value, we simply return error + * @ki_param: input deadline parameters in kernel space + * @ui_param: input deadline parameters in user space + */ + +static int get_user_arg(struct coop_param_t *ki_param, + struct coop_param_t __user *ui_param) +{ + memset(ki_param, 0,sizeof(struct coop_param_t) ); + return copy_from_user(ki_param, ui_param, sizeof(struct coop_param_t))? 
-EFAULT:0; +} +/* get_user_arg */ + + +/* set_tsk_as_coop: + * marks a task as a coop task + * @tsk: the task_struct for the task + */ + +static void set_tsk_as_coop(struct task_struct* tsk, int dom_id) +{ + struct bvtqueue *bq; + int was_in_heap = 0; + /* multiple calls to this function should not mess with our + * accounting logic + */ + + if (is_coop_realtime(tsk)) { + return; + } + +#if defined(CONFIG_SMP) + /* Task migration is disabled for coop tasks.The notion of time is per cpu + * migrating coop tasks would screw up its dispatch timing */ + pin_coop_task(tsk); +#endif + + bq = cpu_bq(task_cpu(tsk)); + + /* remove the task from the bvt heap */ + was_in_heap = NULL != tsk->cf.task_sched_param->bheap_ptr; + if (was_in_heap) { + remove_task_from_bvt_queue(bq,tsk); + } + + set_coop_task(tsk); + + /* Now set the task's domain as the real time domain */ + tsk->cf.bvt_dom = &(bq->bvt_domains[dom_id]); + tsk->cf.dom_id = dom_id; + + /* the task virtual time is now the same as the domain virtual + * time + */ + tsk->cf.task_sched_param = &bq->bvt_domains[dom_id].dom_sched_param; + + /* increment the number of tasks in the real time domain if + * the task is runnnable. + */ + if (was_in_heap) { + bq->bvt_domains[DOM_BEST_EFFORT].num_tasks--; + bq->bvt_domains[DOM_BEST_EFFORT].num_weights -= tsk->se.load.weight; + bq->bvt_domains[dom_id].num_tasks++; + bq->bvt_domains[dom_id].num_weights += tsk->se.load.weight; + + /* if there is one coop guy running and this guy is + * runnable, borrow and insert the real coop node into + * the heap + */ + if (bq->bvt_domains[dom_id].num_tasks == 1) + { + /* Set domain's vt to be equal to this guy's */ + bq->bvt_domains[dom_id].dom_sched_param.bvt_virtual_time = tsk->cf.bvt_t.private_sched_param.bvt_virtual_time; + insert_task_into_bvt_queue(bq,tsk); + } /* if */ + } /* if */ +} +/* set_tsk_as_coop */ + +void set_tsk_as_temp_coop(struct task_struct* tsk) +{ + struct bvtqueue *bq; + + /* multiple calls to this function should not mess with our + * accounting logic + */ + if (is_coop_realtime(tsk)) { + return; + } + +#if defined(CONFIG_SMP) + /* Task migration is disabled for coop tasks.The notion of time is per cpu + * migrating coop tasks would screw up its dispatch timing */ + pin_coop_task(tsk); +#endif + + bq = cpu_bq(task_cpu(tsk)); + set_coop_task(tsk); + + /* Now set the task's domain as the real time domain */ + tsk->cf.bvt_dom = &(bq->bvt_domains[DOM_REALTIME_TEMP]); + tsk->cf.dom_id = DOM_REALTIME_TEMP; + + /* increment the number of tasks in the real time domain if + * the task is runnnable. + */ + if (tsk->cf.task_sched_param->bheap_ptr) { + bq->bvt_domains[DOM_BEST_EFFORT].num_tasks--; + bq->bvt_domains[DOM_BEST_EFFORT].num_weights -= tsk->se.load.weight; + bq->bvt_domains[DOM_REALTIME_TEMP].num_tasks++; + bq->bvt_domains[DOM_REALTIME_TEMP].num_weights += tsk->se.load.weight; + } + + } /* if */ + +/* remove_task_from_coop_queue: remove task from one or both the coop + * queues based on the argument flag. + * @tsk: The task to remove. + * @cq:The queue to remove from + * @which_queue: The flag, + * 0=> both timeout and asap queues, + * 1=> timeout queue, + * 2=> asap queue, + * 3=> coop sleep queue, + * -1=> do nothing. 
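For illustration, the which_queue flag in use with the function that follows; the cleanup context and helper name are assumptions, and callers are expected to hold the owning runqueue lock (cf. task_cq_lock() above):

/* Illustrative only: pull a task out of its per-domain heaps. */
static void example_drop_coop_events(struct task_struct *tsk, coop_queue *cq)
{
	remove_task_from_coop_queue(tsk, cq, 0);	/* 0 => timeout + asap heaps */
	remove_task_from_coop_queue(tsk, cq, 3);	/* 3 => coop sleep heap */
}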
+ */ +void remove_task_from_coop_queue(struct task_struct *tsk, + coop_queue *cq, + int which_queue) +{ + + if (which_queue <0) return; + + if ((which_queue == 1) || (which_queue == 0)) + { + if (tsk->cf.coop_t.coop_deadline_heap_node) { + heap_delete(cq->heap_deadline, + tsk->cf.coop_t.coop_deadline_heap_node); + tsk->cf.coop_t.coop_deadline_heap_node = NULL; + + g_assert(tsk->cf.coop_t.coop_deadline_global_heap_node); + heap_delete(get_task_bq(tsk)->global_coop_deadline_heap, + tsk->cf.coop_t.coop_deadline_global_heap_node); + tsk->cf.coop_t.coop_deadline_global_heap_node = NULL; + } + } + if ((which_queue == 2) || (which_queue == 0)) + { + if (tsk->cf.coop_t.coop_asap_heap_node) { + heap_delete(cq->heap_asap, + tsk->cf.coop_t.coop_asap_heap_node); + tsk->cf.coop_t.coop_asap_heap_node = NULL; + } + } + if (which_queue == 3) + { + if (tsk->cf.coop_t.coop_sleep_deadline_node) { + heap_delete(cq->heap_coop_sleep, + tsk->cf.coop_t.coop_sleep_deadline_node); + tsk->cf.coop_t.coop_sleep_deadline_node = NULL; + + g_assert(tsk->cf.coop_t.coop_sleep_deadline_global_heap_node); + heap_delete(get_task_bq(tsk)->global_coop_sleep_heap, + tsk->cf.coop_t.coop_sleep_deadline_global_heap_node); + tsk->cf.coop_t.coop_sleep_deadline_global_heap_node = NULL; + } /* if */ + } /* if */ + + +} +/* remove_task_from_coop_queue */ + +static inline void find_nearest_deadline(coop_queue *cq, struct task_struct **w_dead) +{ + if (!heap_is_empty(cq->heap_deadline)){ + *w_dead = (struct task_struct*) heap_min_data(cq->heap_deadline); + } +} /* find_nearest_deadline */ + +static inline void find_nearest_asap(coop_queue *cq, struct task_struct **w_asap) +{ + if (!heap_is_empty(cq->heap_asap)){ + *w_asap = (struct task_struct*) heap_min_data(cq->heap_asap); + } +} /* find_nearest_asap */ + +/* Return the nearest runnable global deadline task */ +void find_nearest_global_deadline(struct task_struct **overall) +{ + struct bvtqueue* bq = cpu_bq(smp_processor_id()); + if (!heap_is_empty(bq->global_coop_deadline_heap)) { + *overall = (struct task_struct*) heap_min_data(bq->global_coop_deadline_heap); + } + else *overall = NULL; +} + +void find_nearest_global_sleep(struct task_struct **overall) +{ + struct bvtqueue* bq = cpu_bq(smp_processor_id()); + if (!heap_is_empty(bq->global_coop_sleep_heap)) { + *overall = (struct task_struct*) heap_min_data(bq->global_coop_sleep_heap); + } + else *overall = NULL; + +} + + /* Return: Task with earliest global deadline (considering both runnable and sleeping tasks) or + * Null if heap is empty */ +void find_nearest_global_deadline_overall(struct task_struct **overall) +{ + struct task_struct *temp1 = NULL; + struct task_struct *temp2 = NULL; + + find_nearest_global_deadline(&temp1); + find_nearest_global_sleep(&temp2); + + if (temp1 && temp2) { + if (timeval_compare(&temp1->cf.coop_t.dead_p.t_deadline, + &temp2->cf.coop_t.dead_p.t_deadline) < 0) + *overall = temp1; + else + *overall = temp2; + } + else if (temp1) + *overall = temp1; + else if(temp2) + *overall = temp2; + else + *overall = NULL; +} + +void find_second_nearest_global_deadline_overall(struct task_struct **overall) +{ + struct bvtqueue* bq = cpu_bq(smp_processor_id()); + struct task_struct *temp1; + + if (!heap_is_empty(bq->global_coop_deadline_heap)) { + // Remove the top element of the global deadline heap + temp1 = (struct task_struct*)heap_delete_min(bq->global_coop_deadline_heap); + find_nearest_global_deadline_overall(overall); + // Re-insert top element into the heap + 
heap_insertt(bq->global_coop_deadline_heap,temp1,temp1); + } + else + find_nearest_global_sleep(overall); + +} + +static inline void find_nearest_coop_sleep_deadline(coop_queue *cq, + struct task_struct **w_sleep) +{ + if (!heap_is_empty(cq->heap_coop_sleep)){ + /* yes, there are sleeping tasks, take the + * earliest deadline of the sleeping tasks */ + *w_sleep = (struct task_struct*) heap_min_data(cq->heap_coop_sleep); + } +} +/* find_nearest_coop_sleep_deadline */ + + +/* find_nearest_deadline_asap: + * finds the nearest deadlines and asaps from the coop heap + * of the current CPU. + * Note: does not lock the heaps, use carefully + */ + +static void find_nearest_deadline_asap(coop_queue *cq, + struct task_struct **w_dead, + struct task_struct **w_asap) +{ + find_nearest_asap(cq, w_asap); + find_nearest_deadline(cq, w_dead); +} /* find_nearest_deadline_asap */ + + +/* find_coop_period: + * This function finds the coop period for the "next" + * task. + * @next: The next bvt task that we want to run. + * @next_coop: The next most important real coop task in + * the heap. Output parameter. + * @coop_prd: The coop period determined. Output parameter. + * Returns 1 if the coop period is set. Returns 0 otherwise. + */ +int find_coop_period(struct task_struct *next, + struct task_struct **next_coop, + struct timespec *coop_prd) +{ + struct timeval tv_now; + struct task_struct *next_earliest_deadline_task = NULL; + struct bvtqueue *bq = cpu_bq(smp_processor_id()); + + + + /* Find the nearest global deadline, considering both + * runnable and sleeping tasks */ + find_nearest_global_deadline_overall(&next_earliest_deadline_task); + + if(next_earliest_deadline_task) { + tv_fairshare_now_adjusted(&tv_now); + + if ( (timeval_compare(&(next_earliest_deadline_task->cf.coop_t.dead_p.t_deadline),&(tv_now)) <0)) { + set_normalized_timespec(coop_prd, 0,0); + } + else { + set_normalized_timespec(coop_prd, + next_earliest_deadline_task->cf.coop_t.dead_p.t_deadline.tv_sec - tv_now.tv_sec, + (next_earliest_deadline_task->cf.coop_t.dead_p.t_deadline.tv_usec - tv_now.tv_usec)*NSEC_PER_USEC); + + } + return 1; + } + else return 0; + +} +/* find_coop_period */ + +/* choose_next_coop: + * This function chooses the next coop process to run + * from the coop domain id specified as the argument. + * If there is an immediate task whose deadline has + * expired, it returns 0. However, if there is a task + * whose deadline is in the future and there are no asaps + * in the asap queue, it returns the time left in a timespec + * val. The function does not sleep! + * @w_dead: the deadline node at the top of the heap + * @w_asap: the asap node at the top of the heap. + * @target_task: output parameter specifying the next coop task to + * run (if any). + */ +void choose_next_coop(struct task_struct** target_task, int dom_id) +{ + struct timeval tv_now; + struct task_struct *w_dead = NULL; + struct task_struct *w_asap = NULL; + + coop_queue *cq = cpu_cq(smp_processor_id(), dom_id); + + //do_gettimeofday(&tv_now); + tv_fairshare_now_adjusted(&tv_now); + + find_nearest_deadline_asap(cq, &w_dead,&w_asap); + + /* if we have both timeout and asaps in the heap we check if the deadline of + * the timeout event has expired. If yes, we run the timeout event. If no, + * we run the asap event. The same logic is used in the main event loop + * while choosing which event to dispatch next. 
+ */
+
+	if (w_dead && w_asap) {
+
+		/* both asap and timeout events found */
+
+		if (timeval_compare(&w_dead->cf.coop_t.dead_p.t_deadline, &tv_now) < 0) {
+
+			/* the deadline has expired, run the deadline event */
+			*target_task = w_dead;
+
+		} else {
+			/* the deadline has not expired, run the asap event */
+			*target_task = w_asap;
+		}
+	} else if (w_dead) {
+
+		/* only a timeout event in the heap */
+		*target_task = w_dead;
+
+	} else if (w_asap) {
+
+		/* only an asap event in the heap, just run the asap event */
+		*target_task = w_asap;
+
+	} else {
+		/* both heaps are empty */
+		*target_task = NULL;
+	}
+
+	return;
+}
+/* choose_next_coop */
+
+/**
+ * set_normalized_timeval - set timeval sec and usec parts and normalize
+ *
+ * @tv: pointer to the timeval variable to be set
+ * @sec: seconds to set
+ * @usec: microseconds to set
+ *
+ * Set the seconds and microseconds fields of a timeval variable and
+ * normalize to the timeval storage format.
+ *
+ * Note: The tv_usec part is always in the range of
+ * 0 <= tv_usec < USEC_PER_SEC
+ * For negative values only the tv_sec field is negative!
+ * This implementation is adapted from set_normalized_timespec() in
+ * kernel/time.c.
+ */
+void set_normalized_timeval(struct timeval *tv,
+			    time_t sec, long usec)
+{
+	while (usec >= USEC_PER_SEC) {
+		usec -= USEC_PER_SEC;
+		++sec;
+	}
+	while (usec < 0) {
+		usec += USEC_PER_SEC;
+		--sec;
+	}
+	tv->tv_sec = sec;
+	tv->tv_usec = usec;
+}
+
+/* sys_coop_poll - The coop_poll system call interface
+ * @i_param: input deadline parameters
+ * @o_param: output deadline parameters
+ *
+ * This version of coop_poll does not force a process to
+ * sleep on a completion variable/waitqueue. Instead, it just
+ * calls schedule() to yield the processor to another coop task.
+ * This system call is thus very similar to sched_yield(), which
+ * also yields the processor for a very small amount of time.
+ * In a combined fairshare & coop heuristic, this is expected to
+ * result in much simpler code and is conceptually more consistent
+ * with what we want it to do.
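+ *
+ * A minimal userland sketch (illustrative only; the structures are the
+ * ones handled below, the wrapper and variable names are hypothetical):
+ *
+ *	struct coop_param_t in = { 0 }, out;
+ *	enum COOP_POLL_STATUS status;
+ *	struct coop_syscall_t args = {
+ *		.dom_id  = 0,		// coop domain to participate in
+ *		.n       = 0,		// no file descriptors to monitor
+ *		.fds     = NULL,
+ *		.i_param = &in,		// our next deadline / asap
+ *		.o_param = &out,	// earliest peer deadline / asap
+ *		.status  = &status,
+ *	};
+ *	in.t_deadline = next_timer_expiry;	// struct timeval, wall time
+ *	coop_poll(&args);			// sleep or yield until then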
+ */ +asmlinkage long sys_coop_poll(struct coop_syscall_t __user *param +#if 0 +struct coop_param_t __user *i_param, + struct coop_param_t __user *o_param, + int dom_id, + int n, + struct coop_fds_t __user *fds +#endif + ) +{ + + struct coop_param_t ki_param; + struct coop_param_t ko_param; + int ret; + int ret1; + unsigned short yields = 0; + unsigned long flags; + int flg =0,flg2 =0; + int valid_dom_id = 0; + struct timeval tv_now,tv_diff; + struct timespec tomono,ts_diff; + coop_queue *cq; + struct bvtqueue *bq; + struct task_struct *w_asap; + struct timeval tv_zero = { .tv_sec = 0, .tv_usec = 0}; + struct task_struct *upcoming_deadline_task = NULL; + mm_segment_t oldmm; + fd_set_bits fdset; + int max_fds; + unsigned int size; + struct fdtable *fdt; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + void *bits = stack_fds; + s64 timeout = -1; + struct timeval tv_last_deadline; + struct timeval tv_io_lat; + ktime_t ktime_timeout; + + struct coop_syscall_t k_param; + struct coop_param_t __user *i_param; + struct coop_param_t __user *o_param; + int dom_id; + int n; + struct coop_fds_t __user *fds; + enum COOP_POLL_STATUS *status; + enum COOP_POLL_STATUS reason; + + if(copy_from_user(&k_param, param, sizeof(k_param))) { + return -EFAULT; + } /* if */ + + dom_id = k_param.dom_id; + n = k_param.n; + fds = k_param.fds; + i_param = k_param.i_param; + o_param = k_param.o_param; + status = k_param.status; + + ko_param.t_deadline = tv_zero; + ko_param.t_asap = tv_zero; + ko_param.p_asap = 0; + ko_param.have_asap = 0; + /* The task is informing the kernel that its about to do an uncontrolled sleep */ + if (dom_id == INFORM_ABT_SLEEP) { + /* Disable user level control of about_to_sleep flag for now*/ + //current->cf.bvt_t.about_to_sleep = 1; + return 0; + } + + if(current==NULL) { + printk(KERN_ERR "coop poll: current == NULL?\n"); + return -EINVAL; + } + + /* Sanity checks */ + /* check for valid domain id*/ + valid_dom_id = ((dom_id >= 0 && + dom_id <= (NR_COOP_DOMAINS -1)) || dom_id == DOM_LEAVE); + + if (!valid_dom_id) { + printk(KERN_ERR "coop poll: tsk=%d: invalid dom id: %d\n", + current->pid, dom_id); + return -EINVAL; + } + + if (dom_id == DOM_LEAVE) { + if (i_param != NULL || o_param !=NULL) { + printk(KERN_ERR "coop poll: tsk=%d: bad args to DOM_LEAVE\n", + current->pid); + return -EINVAL; + } + } else { + if (i_param==NULL || o_param==NULL ) { + printk(KERN_ERR "coop poll: tsk=%d: bad args to DOM_LEAVE\n", + current->pid); + return -EINVAL; + } + } + + if(!is_bvt(current)) { + printk(KERN_ERR "Non faircoop tsk=%d,using coop_poll()\n", + current->pid); + return -EINVAL; + } + + /* Only a coop realtime task can do a dom_leave call */ + if (!is_coop_realtime(current) && dom_id == DOM_LEAVE) { + printk(KERN_ERR "coop poll: tsk=%d: only a coop realtime task can do a dom_leave\n", + current->pid); + return -EINVAL; + } + + /* A coop realtime task is not allowed to change its domain mid-way*/ + if (is_coop_realtime(current) && (dom_id != DOM_LEAVE) && (task_domain(current) != DOM_BEST_EFFORT)) { + if (dom_id != task_domain(current)) { + printk(KERN_ERR "coop poll: tsk=%d: not allowed to change domain %d != %d\n", + current->pid, + dom_id, task_domain(current)); + return -EINVAL; + } + } + + /* If interested in monitoring io, fds cannot be NULL */ + if (n > 0 && !fds) { + printk(KERN_ERR "coop poll: tsk=%d: select fds == NULL\n", + current->pid); + return -EINVAL; + } + + /* End of sanity checks */ + if (dom_id == DOM_LEAVE) { + cq = task_cq_lock(current, dom_id, &flags, &bq); + 
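+		/* DOM_LEAVE: demote back to the best-effort domain while
+		 * holding the coop queue lock, then return to userspace. */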
demote_task(bq,current,COOP_POLL_BEST_EFFORT); + cq_unlock(bq, &flags); + return 0; + } + + /* copy input values to kernel space + * this is where the kernel checks for memory access violations + * This might put the process to sleep + */ + ret = get_user_arg(&ki_param, i_param); + if (ret) + return ret; + + /* If interested in monitoring io, get the fds*/ + if (n > 0) { + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + bits = kmalloc(6 * size, GFP_KERNEL); + if (!bits) + return -ENOMEM; + } + fdset.in = bits; + fdset.out = bits + size; + fdset.ex = bits + 2*size; + fdset.res_in = bits + 3*size; + fdset.res_out = bits + 4*size; + fdset.res_ex = bits + 5*size; + + if ((ret = get_fd_set(n, fds->inp, fdset.in)) || + (ret = get_fd_set(n, fds->outp, fdset.out)) || + (ret = get_fd_set(n, fds->exp, fdset.ex))) { + ret = -EFAULT; + goto out; + } + zero_fd_set(n, fdset.res_in); + zero_fd_set(n, fdset.res_out); + zero_fd_set(n, fdset.res_ex); + } + + /* acquire the coop queue lock. + * this disabled preemption and local IRQs and acquires + * the runqueue spinlock + */ + cq = task_cq_lock(current, dom_id, &flags, &bq); + + /* Wall to monotonic time conversion factor*/ + tomono = wall_to_monotonic; + + /* update the number of coop count in coop queue */ + cq->num_coop_calls++; + + do_gettimeofday(&tv_now); + + set_normalized_timeval(&tv_last_deadline, + current->cf.coop_t.deadline.tv_sec - tomono.tv_sec, + ((current->cf.coop_t.deadline.tv_nsec - tomono.tv_nsec)/NSEC_PER_USEC)); + + + /* Check if this a rendevouz call + * If there is a deadline in the past or there is asap without the asap yield flag asserted + * and the task's timeslice is not done yet + * then just do a rendevouz, DO NOT yield the cpu */ + if(bq->rendezvous) { + cq->num_rendezvous0++; + if(COOP_POLL_OK == bq->reason) { + cq->num_rendezvous1++; + if(GET_HAVE_ASAP(ki_param)) { + cq->num_rendezvous2++; + ret = 0; + goto send_to_userland; + } + } + } + + /* Not needed, since a runnning task doesn't have its info in the queues*/ + /* remove my stale nodes from the coop heaps and re-insert new + * nodes based on my updated information + */ + remove_task_from_coop_queue(current, cq,0); + + /* Insert my info into the heap */ + + if ( likely(timeval_compare(&ki_param.t_deadline,&tv_zero) != 0) ) { + + total_deadlines++; + + ret = insert_task_into_timeout_queue(&ki_param.t_deadline, + cq, current,1,0); + if (unlikely(ret < 0)){ + remove_task_from_coop_queue(current, cq,0); + cq_unlock(bq, &flags); + goto out; + } + flg2 = 1; + } /* if */ + + flg = 0; /* will be = 1 only when there are asaps */ + + if ( likely(GET_HAVE_ASAP(ki_param))) { + ret = insert_task_into_asap_queue(&ki_param.t_asap, + ki_param.p_asap, + cq, + current); + if (ret < 0) { + remove_task_from_coop_queue(current, cq,0); + cq_unlock(bq, &flags); + goto out; + } else { + flg = 1; + } + } /* if */ + + if (!is_coop_realtime(current) && (flg || flg2)) { + if (dom_id != DOM_REALTIME_TEMP) + set_tsk_as_coop(current, dom_id); + else + set_tsk_as_temp_coop(current); + } + else if (is_coop_realtime(current) && !flg && !flg2 + && (n <= 0)) { + /* No deadline info 
given, demote the task */ + demote_task(bq,current,COOP_POLL_BEST_EFFORT); + } + + /* Default return value, might get overriden below*/ + ret = 0; + + /* if there are no asaps and the deadline is in the + * future, make the process sleep in coop_poll() until + * the deadline expires + */ + if (!flg && timeval_compare(&ki_param.t_deadline,&tv_now) > 0) { + + /*calculate the time difference*/ + set_normalized_timespec(&ts_diff, + ki_param.t_deadline.tv_sec - tv_now.tv_sec, + (ki_param.t_deadline.tv_usec - tv_now.tv_usec)*1000); + + /* Not interested in monitoring io, just sleep*/ + if (n <= 0) { + cq_unlock(bq,&flags); + current->cf.coop_t.is_well_behaved = 1; +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d doing nanosleep %d %d %d %d %d\n", + current->pid, + is_coop_realtime(current), + current->cf.bvt_t.pseudo_sleep, + current->cf.bvt_t.about_to_sleep, + current->se.on_rq, + current->cf.task_sched_param->bheap_ptr); + oops_in_progress = 0; +#endif + /* TODO Take care of the call being interrupted by a signal */ + hrtimer_nanosleep(&ts_diff,NULL,HRTIMER_MODE_REL,CLOCK_MONOTONIC); +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d ret from nanosleep\n",current->pid); + oops_in_progress = 0; +#endif + + current->cf.coop_t.is_well_behaved = 0; + goto wakeup; + } + /* Monitor io using the select call */ + else { + cq_unlock(bq,&flags); + current->cf.bvt_t.about_to_sleep = 1; + current->cf.coop_t.is_well_behaved = 1; + ktime_timeout = timespec_to_ktime(ts_diff); + +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "[%u.%u] %d select1 %u.%u\n",tv_now.tv_sec,tv_now.tv_usec,current->pid,ts_diff.tv_sec,ts_diff.tv_nsec); + oops_in_progress = 0; +#endif + + ret = high_res_select(n,&fdset,&ktime_timeout); + +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + do_gettimeofday(&tv_now); + printk(KERN_ERR "[%u.%u] %d ret %d select1\n",tv_now.tv_sec, tv_now.tv_usec,current->pid,ret); + oops_in_progress = 0; +#endif + + current->cf.bvt_t.about_to_sleep = 0; + current->cf.coop_t.is_well_behaved = 0; + goto wakeup; + } + + } + /* No asap and no deadline info, monitor_io == 1 + * Sleep in select infinitely*/ + else if (!flg && (timeval_compare(&ki_param.t_deadline, &tv_zero) == 0 ) && (n > 0)) { + cq_unlock(bq,&flags); + current->cf.bvt_t.about_to_sleep = 1; + current->cf.coop_t.is_well_behaved = 1; +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d select2\n",current->pid); + oops_in_progress = 0; +#endif + + ret = high_res_select(n,&fdset,NULL); + +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d ret %d select2\n",current->pid,ret); + oops_in_progress = 0; +#endif + + current->cf.bvt_t.about_to_sleep = 0; + current->cf.coop_t.is_well_behaved = 0; + goto wakeup; + } + + cq_unlock(bq, &flags); +yield: + if (n > 0) { + /* Yield via a select call */ + /* Inform the dequeue function to not remove this task + * from the bvt heap */ + /* This might not yield the cpu, how does one ensure that ? 
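+	 * (With pseudo_sleep set, the dequeue path leaves this task's node in
+	 * the bvt heap and dec_load() keeps its weight in bq->sum_weights; see
+	 * the inc_load()/dec_load() changes in the kernel/sched.c hunk below.)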
*/ + current->cf.bvt_t.pseudo_sleep = 1; + +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d select3\n",current->pid); + oops_in_progress = 0; +#endif + + ret = high_res_select(n,&fdset,NULL); + +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d ret %d select3\n",current->pid,ret); + oops_in_progress = 0; +#endif + + current->cf.bvt_t.pseudo_sleep = 0; + } + else + schedule(); /* this is how we now yield */ +wakeup: + yields++; + /* Update time*/ + do_gettimeofday(&tv_now); + + cq = task_cq_lock(current, dom_id, &flags, &bq); + + /* update the current running task node */ + cq->num_yields += yields; + +send_to_userland: + +#if 1 + remove_task_from_coop_queue(current,cq,0); +#endif + + w_asap = NULL; + + find_nearest_asap(cq, &w_asap); + + if (w_asap) { + ko_param.t_asap = w_asap->cf.coop_t.asap_p.t_asap; + ko_param.p_asap = w_asap->cf.coop_t.asap_p.priority; + SET_HAVE_ASAP(ko_param); + } + + reason = bq->reason; + + bq->reason = COOP_POLL_OK; + + /* adjust for wall time, monotonic time difference */ + set_normalized_timeval(&ko_param.t_deadline, + current->cf.coop_t.deadline.tv_sec - tomono.tv_sec, + ((current->cf.coop_t.deadline.tv_nsec - tomono.tv_nsec)/NSEC_PER_USEC)); + + set_normalized_timeval(&tv_io_lat, + tv_now.tv_sec + bq->ts_io_lat.tv_sec , + tv_now.tv_usec + (bq->ts_io_lat.tv_nsec / NSEC_PER_USEC)); + + if (timeval_compare(&tv_io_lat, &ko_param.t_deadline) < 0 ) { + ko_param.t_deadline = tv_io_lat; + bq->rendezvous = 1; + } else { + bq->rendezvous = 0; + } + + cq_unlock(bq, &flags); + + /* send it to user process */ + ret1 = copy_to_user(o_param, &ko_param, sizeof(struct coop_param_t)); + if (ret1) { + ret = -EFAULT; + oops_in_progress = 1; + printk(KERN_ERR "Error copying stuff to user space\n"); + oops_in_progress = 0; + goto out; + } + else { +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "Ret from coop\n"); + oops_in_progress = 0; +#endif + } + + if(n > 0) { + if (set_fd_set(n, fds->inp, fdset.res_in) || + set_fd_set(n, fds->outp, fdset.res_out) || + set_fd_set(n, fds->exp, fdset.res_ex)) { + printk("%d Error copying out fds\n",current->pid); + ret = -EFAULT; + goto out; + } + } /* if */ + +#if 0 + /* Safety net, to maintain the invariant - + * running tasks deadline/asap info is NOT present in the + * kernel data structures*/ + /* Check if task is still a coop realtime guy, it might have + * got policed already due to some inadverdent sleeps*/ + if (is_coop_realtime(current)) { + cq = task_cq_lock(current, dom_id, &flags, &bq); + remove_task_from_coop_queue(current,cq,0); + cq_unlock(bq, &flags); + } + else { +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d Task got demoted inside coop poll\n",current->pid); + oops_in_progress = 0; +#endif + + } +#endif /* 0 */ + + ret1 = copy_to_user(status, &reason, sizeof(reason)); + if (ret1) { + ret = -EFAULT; + oops_in_progress = 1; + printk(KERN_ERR "Error copying stuff to user space\n"); + oops_in_progress = 0; + goto out; + } + + +out: + if(bits != stack_fds) + kfree(bits); + + return ret; +} +/* sys_coop_poll */ + +/* This is the comparison function that determines the sort order of + * the asap_events heap. The order is determined first by priority, + * then by deadline, finally FIFO. + * This function, the way it is coded, is tricky. Notice the following: + * The function basically compares two ASAP events and returns true or false + * according as + * ev_a > ev_b ? 
true:false
+ * However, ev_a > ev_b means that ev_b should be served first and then ev_a,
+ * because either ev_b's priority is higher, or its deadline or its rank is
+ * smaller than that of ev_a. Thus, our heap is actually sorted in ascending
+ * order.
+ * Note also that event priorities use positive logic, that is, a higher value
+ * means higher priority and vice versa.
+ * In short:
+ * rank(ev_a) > rank(ev_b) means ev_b arrived earlier in the queue than ev_a
+ * deadline(ev_a) > deadline(ev_b) means ev_b has a deadline earlier than ev_a
+ * prio(ev_a) < prio(ev_b) means ev_a has a lower priority than ev_b.
+ */
+
+gboolean coop_poll_asap_gt(heap_key_t a, heap_key_t b)
+{
+	struct task_struct *ev_a = a;
+	struct task_struct *ev_b = b;
+
+	if (ev_a->cf.coop_t.asap_p.priority == ev_b->cf.coop_t.asap_p.priority) {
+		if (!timeval_compare(&ev_a->cf.coop_t.asap_p.t_asap,
+				     &ev_b->cf.coop_t.asap_p.t_asap)) {
+			/* FIFO ordering */
+			return (timespec_compare(&ev_a->cf.coop_t.asap_p.rank,
+						 &ev_b->cf.coop_t.asap_p.rank) > 0);
+		} else {
+			/* The application can specify an asap time to
+			 * distinguish events of equal priority.
+			 * Earlier asap times get served first. */
+			return (timeval_compare(&ev_a->cf.coop_t.asap_p.t_asap,
+						&ev_b->cf.coop_t.asap_p.t_asap) > 0);
+		} /* else */
+	} else {
+		/* primary key: priority order (descending) */
+		return (ev_a->cf.coop_t.asap_p.priority <
+			ev_b->cf.coop_t.asap_p.priority);
+	} /* else */
+}
+/* coop_poll_asap_gt */
+
+
+/* The following function is the comparison function for timeout events.
+ * The events are sorted first by their deadlines and then in FIFO order.
+ * The way the function is coded might be a little confusing to the reader.
+ * Check the comments for the asap comparison function for a detailed
+ * explanation of the significance of the return value.
+ * In short, rank(ev_a) > rank(ev_b) means ev_b has a higher priority because
+ * it arrived earlier in the queue.
+ * deadline(ev_a) > deadline(ev_b) means that ev_b has a deadline closer in the
+ * future than ev_a.
+ */
+
+gboolean coop_poll_timeout_gt(heap_key_t a, heap_key_t b)
+{
+	struct task_struct *ev_a = a;
+	struct task_struct *ev_b = b;
+
+	g_assert(ev_a);
+	g_assert(ev_b);
+
+	if (unlikely(!timeval_compare(&ev_a->cf.coop_t.dead_p.t_deadline,
+				      &ev_b->cf.coop_t.dead_p.t_deadline))) {
+		/* FIFO ordering */
+		return (timespec_compare(&ev_a->cf.coop_t.dead_p.rank,
+					 &ev_b->cf.coop_t.dead_p.rank) > 0);
+	} else {
+		/* primary key: deadline order (earliest deadline first) */
+		return (timeval_compare(&ev_a->cf.coop_t.dead_p.t_deadline,
+					&ev_b->cf.coop_t.dead_p.t_deadline) > 0);
+	} /* else */
+}
+/* coop_poll_timeout_gt */
+
+
+/* coop_proc_init:
+ * initialize the coop data structure when a process is forked
+ * @p: the task_struct of the process being initialized.
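+ * Called from __sched_fork() (see the kernel/sched.c hunk below), together
+ * with bvt_proc_init() and init_bvt_domain().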
+ */ + +void coop_proc_init(struct task_struct *p) +{ + p->cf.bvt_dom = NULL; + memset (&p->cf.coop_t, 0, sizeof(struct coop_struct)); +} +/*coop_proc_init */ + +#else /* !defined SCHED_COOPREALTIME */ +asmlinkage long sys_coop_poll(struct coop_syscall_t __user *param) +{ + + printk(KERN_WARNING "coop_poll called but kernel has not been" + "compiled with coop_poll support\n"); + + return -ENOSYS; /* system call not implemented */ +} + +void coop_proc_init(struct task_struct *p) {} +#endif /* !defined (SCHED_COOPREALTIME) */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ab80515..c6bec43 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -44,9 +44,13 @@ #include #include #include - #include +#if defined(CONFIG_SCHED_COOPREALTIME) +#include +#include +#endif + /** * ktime_get - get the monotonic time in ktime_t format * @@ -1060,7 +1064,7 @@ int hrtimer_cancel(struct hrtimer *timer) { for (;;) { int ret = hrtimer_try_to_cancel(timer); - + if (ret >= 0) return ret; cpu_relax(); @@ -1459,6 +1463,53 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) #endif } +/** + * schedule_timeout_hr - sleep until timeout + * @timeout: timeout value + * + * Make the current task sleep until @timeout has elapsed. + * The routine will return immediately unless the current task + * state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * In all cases the return value is guaranteed to be a non-negative + * time value. + */ +static ktime_t __sched __schedule_timeout_hr(ktime_t time) +{ + struct hrtimer_sleeper t; + ktime_t remain; + + hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init_sleeper(&t, current); + hrtimer_start(&t.timer, time, HRTIMER_MODE_REL); + schedule(); + hrtimer_cancel(&t.timer); + remain = hrtimer_get_remaining(&t.timer); + + if (ktime_to_ns(remain) < 0) + return ktime_set(0, 0); + else + return remain; +} + +ktime_t __sched schedule_timeout_hr(ktime_t time) +{ + return __schedule_timeout_hr(time); +} +EXPORT_SYMBOL_GPL(schedule_timeout_hr); + static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { hrtimer_init_sleeper(t, current); @@ -1561,19 +1612,69 @@ out: return ret; } +#if defined(CONFIG_SCHED_COOP_NANOSLEEP) asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) { struct timespec tu; + struct timeval deadline; + struct timeval tv_now; + struct bvtqueue *bq; + coop_queue *cq; + int was_cooprealtime; + unsigned long flags; + long ret; if (copy_from_user(&tu, rqtp, sizeof(tu))) return -EFAULT; if (!timespec_valid(&tu)) return -EINVAL; + + /* This enhancement is only for tasks belong to our scheduling + * class */ + /* Coop real time tasks do aren't allowed the special sys_nanosleep. 
+ * They can use the sys_coop_poll call itself for doing a nanosleep */ + if (is_bvt(current) && !is_coop_realtime(current)) { + + bq = get_task_bq_locked(current,&flags); + do_gettimeofday(&tv_now); + set_normalized_timeval(&deadline, tu.tv_sec + tv_now.tv_sec, ((tu.tv_nsec + tv_now.tv_usec*NSEC_PER_USEC)/NSEC_PER_USEC)); + set_tsk_as_temp_coop(current); + cq = &(bq->cq[DOM_REALTIME_TEMP]); + + ret = insert_task_into_timeout_queue(&deadline,cq,current,1); + if (unlikely(ret < 0)){ + demote_task(bq,current); + put_task_bq_locked(bq, &flags); + return ret; + } + + /* Give up the lock*/ + put_task_bq_locked(bq,&flags); + current->cf.coop_t.is_well_behaved = 1; + ret = hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + current->cf.coop_t.is_well_behaved = 0; + return ret; + } + else + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} +#else +asmlinkage long +sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) +{ + struct timespec tu; - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + if (copy_from_user(&tu, rqtp, sizeof(tu))) + return -EFAULT; + + if (!timespec_valid(&tu)) + return -EINVAL; + + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } +#endif /* * Functions related to boot-time initialization: diff --git a/kernel/printk.c b/kernel/printk.c index e2129e8..e14c72a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1001,8 +1001,10 @@ int is_console_locked(void) void wake_up_klogd(void) { + #if 0 if (!oops_in_progress && waitqueue_active(&log_wait)) wake_up_interruptible(&log_wait); + #endif } /** diff --git a/kernel/sched.c b/kernel/sched.c index 4e2f603..a92de95 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,6 +75,7 @@ #include /* + * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. @@ -402,6 +403,7 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ #endif + }; /* Real-Time classes' related field in a runqueue: */ @@ -560,6 +562,11 @@ struct rq { unsigned int bkl_count; #endif struct lock_class_key rq_lock_key; + +#if defined (CONFIG_SCHED_COOPREALTIME) + struct bvtqueue bq; +#endif + }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -579,6 +586,7 @@ static inline int cpu_of(struct rq *rq) } /* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. 
* @@ -1494,6 +1502,12 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" + +#if defined(CONFIG_SCHED_COOPREALTIME) +#include "sched_fairshare.c" +#include "coop_poll.c" +#endif + #include "sched_rt.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" @@ -1504,23 +1518,42 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static inline void inc_load(struct rq *rq, const struct task_struct *p) { update_load_add(&rq->load, p->se.load.weight); + #if defined(CONFIG_SCHED_COOPREALTIME) + if (is_bvt(p) && !p->cf.bvt_t.pseudo_sleep) + rq->bq.sum_weights += p->se.load.weight; + #endif + } static inline void dec_load(struct rq *rq, const struct task_struct *p) { update_load_sub(&rq->load, p->se.load.weight); + #if defined(CONFIG_SCHED_COOPREALTIME) + if (is_bvt(p) && !p->cf.bvt_t.pseudo_sleep) + rq->bq.sum_weights -= p->se.load.weight; + #endif + } static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; inc_load(rq, p); + #if defined(CONFIG_SCHED_COOPREALTIME) + if (is_bvt(p) && !p->cf.bvt_t.pseudo_sleep) + rq->bq.nr_running++; + #endif } static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; dec_load(rq, p); + #if defined(CONFIG_SCHED_COOPREALTIME) + if (is_bvt(p) && !p->cf.bvt_t.pseudo_sleep) + rq->bq.nr_running--; + #endif + } static void set_load_weight(struct task_struct *p) @@ -1557,6 +1590,12 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) p->se.on_rq = 0; } +#if defined(CONFIG_SCHED_COOPREALTIME) +#define default_sched_class (faircoop_sched_class) +#else +#define default_sched_class (fair_sched_class) +#endif + /* * __normal_prio - return the priority that is based on the static prio */ @@ -1613,6 +1652,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) enqueue_task(rq, p, wakeup); inc_nr_running(p, rq); + } /* @@ -1622,9 +1662,15 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { if (task_contributes_to_load(p)) rq->nr_uninterruptible++; + if (!p->se.on_rq) { + spin_unlock(&rq->lock); + printk(KERN_ERR "Task %d already deactivated\n",p->pid); + BUG_ON(1); + } dequeue_task(rq, p, sleep); dec_nr_running(p, rq); + } /** @@ -1644,8 +1690,28 @@ unsigned long weighted_cpuload(const int cpu) static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { + unsigned int old_cpu = task_cpu(p); set_task_rq(p, cpu); -#ifdef CONFIG_SMP + #if defined(CONFIG_SCHED_COOPREALTIME) + /* Change the bvt domain for the task, when a task migrates + * Don't worry about coop tasks, since they are pinned*/ + if(is_bvt(p) && old_cpu!=cpu && !p->se.on_rq) { + if (p->cf.task_sched_param->bheap_ptr) { + oops_in_progress = 1; + printk(KERN_ERR "In heap mod %d\n",p->pid); + BUG(); + oops_in_progress = 0; + } + p->cf.bvt_dom = &(cpu_bq(cpu)->bvt_domains[task_domain(p)]); + /* The virtual times on both cpus are running at different rates + * Reset it to the max on this cpu. Again don't bother about coop tasks + * since they are pinned*/ + set_normalized_timespec(&p->cf.task_sched_param->bvt_virtual_time, + cpu_bq(cpu)->max_virtual_time.tv_sec, + cpu_bq(cpu)->max_virtual_time.tv_nsec); + } + #endif + /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be * successfuly executed on another CPU. 
We must ensure that updates of @@ -1653,7 +1719,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); task_thread_info(p)->cpu = cpu; -#endif } static inline void check_class_changed(struct rq *rq, struct task_struct *p, @@ -1684,7 +1749,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) return 1; - if (p->sched_class != &fair_sched_class) + if (p->sched_class != &default_sched_class) return 0; if (sysctl_sched_migration_cost == -1) @@ -2158,7 +2223,6 @@ out_running: #endif out: task_rq_unlock(rq, &flags); - return success; } @@ -2207,6 +2271,12 @@ static void __sched_fork(struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#if defined(CONFIG_SCHED_COOPREALTIME) + coop_proc_init(p); + bvt_proc_init(p); + init_bvt_domain(cpu_bq(task_cpu(p)),p); +#endif + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -2229,13 +2299,12 @@ void sched_fork(struct task_struct *p, int clone_flags) cpu = sched_balance_self(cpu, SD_BALANCE_FORK); #endif set_task_cpu(p, cpu); - /* * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; if (!rt_prio(p->prio)) - p->sched_class = &fair_sched_class; + p->sched_class = &default_sched_class; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2278,6 +2347,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) */ p->sched_class->task_new(rq, p); inc_nr_running(p, rq); + } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -2402,6 +2472,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul */ prev_state = prev->state; + #if defined(CONFIG_SCHED_COOPREALTIME) + /* Has to be done with Run q lock held, modifying heaps */ + if (unlikely(prev_state == TASK_DEAD && is_coop_realtime(prev))) + remove_task_from_coop_queue(prev,&(rq->bq.cq[prev->cf.dom_id]),3); + #endif + finish_arch_switch(prev); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP @@ -3886,7 +3962,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp; - + p->utime = cputime_add(p->utime, cputime); /* Add user time to cpustat. 
*/ @@ -4159,6 +4235,11 @@ need_resched_nonpreemptible: schedule_debug(prev); hrtick_clear(rq); + /* Cancel the bvt timer outside of the rq lock*/ +#if defined(CONFIG_SCHED_COOPREALTIME) + if (is_bvt(prev)) + bvt_timer_cancel(&rq->bq.bvt_timer,rq); +#endif /* * Do the rq-clock update outside the rq lock: @@ -4545,7 +4626,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (rt_prio(prio)) p->sched_class = &rt_sched_class; else - p->sched_class = &fair_sched_class; + p->sched_class = &default_sched_class; p->prio = prio; @@ -4729,7 +4810,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: - p->sched_class = &fair_sched_class; + p->sched_class = &default_sched_class; break; case SCHED_FIFO: case SCHED_RR: @@ -5632,6 +5713,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); + if (on_rq) { activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p); @@ -7569,7 +7651,130 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) #endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } - +#if defined(CONFIG_SCHED_COOPREALTIME) +/* coop_init: + * Initialize the per cpu data structures + * for coop_poll + */ +void coop_init(int cpu) +{ + coop_queue *cq; + int dom_id; + struct rq *rq = cpu_rq(cpu); + struct bvtqueue* bq = &(rq->bq); + printk(KERN_ERR "Initializing coop heaps for %d\n",cpu); + for_each_available_coop_domain(dom_id) { + cq = &(bq->cq[dom_id]); + /* Allocating memory here has the disadvantage + * that regardless of whether we have a + * process using coop_poll, we allocate memory + * for the two coop_poll heaps. Thus, this + * may not be very memory efficient. + */ + cq->heap_asap = create_heap (coop_poll_asap_gt, HEAP_INIT_CAP); + cq->heap_deadline = create_heap (coop_poll_timeout_gt, HEAP_INIT_CAP); + cq->heap_coop_sleep = create_heap (coop_poll_timeout_gt, HEAP_INIT_CAP); + + if (unlikely((!cq->heap_asap) + || (!cq->heap_deadline) + || (!cq->heap_coop_sleep)) ) { + /* memory allocation error */ + panic("unable to allocate memory for the coop heaps"); + + } + cq->num_coop_calls = 0; + cq->num_yields = 0; + } /* COOP domain for */ + return; +} +/* coop_init */ + +/* initializes the global bvt parameters */ +void bvt_global_init(int cpu) +{ + struct bvtqueue *bq; + unsigned long flags; + int j; + heap_t *temp1; + heap_t *temp2; + heap_t *temp3; + struct rq *rq; + rq = cpu_rq(cpu); + bq = &rq->bq; + printk(KERN_ERR "Initializing bvt heaps for %d\n",cpu); + + bvt_sched_granularity = (suseconds_t) CONFIG_BVTPRD; + bvt_sched_unfthreshold = (suseconds_t) CONFIG_BVTPRD; + bvt_sched_tracing = 0; /* Off by default */ + ts_bvt_min_timeslice = ns_to_timespec(BVT_MIN_TIMESLICE * NSEC_PER_USEC); + /* BUG FIX: create heap might sleep, do alloc outside of lock + * region (Cannot sleep while holding a lock) */ + temp1 = create_heap(bvt_timeout_gt, BVT_HEAP_INIT_CAP); + temp2 = create_heap(coop_poll_timeout_gt, BVT_HEAP_INIT_CAP); + temp3 = create_heap(coop_poll_timeout_gt, BVT_HEAP_INIT_CAP); + /* XXX All these heap sizes should be fixed dynamically + * in the future, depending on total memory available */ + bq->bvt_heap = temp1; + bq->global_coop_deadline_heap = temp2; + bq->global_coop_sleep_heap = temp3; + + if (unlikely(!temp1 || !temp2 || !temp3)) { + /* memory allocation error */ + panic("unable to allocate memory for the bvt heap"); + + } + + bq->running_bvt_task = NULL; + bq->nr_running = 0; + bq->sum_weights = 0; 
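+	/* nr_running and sum_weights mirror, for faircoop tasks only, the
+	 * rq-wide counters; they are kept up to date by the hooks added to
+	 * inc_load()/dec_load() and inc_nr_running()/dec_nr_running() above. */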
+ bq->bvt_timer_active = 0; + /* Initialize the per cpu bvt timer*/ + hrtimer_init(&bq->bvt_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bq->bvt_timer.function = handle_bvt_timeout; + /* Use the irqsafe_no_softirq callback mode*/ + bq->bvt_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + + #ifdef CONFIG_SMP + /* Init the list used to iterate through all tasks under our scheduler */ + INIT_LIST_HEAD(&bq->bvt_list); + #endif + + /* initializing per cpu domain parameters */ + for(j=0;jbvt_domains[j].dom_type = j; + bq->bvt_domains[j].num_tasks = 0; + bq->bvt_domains[j].num_weights = 0; + memset(&bq->bvt_domains[j].dom_sched_param.bvt_virtual_time,0,sizeof(struct timespec)); + memset(&bq->bvt_domains[j].dom_sched_param.bvt_actual_time,0,sizeof(struct timespec)); + memset(&bq->bvt_domains[j].dom_sched_param.insertion_ts,0,sizeof(struct timespec));; + set_normalized_timespec(&bq->bvt_domains[j].dom_sched_param.fudge, 0, 0); + bq->bvt_domains[j].dom_sched_param.bheap_ptr = NULL; + bq->bvt_domains[j].dom_sched_param.dom_type = j; + } + + memset(&(bq->last_coop_deadline),0,sizeof(struct timeval)); + memset(&(bq->max_virtual_time),0,sizeof(struct timespec)); + memset(&(bq->tot_time),0,sizeof(struct timespec)); + bq->isTargetSet = -1; + bq->fudged_flag = false; + /*set_normalized_timespec(&ts_fudge,0,bvt_sched_granularity);*/ + set_normalized_timespec(&bq->ts_slack,0,COOP_DEAD_SLACK); + set_normalized_timespec(&bq->ts_io_lat,0,COOP_IO_LAT); + set_normalized_timespec(&bq->ts_emergency_timeslice,0,COOP_EMERGENCY_TIMESLICE); + bq->count = 0; + bq->adj=0; + bq->noadj=0; + bq->fudge=0; + bq->nofudge=0; + bq->bvtLogCount = 0; + + return; +} +/* bvt_global_init */ +#else +void coop_init(int cpu) {} +void bvt_global_init(int cpu) {} +#endif static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) { struct rt_prio_array *array; @@ -7857,7 +8062,7 @@ void __init sched_init(void) /* * During early bootup we pretend to be a normal task: */ - current->sched_class = &fair_sched_class; + current->sched_class = &default_sched_class; scheduler_running = 1; } diff --git a/kernel/sched_fairshare.c b/kernel/sched_fairshare.c new file mode 100644 index 0000000..4e05094 --- /dev/null +++ b/kernel/sched_fairshare.c @@ -0,0 +1,1585 @@ +/* This is the kernel bvt scheduler for cooperative processes + * It is hoped that the bvt scheduling with facilitate uniform sharing + * of cpu resources and reduce the need for cooperative applications to + * mutually trust each other. This is also the inherent mechanism + * for policing and scheduling best effort tasks. + * This is the kernel bvt scheduler for cooperative processes + * It is hoped that the bvt scheduling with facilitate uniform sharing + * of cpu resources and reduce the need for cooperative applications to + * mutually trust each other. This is also the inherent mechanism + * for policing and scheduling best effort tasks. + * + * High res timer incorporated: April 1 1, 2007. + * Details on high res timers is here: + * http://lwn.net/Articles/167897/ + * + * Profiling of bvt syscall per process incorporated: + * April 24, 2007. + * + * Experimental new heuristic that enforces the period + * more strongly merged: April 26 2007 + * + * Policing and coop-real time domains introduced: + * May 22, 2007. 
+ * + * This is the joint work of all the following authors: + * Mayukh Saubhasik, mayukh@cs.ubc.ca + * Anirban Sinha, anirbans@cs.ubc.ca + * Charles 'Buck' Krasic, krasic@cs.ubc.ca + * Ashvin Goel, ashvin@eecg.toronto.edu + + * Note about timekeeping: + * kernel uses all different forms and notions about what monotonic "now" is. + * Please check fairshare_now() function for explanations + * on a few. However, our user level coop processes uses do_gettimeofday() + * system call to register their "now" value. This is actually the wall + * clock time. Thus, all coop related + * time comparisons (and while reporting time to userspace) + * uses do_gettimeofday() call to get the value of + * "now". Unfortunately, this value is not the same as the monotonic clock + * value (see wall_to_monotonic offset timespec) that is used by the + * timer code (highres timers or otherwise). + * Hence, we are forced to use two different notions of time, + * do_gettimeofday() and fairshare_now(). The former is used to report + * and compare time values in the coop heap. The later is used by the + * fairshare timer scheduling code. + */ + +#include +#include +#include +#include +#include +#ifdef CONFIG_HIGH_RES_TIMERS +#include +#endif +#include +#include +#include +#include +#include +#include + +/* Read only data*/ +/* This is the global bvt timeslice + * period in microseconds, can be asynchronously updated thru the proc filesystem*/ +volatile suseconds_t bvt_sched_granularity; +volatile unsigned int bvt_sched_tracing; +volatile suseconds_t bvt_sched_unfthreshold; +/*volatile struct timespec ts_fudge;*/ + +struct timespec ts_bvt_min_timeslice; + +/* End of read only global data */ + +#if defined(CONFIG_SCHED_COOPREALTIME) + +#define BVT_DEBUG_FLAG "[BVT]" + +#include +#define bvt_print_debug(fmt,arg...) 
do { \ + print_debug(BVT_DEBUG_FLAG, fmt,##arg); \ + }while(0); + +#define BVT_HEAP_INIT_CAP 1024 + +/* Forward declarations */ +static void bvt_borrow(struct rq *rq,struct task_struct *p,int wakeup); +static void dequeue_task_faircoop(struct rq *rq, struct task_struct *p,int sleep); +static void yield_task_faircoop(struct rq *rq); +static void check_preempt_faircoop(struct rq* rq, struct task_struct *p); +static struct task_struct* __sched pick_next_task_arm_timer(struct rq *rq); +static void __sched update_bvt_prev(struct rq *rq, struct task_struct *prev); +static void set_curr_task_faircoop(struct rq *rq); +static void task_tick_faircoop(struct rq *rq, struct task_struct *p, int queued); +static void task_new_faircoop(struct rq *rq, struct task_struct *p); +static void switched_from_faircoop(struct rq *this_rq, struct task_struct *task, int running); +static void switched_to_faircoop(struct rq *this_rq, struct task_struct *task, int running); +static void prio_changed_faircoop(struct rq* this_rq, struct task_struct *task, int oldprio, int running); + +void inline remove_task_from_bvt_queue(struct bvtqueue *bq, + struct task_struct *p); + +#ifdef CONFIG_SMP +static unsigned long +load_balance_faircoop(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio); + + +static int +move_one_task_faircoop(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle); + +static int select_task_rq_faircoop(struct task_struct *p, int sync); +static void join_domain_faircoop(struct rq* rq); +static void leave_domain_faircoop(struct rq* rq); +static unsigned long +load_balance_faircoop(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio); +#endif + +/* Scheduling class struct */ +const struct sched_class faircoop_sched_class = { + .next = &idle_sched_class, + .enqueue_task = bvt_borrow, + .dequeue_task = dequeue_task_faircoop, + .yield_task = yield_task_faircoop, + .check_preempt_curr = check_preempt_faircoop, + .pick_next_task = pick_next_task_arm_timer, + .put_prev_task = update_bvt_prev, + .set_curr_task = set_curr_task_faircoop, + .task_tick = task_tick_faircoop, + .task_new = task_new_faircoop, + +#ifdef CONFIG_SMP + .load_balance = load_balance_faircoop, + .move_one_task = move_one_task_faircoop, + .select_task_rq = select_task_rq_faircoop, +/* These seem to be optional*/ + #if 0 + .join_domain = join_domain_faircoop, + .leave_domain = leave_domain_faircoop, + #endif +#endif + + .prio_changed = prio_changed_faircoop, + .switched_to = switched_to_faircoop, + .switched_from = switched_from_faircoop +}; + + +inline struct bvtqueue* get_task_bq_locked(struct task_struct *tsk, + unsigned long *flags) +{ + struct rq *rqueue; + rqueue = task_rq_lock(tsk, flags); + return &rqueue->bq; +} + +inline void put_task_bq_locked(struct bvtqueue* bq, + unsigned long *flags) +{ + struct rq *rqueue = container_of(bq,struct rq,bq); + task_rq_unlock(rqueue,flags); +} + +inline struct bvtqueue* get_cpu_bq_locked(int cpu, unsigned long *flags) +{ + struct rq *rqueue; + local_irq_save(*flags); + rqueue = cpu_rq(cpu); + spin_lock_irq(&rqueue->lock); + return &rqueue->bq; +} + + +inline void put_cpu_bq_locked(int cpu, unsigned long *flags) +{ + + struct rq *rqueue = cpu_rq(cpu); + spin_unlock_irqrestore(&rqueue->lock, *flags); +} + 
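+/*
+ * Typical use of the locking helpers above (sketch; the coop nanosleep
+ * path in the kernel/hrtimer.c hunk follows this pattern):
+ *
+ *	unsigned long flags;
+ *	struct bvtqueue *bq = get_task_bq_locked(p, &flags);
+ *	...update bq and p's fairshare state...
+ *	put_task_bq_locked(bq, &flags);
+ */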
+inline struct bvtqueue* cpu_bq(int cpu) +{ + struct rq *rqueue; + rqueue = cpu_rq(cpu); + return &rqueue->bq; +} + +inline struct bvtqueue* get_task_bq(struct task_struct *p) +{ + struct rq *rqueue = task_rq(p); + return &rqueue->bq; +} + +inline int is_best_effort(struct task_struct* p) +{ + if (!p->cf.bvt_dom) return 0; + return (task_domain(p) == DOM_BEST_EFFORT); +} + +/* Called with runQ lock held*/ +void bvt_timer_cancel(struct hrtimer* timer, struct rq* rq) +{ + if(hrtimer_active(timer)) + hrtimer_cancel(timer); + //rq->bq.bvt_timer_active = 0; +} + +void do_policing(struct bvtqueue *bq, struct task_struct *tsk); + +static inline struct task_struct* get_task(struct fairshare_sched_param* sp) +{ + struct bvt_struct *bs; + if (sp->dom_type == DOM_BEST_EFFORT) { + bs = container_of(sp,struct bvt_struct,private_sched_param); + return bs->me; + }else /* pointer to real coop sched_param. They are not specifically associated + * with any task */ + return NULL; +} + +/* gets the current kernel time using the monotonic clock. + * @ts: The timespect value that represents *current time* + */ +void fairshare_now(struct timespec *ts) { + /* There are several functions that returns the value of "now":- + * + * 1. current_kernel_time() simply reads xtime variable + * using the seq lock and returns the value directly. xtime + * is updated on every global timer interrupt for SMP machines + * with a global time source. It is not updated on the local + * APIC timer interrupt which keeps track of the process + * accounting and is often programmed to fire at HZ + * frequency. For UP machines, there is only one time source + * anyways, so it is updated before rest of the process time + * accounting code in the same handler. + * + * *ts = current_kernel_time(); + * + * 2. In NON-NUMA machines & machines where + * tsc is reliable, sched_clock() first does an rdtsc() to read the + * time stamp counter. It then converts that value to ns. The + * NUMA case is a special case where TSC is not synchronized + * across all cpus. In those cases and where tsc is unstable, + * sched_clock() uses the 64 bit + * jiffies along with HZ to arrive at a ns value. + * + * *ts = ns_to_timespec(sched_clock()); + * + * 3. ktime_get_ts: The difference between current_kernel_time() + * and the one below is that + * current_kernel_time() returns the xtime value directly + * whereas ktime_get_ts() does a interpolation correction + * on the top of xtime value to get the wall time. It then + * converts wall time to monotonic time. It is also the + * interface used by the highres timers. Hence I hope that + * use of this function will make timing accounting more + * fine grained. + */ + + ktime_get_ts(ts); + +} /* fairshare_now */ + +/* insert_task_into_bvt_queue: + * @t: the task_struct of the task + * @bq: pointer to the per cpu bvt queue + * @flag: whether we are allowed to grow the stack, = 0:no, = 1:yes + * and put into the queue + * The corresponding runqueue lock must be acquired before calling + * this function. 
+ */ + +void insert_task_into_bvt_queue(struct bvtqueue *bq, + struct task_struct *t) +{ + g_assert(t); + g_assert(is_bvt(t)); + + if(!is_bvt(t)) return; + + fairshare_now(&t->cf.task_sched_param->insertion_ts); + //t->cf.task_sched_param->insertion_ts = ns_to_timespec(sched_clock()); + + if (t->cf.task_sched_param->bheap_ptr) { + spin_unlock(&(task_rq(t)->lock)); + printk(KERN_ERR "%d is overwriting an entry in the bvt heap\n",t->pid); + BUG_ON(1); + } + + t->cf.task_sched_param->bheap_ptr = + heap_insertt(bq->bvt_heap, + t->cf.task_sched_param, + t->cf.task_sched_param); + + if (!t->cf.task_sched_param->bheap_ptr) { + bvt_print_debug("heap_insert returned null:" + "Unable to insert proc node into bvt heap"); + panic("Unable to insert proc node into bvt heap"); + } +}/* insert_task_into_bvt_queue */ + + + +/* find_fairshare_period: + * finds the fairshare period for the current CPU + * @next: The task for whom we're calculating the period + * @return: fairshare period in nsecs + */ +static unsigned long find_fairshare_period(struct task_struct *next) +{ + struct bvtqueue *bq; + unsigned long fair_share_period = 0; + bq = cpu_bq(task_cpu(next)); + + if (bq->nr_running) { + /* Sanity check*/ + if (!bq->sum_weights) { + spin_unlock(&task_rq(next)->lock); + printk(KERN_ERR "Pid = %d %lu, sum_weights zero!\n",next->pid, + bq->nr_running); + BUG_ON(1); + } + + if (is_coop_realtime(next) && + (task_domain(next) != DOM_REALTIME_TEMP)) { + fair_share_period = bvt_sched_granularity * + bq->bvt_domains[task_domain(next)].num_weights; + } + else { + fair_share_period = bvt_sched_granularity * + next->se.load.weight; + } + + fair_share_period = (fair_share_period / bq->sum_weights); + } else { + /* Give up runQ lock*/ + spin_unlock(&(task_rq(next)->lock)); + printk (KERN_ERR "Pid = %d undefined fair share period: impossible, nrtasks = %lu\n", + (int)next->pid, + bq->nr_running); + printk(KERN_ERR "Flags = %d %d %d %d %d\n", + next->cf.bvt_t.pseudo_sleep, + is_coop_realtime(next),next->cf.bvt_t.about_to_sleep, + next->se.on_rq, + bq->bvt_heap->size); + BUG_ON(1); + } + + /* Sanity check */ + if (!fair_share_period) { + oops_in_progress = 1; + printk(KERN_ERR "Grrr %lu %lu %lu\n",bq->nr_running,bq->sum_weights,next->se.load.weight); + oops_in_progress = 0; + } + + return fair_share_period * NSEC_PER_USEC; +} +/* find_fairshare_period() */ + +/* calculate_bvt_period: + * This is the part where we find the timeslice based on the + * preferential treatment given to the coop processes + * @next: The next chosen bvt process, either real-coop or + * best effort. + */ +static void calculate_bvt_period(struct task_struct *next) +{ + struct timespec ts_coop_prd; + struct timespec ts_zero; + struct task_struct *next_coop_task; + unsigned long fair_share_period; + int period_set; + struct timespec period; + struct bvtqueue *bq; + + next_coop_task = NULL; + memset(&ts_zero,0,sizeof(struct timespec)); + bq = cpu_bq(task_cpu(next)); + + fair_share_period = find_fairshare_period(next); + + period = ns_to_timespec(fair_share_period); + + /* get the next most important deadline event from the + * current heap. Also get the time until the next + * coop event (timeout) + */ + period_set = find_coop_period(next, &next_coop_task, &ts_coop_prd); + + if (period_set) + period = (timespec_compare(&period, &ts_coop_prd) < 0)? period:ts_coop_prd; + + /* this is where we limit the number of context switches */ + bq->curr_bvt_period = + (timespec_compare(&period, &ts_bvt_min_timeslice) < 0)? 
+ ts_bvt_min_timeslice : period; + +} /* calculate_bvt_period */ + +/* This is actually the borrowing part for interractive applications + * It is called from activate_task in sched.c + * The scheduling class enqueue function is supposed to set the + * on_rq flag + */ +static void bvt_borrow(struct rq *rq,struct task_struct *p,int wakeup) +{ + struct fairshare_sched_param *top_node = NULL; + struct timeval tv_zero = { .tv_sec = 0, .tv_usec = 0}; + struct bvtqueue* bq; + struct timeval deadline; + int ret; + unsigned int cp; + + if(!is_bvt(p)) { + spin_unlock(&rq->lock); + printk(KERN_ERR "Task being enqueued is not in faircoop scheduling class\n"); + BUG_ON(1); + return; + } + + bq = &rq->bq; + + #ifdef CONFIG_SMP + /* Add the task to the list */ + list_add(&p->cf.bvt_procs, &bq->bvt_list); + #endif + + /* Pseudo sleep special cases*/ + /* It just woke up from an IO call */ + if (is_coop_realtime(p) && (p->cf.bvt_t.pseudo_sleep ==1)) { +#if 0 + do_gettimeofday(&deadline); +#else + deadline.tv_sec = deadline.tv_usec = 0; +#endif + /* Bug Fix: Only replace current deadline, if current time is earlier than recorded deadline*/ + if(p->cf.coop_t.coop_deadline_heap_node && + (timeval_compare(&p->cf.coop_t.dead_p.t_deadline,&deadline) > 0)) { + remove_task_from_coop_queue(p, &(bq->cq[task_domain(p)]),1); + ret = insert_task_into_timeout_queue(&deadline,&(bq->cq[task_domain(p)]),p,1,1); + if (unlikely(ret < 0)){ + demote_task(bq,p,COOP_POLL_SLEEP); + } + } + p->se.on_rq = 1; + return; + } + else if (!is_coop_realtime(p) && (p->cf.bvt_t.pseudo_sleep == 1)) { + do_gettimeofday(&deadline); + set_tsk_as_temp_coop(p); + ret = insert_task_into_timeout_queue(&deadline,&(bq->cq[DOM_REALTIME_TEMP]),p,1,2); + if (unlikely(ret < 0)){ + demote_task(bq,p,COOP_POLL_SLEEP); + } + p->se.on_rq = 1; + return; + } + /* On pseudo sleep, wakeup path */ + else if ((p->cf.bvt_t.pseudo_sleep == 2)) { + p->se.on_rq = 1; + return; + } + /* End pseudo sleep special cases */ + + + /* handle real coop wakeups seperately here */ + if (is_coop_realtime(p) && !p->cf.bvt_t.about_to_sleep) { + + int dom_id = task_domain(p); + /* Increment domain count*/ + bq->bvt_domains[task_domain(p)].num_tasks++; + bq->bvt_domains[task_domain(p)].num_weights += p->se.load.weight; + /* reinsert the coop nodes into the coop heap */ + if (timeval_compare(&(p->cf.coop_t.dead_p.t_deadline), + &tv_zero) > 0) { + /* insert into timeout heap */ + insert_task_into_timeout_queue(NULL,&(bq->cq[dom_id]),p,0,3); + /* and remove from coop sleep queue */ + remove_task_from_coop_queue(p, &(bq->cq[dom_id]),3); + } + /* if there are other coops running, do not borrow and do not + * insert my nodes into the heap + */ + if (task_domain(p) != DOM_REALTIME_TEMP && + (bq->bvt_domains[task_domain(p)].num_tasks > 1)) { + return; + } + } /* if */ + /* Wake up from informed un-controlled sleep + Treat wakeups as coop poll deadlines*/ + else if (!is_coop_realtime(p) && (p->cf.bvt_t.about_to_sleep)) { + do_gettimeofday(&deadline); + set_tsk_as_temp_coop(p); + ret = insert_task_into_timeout_queue(&deadline,&(bq->cq[DOM_REALTIME_TEMP]),p,1,4); + if (unlikely(ret < 0)){ + demote_task(bq,p,COOP_POLL_SLEEP); + } + /* Increment domain count,after task has changed domains*/ + bq->bvt_domains[task_domain(p)].num_tasks++; + bq->bvt_domains[task_domain(p)].num_weights += p->se.load.weight; + } + else if (is_coop_realtime(p) && (p->cf.bvt_t.about_to_sleep)) { + do_gettimeofday(&deadline); + /* Increment domain count*/ + bq->bvt_domains[task_domain(p)].num_tasks++; + 
bq->bvt_domains[task_domain(p)].num_weights += p->se.load.weight; + /* Remove from coop sleep queue */ + remove_task_from_coop_queue(p, &(bq->cq[task_domain(p)]),3); + /* Bug Fix: Only replace current deadline, if current time is earlier than recorded deadline*/ + if ((timeval_compare(&p->cf.coop_t.dead_p.t_deadline,&deadline) > 0)) { + insert_task_into_timeout_queue(&deadline,&(bq->cq[task_domain(p)]),p,1,5); + } + else { + insert_task_into_timeout_queue(NULL,&(bq->cq[task_domain(p)]),p,0,6); + } + /* if there are other coops running, do not borrow and do not + * insert my nodes into the heap + */ + if (task_domain(p) != DOM_REALTIME_TEMP && + (bq->bvt_domains[task_domain(p)].num_tasks > 1)) { + return; + } + } + else if(!is_coop_realtime(p) && !p->cf.bvt_t.about_to_sleep) { + /* Best effort task*/ + bq->bvt_domains[task_domain(p)].num_tasks++; + bq->bvt_domains[task_domain(p)].num_weights += p->se.load.weight; + } + else { + spin_unlock(&rq->lock); + printk(KERN_ERR "Pid = %d, Enqueuing, unhandled case\n",p->pid); + BUG_ON(1); + } + + if (likely(!heap_is_empty(bq->bvt_heap))) { + top_node = (struct fairshare_sched_param*) heap_min_data(bq->bvt_heap); + } + + if (likely(!p->cf.task_sched_param->bheap_ptr)) { + /* Only borrow forward, borrowing backward allows a task to cheat on fairness */ + if (top_node) { + if( (timespec_compare(&top_node->bvt_virtual_time, &p->cf.task_sched_param->bvt_virtual_time) > 0) ) + p->cf.task_sched_param->bvt_virtual_time = top_node->bvt_virtual_time; + } + /* DO NOT reset the virtual time of the system, this again allows tasks to cheat, if they time it right */ + /* This will overflow in 136 years */ + else{ + p->cf.task_sched_param->bvt_virtual_time = bq->max_virtual_time; + } + /* Sanity check*/ + if(timespec_compare(&(p->cf.task_sched_param->bvt_virtual_time), &(bq->max_virtual_time)) > 0) { + spin_unlock(&rq->lock); + printk(KERN_ERR "%d %d Enq has virt time greater than max %u.%ld %u.%ld\n",p->pid,task_cpu(p),(unsigned)p->cf.task_sched_param->bvt_virtual_time.tv_sec,p->cf.task_sched_param->bvt_virtual_time.tv_nsec,(unsigned)bq->max_virtual_time.tv_sec,bq->max_virtual_time.tv_nsec); + if (top_node) + printk(KERN_ERR "Min %u.%ld\n",(unsigned)top_node->bvt_virtual_time.tv_sec,top_node->bvt_virtual_time.tv_nsec); + for_each_online_cpu(cp) { + print_heap(cpu_bq(cp)->bvt_heap); + print_heap(cpu_bq(cp)->bvt_heap); + } + BUG_ON(1); + } + + /* insert task into the bvt queue*/ + insert_task_into_bvt_queue(bq,p); + } + + /* Needed by other parts of the scheduler*/ + p->se.on_rq = 1; +} +/* bvt_borrow */ + +/* update_virtual_times: Update the virtual times + * of the best effort and the real-coop guys + * This function is called with the runQ lock held + */ +static void update_virtual_times(struct task_struct *p) +{ + struct timespec ts_delta; + long nr_realcoop = 0; + suseconds_t nsec; + cputime_t cputime; + unsigned long delta_exec; + //struct cpu_usage_stat *cpustat; + + struct bvtqueue *bq = cpu_bq(task_cpu(p)); + + p->cf.bvt_t.bvt_timeslice_end = cpu_bq(task_cpu(p))->ts_now; + //p->cf.bvt_t.bvt_timeslice_end = ns_to_timespec(task_rq(p)->clock); + + ts_delta = timespec_sub(p->cf.bvt_t.bvt_timeslice_end, p->cf.bvt_t.bvt_timeslice_start); + + /* Stats */ + + /* Update the actual time, before incorporating domain details */ + set_normalized_timespec(&p->cf.bvt_t.private_sched_param.bvt_actual_time, + p->cf.bvt_t.private_sched_param.bvt_actual_time.tv_sec + ts_delta.tv_sec, + p->cf.bvt_t.private_sched_param.bvt_actual_time.tv_nsec + ts_delta.tv_nsec); + + 
set_normalized_timespec(&bq->tot_time,bq->tot_time.tv_sec + ts_delta.tv_sec,bq->tot_time.tv_nsec + ts_delta.tv_nsec); + + /* Convert actual time to jiffies using HZ, and override utime value*/ + //p->utime = timespec_to_cputime(&p->cf.bvt_t.private_sched_param.bvt_actual_time); + //p->utimescaled = cputime_to_scaled(timespec_to_cputime(&p->cf.bvt_t.private_sched_param.bvt_actual_time)); + + /* Update cfs stats */ + delta_exec = (unsigned long)(task_rq(p)->clock - p->se.exec_start); + p->se.sum_exec_runtime += delta_exec; + + //cputime = timespec_to_cputime(&bq->tot_time); + //cpustat = &kstat_this_cpu.cpustat; + /* Convert total time to cputime. */ + //tmp = cputime_to_cputime64(cputime); + //cpustat->user = tmp; + + /* Stats end */ + + if (is_coop_realtime(p) && task_domain(p) != DOM_REALTIME_TEMP) + nr_realcoop = bq->bvt_domains[task_domain(p)].num_tasks; + + /* note: nr_realcoop will be 0 only when a single real-coop + * guy is running in the system in this domain and is going into a + * cooperative sleep. + */ + + if (is_coop_realtime(p) && (nr_realcoop > 1) ) { + + nsec = (suseconds_t) timespec_to_ns(&ts_delta); + /* Sanity check*/ + if (bq->bvt_domains[task_domain(p)].num_weights <= 0) { + spin_unlock(&task_rq(p)->lock); + printk(KERN_ERR "Weights empty %d %lu\n",p->pid,bq->bvt_domains[task_domain(p)].num_weights); + BUG_ON(1); + } + nsec = nsec / (bq->bvt_domains[task_domain(p)].num_weights); + ts_delta = ns_to_timespec(nsec); + } + else { + nsec = (suseconds_t) timespec_to_ns(&ts_delta); + nsec = nsec / p->se.load.weight; + ts_delta = ns_to_timespec(nsec); + } + + set_normalized_timespec(&p->cf.task_sched_param->bvt_virtual_time, + p->cf.task_sched_param->bvt_virtual_time.tv_sec + ts_delta.tv_sec, + p->cf.task_sched_param->bvt_virtual_time.tv_nsec + ts_delta.tv_nsec); + + /* Update max vt */ + if(timespec_compare(&(p->cf.task_sched_param->bvt_virtual_time), &(bq->max_virtual_time)) > 0) { + bq->max_virtual_time = p->cf.task_sched_param->bvt_virtual_time; + } +} +/* update_virtual_times */ + +/* handle_bvt_timeout: handle bvt timeout for a particular bvt process + * The goal of the function is to call schedule() and chhose a new + * process to run when the old bvt process has run out of its fair + * share. + */ + +#ifdef CONFIG_HIGH_RES_TIMERS +static enum hrtimer_restart handle_bvt_timeout(struct hrtimer *timer) +{ + struct task_struct *p; + struct rq *rq; + struct bvtqueue *bq = container_of(timer,struct bvtqueue,bvt_timer); + rq = container_of(bq,struct rq,bq); + + p = bq->running_bvt_task; + + #ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "Ding\n"); + oops_in_progress = 0; + #endif + + /* Police the guy, for not yielding on time*/ + if (is_coop_realtime(p)) { + /* there is a case of policing here. We need to + * decrement the number of coop tasks and remove this + * tasks's node from the coop heap + */ + + #ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d dem timeslice\n",p->pid); + oops_in_progress = 0; + #endif + spin_lock(&rq->lock); + demote_task(bq,p,COOP_POLL_LATE_YIELD); + spin_unlock(&rq->lock); + } /* if */ + + //bq->bvt_timer_active = 0; + spin_lock(&rq->lock); + resched_task(p); + spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; + +} /* handle_bvt_timeout */ + +#else +#endif + +/* The following function schedules a dynamic + * timer to fire after the + * bvt interval expires for the process p. It schedules the bvt task to + * preempt after that interval. 
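+ * For example (illustrative numbers): with bvt_sched_granularity set to
+ * 10000 us and two runnable best-effort tasks of equal weight,
+ * find_fairshare_period() gives each task roughly a 5 ms slice, so the
+ * timer is armed for now + 5 ms; for a coop realtime task ts_slack is
+ * added on top so it is not policed if it yields just after its slice.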
+ * Must be called with the task bvt queue lock held + * @bq: The bvt queue for the process p, assumed that the corresponding + * runqueue lock is acquired before calling this function. + * @p: the process for which the timer is scheduled. + * The timer interval is obtained from the global value + * curr_bvt_period + */ +#ifdef CONFIG_HIGH_RES_TIMERS +void schedule_dynamic_bvt_timer(struct bvtqueue *bq, + struct task_struct *p) +{ + struct timespec ts; + struct timespec deadline; + //struct timespec ts_now; + //ts_now = ns_to_timespec(task_rq(p)->clock); + + ts = bq->curr_bvt_period; + /* now() + ts = deadline */ + set_normalized_timespec(&deadline,ts.tv_sec + bq->ts_now.tv_sec, + ts.tv_nsec + bq->ts_now.tv_nsec); + + /* Set the exposed deadline,before incorporating slack */ + if (bq->fudged_flag) { + /* Set exposed deadline as now, to force + * the coop guy to yield as soon as possible */ + p->cf.coop_t.deadline = bq->ts_now; + } + else { + p->cf.coop_t.deadline = deadline; + } + + if (is_coop_realtime(p)) { + /* add slack value for only coop realtime tasks so that they + * do not get policed if they yield within this slack interval + */ + set_normalized_timespec(&deadline,deadline.tv_sec + bq->ts_slack.tv_sec, + deadline.tv_nsec + bq->ts_slack.tv_nsec); + } + /* if */ + /* Initialize the per cpu bvt timer*/ + //hrtimer_init(&bq->bvt_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bq->bvt_timer.expires = timespec_to_ktime(deadline); + //bq->bvt_timer.function = handle_bvt_timeout; + /* Use the irqsafe_no_softirq callback mode*/ + //bq->bvt_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + hrtimer_start(&bq->bvt_timer, bq->bvt_timer.expires,HRTIMER_MODE_ABS); + //bq->bvt_timer_active = 1; + +} /* schedule_dynamic_bvt_timer */ + +#else /* !CONFIG_BVT_HIGHRES_TIMER: use regular timer wheel */ +#endif /* CONFIG_BVT_HIGHRES_TIMER */ + +/* charge_running_times: + * Update the running time of a task safely + * @bq: the pointer to the per cpu bvt structure in the + * runqueue + * @prev: The pointer to the task struct whose time we + * want to update. 
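The expiry arithmetic in schedule_dynamic_bvt_timer() above amounts to: the deadline exposed to the task is now + period, or simply now when the fudged flag is set (forcing an immediate yield), while the armed policing timer adds the slack window for coop tasks so a slightly late voluntary yield is not punished. A sketch of that computation under those assumptions, in nanoseconds rather than timespecs (names are illustrative):

struct sketch_timer_plan {
        long long exposed_deadline;     /* what the task is told */
        long long timer_expiry;         /* when policing actually fires */
};

static struct sketch_timer_plan sketch_plan_slice(long long now, long long period,
                                                  long long slack, int fudged,
                                                  int is_coop)
{
        struct sketch_timer_plan p;

        p.exposed_deadline = fudged ? now : now + period;
        p.timer_expiry = now + period + (is_coop ? slack : 0);
        return p;
}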
+ */ +static inline void charge_running_times(struct bvtqueue *bq, + struct task_struct *prev) +{ + int was_in_heap; + + was_in_heap = (int)prev->cf.task_sched_param->bheap_ptr; + + if (was_in_heap) { + remove_task_from_bvt_queue(bq,prev); + } + + update_virtual_times(prev); + + if(was_in_heap) { + insert_task_into_bvt_queue(bq,prev); + } +} +/* charge_running_times */ + +/* Called with runQ lock held */ +inline void tv_fairshare_now_adjusted(struct timeval *tv) +{ + struct timespec ts_now_adjusted; + //struct timespec ts_now; + //ts_now = ns_to_timespec(task_rq(current)->clock); + set_normalized_timespec(&ts_now_adjusted, + cpu_bq(smp_processor_id())->ts_now.tv_sec - wall_to_monotonic.tv_sec, + cpu_bq(smp_processor_id())->ts_now.tv_nsec - wall_to_monotonic.tv_nsec); + + tv->tv_sec = ts_now_adjusted.tv_sec; + tv->tv_usec = ts_now_adjusted.tv_nsec / NSEC_PER_USEC; +} + +/* choose_next_bvt: + * Picks the next most eligible guy to run, and also updates upcoming deadline + * Upcoming deadline is used to calculate the timeslice + */ +static struct task_struct* __sched choose_next_bvt(struct bvtqueue *bq) +{ + struct fairshare_sched_param *new_node = NULL; + struct task_struct *next_coop_task = NULL; + struct task_struct *next_overall_coop_task = NULL; + struct task_struct *next_earliest_deadline_task = NULL; + struct timeval tv_now; + struct timespec *next_virtual_time, *next_coop_virtual_time, *min_virt_time; + struct timespec ts_diff; + struct timespec ts_zero; + struct timespec ts_fudge; + struct task_struct *next = NULL; + set_normalized_timespec(&ts_zero, 0 , 0); + + /* Reset the fudged flag + * The fudged flag stays on cross execution contexts and gets reset here */ + bq->fudged_flag = false; + + if (likely(!heap_is_empty(bq->bvt_heap))) + new_node = (struct fairshare_sched_param*) heap_min_data(bq->bvt_heap); + else + return NULL; + + if (new_node->dom_type == DOM_BEST_EFFORT) + { + next = get_task(new_node); + }else { + + /* our coop algorithm runs on top of best effort. So we need to + * override the fairshare heuristics with our coop heuristics here + * We play a trick here, since there are basically one bvt domain + * and multiple coop domains, if the node is not a bvt node, + * it must be one of the coop nodes and the dom_type is actually + * represents the dom_id. 
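tv_fairshare_now_adjusted() above converts the scheduler's monotonic timestamp to wall-clock time by subtracting wall_to_monotonic, which is what lets the wall-time deadlines supplied through coop_poll be compared against the kernel's monotonic bookkeeping. A self-contained user-space illustration of the same relationship, assuming only POSIX clock_gettime (this is an analogy, not code from the patch):

#include <stdio.h>
#include <time.h>

/* Rough analogue: wall time == monotonic time + (realtime - monotonic) offset,
 * i.e. the inverse of subtracting wall_to_monotonic inside the kernel. */
int main(void)
{
        struct timespec mono, wall;

        clock_gettime(CLOCK_MONOTONIC, &mono);
        clock_gettime(CLOCK_REALTIME, &wall);

        printf("monotonic %lld.%09ld, wall %lld.%09ld, offset ~%lld s\n",
               (long long)mono.tv_sec, mono.tv_nsec,
               (long long)wall.tv_sec, wall.tv_nsec,
               (long long)(wall.tv_sec - mono.tv_sec));
        return 0;
}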
+ */ + choose_next_coop(&next_coop_task, new_node->dom_type); + + if (next_coop_task) + { + next = next_coop_task; + } else { + spin_unlock(&(container_of(bq,struct rq, bq)->lock)); + printk(KERN_ERR "oops, this was not anticipated!\n"); + BUG_ON(1); + } /* if */ + } /* if */ + + + /* Store the actual value for the min_virt_time for the system, before incorporating slack*/ + + min_virt_time = &(next->cf.task_sched_param->bvt_virtual_time); + find_nearest_global_deadline(&next_earliest_deadline_task); + set_normalized_timespec(&ts_fudge,0,bvt_sched_unfthreshold*NSEC_PER_USEC); + + /* Returns the wall time */ + tv_fairshare_now_adjusted(&tv_now); + + /* If the earliest deadline task has an expired deadline, choose it else choose the highest priority asap task + * The next_overall_coop_task is 'the' most eligible guy to run in the sytem now, in terms of timeliness + */ + + if (next_earliest_deadline_task) + { + /* Check if this deadline is expired or not */ + if(timeval_compare(&(next_earliest_deadline_task->cf.coop_t.dead_p.t_deadline), + &tv_now) < 0) + { + next_overall_coop_task = next_earliest_deadline_task; + } + else { + next_overall_coop_task = NULL; + } + } + else next_overall_coop_task = NULL; + + /* Fudging Logic*/ + if (next_overall_coop_task && ((next) != next_overall_coop_task)) { + + next_coop_virtual_time = &next_overall_coop_task->cf.task_sched_param->bvt_virtual_time; + next_virtual_time = &((next)->cf.task_sched_param->bvt_virtual_time); + + if (timespec_compare(next_coop_virtual_time, next_virtual_time) == 0 ) { + /* Fudge in this case, if ts_fudge is greater than zero */ + if (timespec_compare(&ts_fudge, &ts_zero) > 0 ) { + bq->fudge++; + next = next_overall_coop_task; + bq->fudged_flag = true; + + } + } + else if (timespec_compare(next_coop_virtual_time, next_virtual_time) > 0) { + ts_diff = timespec_sub(*next_coop_virtual_time, *next_virtual_time); + + /* Fudge if diff lesser than ts_fudge */ + if (timespec_compare(&ts_diff, &ts_fudge) <= 0) { + bq->fudge++; + next = next_overall_coop_task; + bq->fudged_flag = true; + } else { + /* Don't fudge*/ + bq->nofudge++; + #ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d dem overran thresh\n",next_overall_coop_task->pid); + oops_in_progress = 0; + #endif + /* Police the task for burning its temporary unfairness threshold*/ + demote_task(bq,next_overall_coop_task,COOP_POLL_VIRTUAL_TIME); + } /* else */ + + bq->count++; + + + } /* if */ + + + } /* if */ + + + /* Sanity Checks */ + if (timespec_compare(&bq->max_virtual_time, min_virt_time) < 0) { + spin_unlock(&(container_of(bq,struct rq, bq)->lock)); + printk(KERN_ERR "All hell hath break loose, max virt time less than min\n"); + printk(KERN_ERR "%u.%ld, %u.%ld\n",(unsigned)bq->max_virtual_time.tv_sec, + bq->max_virtual_time.tv_nsec,(unsigned)min_virt_time->tv_sec, + min_virt_time->tv_nsec); + BUG_ON(1); + } + + if ( ((next)->cf.task_sched_param->bvt_virtual_time.tv_sec < 0) || ((next)->cf.task_sched_param->bvt_virtual_time.tv_nsec < 0) ) { + spin_unlock(&(container_of(bq,struct rq, bq)->lock)); + printk(KERN_ERR "Virt time overflowed\n"); + BUG_ON(1); + } + + if (next->cf.bvt_t.pseudo_sleep) { + /* Task in psuedo sleep, has to be 'woken' up*/ + next->cf.bvt_t.pseudo_sleep = 2; + if (!next->se.on_rq) + activate_task(task_rq(next),next,1); + /* Use the psuedo_sleep flag to break out of the select call */ + } + + return next; +} +/* choose_next_bvt */ + +/* update_bvt_prev: This function does bvt updates on the bvt process + * we are context switching from @bq: the 
pointer to the per cpu bvt + * structure in the runqueue @prev: the pointer to the task_struct of + * the process we want to context switch from. + */ +static void __sched update_bvt_prev(struct rq *rq, struct task_struct *prev) +{ + struct bvtqueue *bq = &rq->bq; + fairshare_now(&bq->ts_now); + + if(!is_bvt(prev)) + return; + + /* A coop realtime guy is not allowed to yield the cpu without + * having any deadline info in the heaps*/ + if (prev->se.on_rq && + is_coop_realtime(prev) && + !prev->cf.coop_t.coop_deadline_heap_node && + !prev->cf.coop_t.coop_asap_heap_node) { + demote_task(bq,prev,COOP_POLL_BEST_EFFORT); +#ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d dem wrong yield\n",prev->pid); + oops_in_progress = 0; +#endif + } + + charge_running_times(bq,prev); + + +} +/* update_bvt_prev */ + + + +/* prepare_bvt_context_switch: + * handle context switch to a bvt task + * This is called from within schedule() + * Be careful: this function is called from within + * a critical section and thus irq's and preemptions are + * disabled. + * It is extremely important to optimize the code path + * in this function. + * Further, the corresponding runqueue for bq must be locked + * before calling this function. + * Assumption: next is in the runqueue + * of which bq is a member. This assumption is + * true when the function is called from schedule() + * @return 0: Don't arm timeout timer + * 1: Arm timeout timer + */ + +static struct task_struct* __sched pick_next_task_arm_timer(struct rq *rq) +{ + struct bvtqueue *bq = &rq->bq; + struct task_struct *next = NULL; + int was_in_heap; + struct timespec timer; + + if (unlikely(heap_is_empty(bq->bvt_heap))) { + return NULL; + } + + next = choose_next_bvt(bq); + if (NULL == next) + return NULL; + + bq->running_bvt_task = next; + next->cf.bvt_t.bvt_timeslice_start = bq->ts_now; + + /* BUG FIX: Remove this guy's entry from the timeout heap, + * since he is about to run + * Prevents the task from taking its own deadline + * into consideration while calculating + * the timeslice value*/ + was_in_heap = (int)next->cf.coop_t.coop_deadline_heap_node; + if (is_coop_realtime(next) && was_in_heap) { + remove_task_from_coop_queue(next,&(bq->cq[task_domain(next)]),1); + } + + calculate_bvt_period(next); + + /* Re-insert user level deadline onto the heap */ + if (is_coop_realtime(next) && was_in_heap) { + insert_task_into_timeout_queue(NULL,&(bq->cq[task_domain(next)]),next,0,7); + } + + /* Arm the timer now */ + schedule_dynamic_bvt_timer(&rq->bq,next); + + /* Update exec_start, to enable cfs process accounting */ + next->se.exec_start = rq->clock; + + /* Coop realtime temp tasks get automatically demoted*/ + if (task_domain(next) == DOM_REALTIME_TEMP) { + demote_task(bq,next,COOP_POLL_BEST_EFFORT); + } + + #ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d %u.%u %d %u.%u\n",next->pid,bq->curr_bvt_period.tv_sec,bq->curr_bvt_period.tv_nsec, + bq->fudged_flag,next->cf.task_sched_param->bvt_virtual_time.tv_sec,next->cf.task_sched_param->bvt_virtual_time.tv_nsec); + oops_in_progress = 0; + #endif + + return next; + +} +/* prepare_bvt_context_switch*/ + +void inline remove_task_from_bvt_queue(struct bvtqueue *bq, + struct task_struct *p) +{ + if (p->cf.task_sched_param->bheap_ptr) { + heap_delete(bq->bvt_heap,p->cf.task_sched_param->bheap_ptr); + p->cf.task_sched_param->bheap_ptr = NULL; + } + else { + spin_unlock(&(task_rq(p)->lock)); + printk(KERN_ERR "Task %d being del has no backpointer\n",p->pid); + BUG_ON(1); + } +} +/* 
remove_task_from_bvt_queue*/ + +/* set_tsk_as_besteffort: marks a task as a best effort task only + * called from one place: __do_set_bvt(); @tsk: the task_struct for + * the task + */ +void init_bvt_domain(struct bvtqueue *bq, + struct task_struct* tsk) +{ + /* multiple calls to this function should not + * mess with our accounting logic + */ + if (tsk->cf.bvt_dom == &(bq->bvt_domains[DOM_BEST_EFFORT])) + return; + + /* register this task as belonging to the + * best effort domain + */ + tsk->cf.bvt_dom = &(bq->bvt_domains[DOM_BEST_EFFORT]); + tsk->cf.dom_id = DOM_BEST_EFFORT; + + /* task virtual time is going to be the virtual time of + * the individual task + */ + tsk->cf.task_sched_param = &tsk->cf.bvt_t.private_sched_param; +} +/* set_tsk_as_besteffort */ + +/* do_policing: This is the main policing function. Important: It + * only goes one way, i.e., demotes a real coop task to a best effort + * task. So this function is only called for a real coop task and not + * a best effort task. The assertion at the very beginning is for the + * same purpose. + */ +void do_policing(struct bvtqueue *bq, struct task_struct *tsk) +{ + g_assert(tsk); + g_assert(bq); + g_assert(is_coop_realtime(tsk)); + + /* register this task as belonging to the + * best effort domain + */ + tsk->cf.bvt_dom = &(bq->bvt_domains[DOM_BEST_EFFORT]); + tsk->cf.dom_id = DOM_BEST_EFFORT; + + /* task virtual time is going to be the virtual time of the + * individual task,update pvt vt to be the domain's vt at this point + */ + tsk->cf.bvt_t.private_sched_param.bvt_virtual_time = tsk->cf.task_sched_param->bvt_virtual_time; + /* BUG FIX: Reset the heap backpointer*/ + tsk->cf.bvt_t.private_sched_param.bheap_ptr = NULL; + tsk->cf.task_sched_param = &tsk->cf.bvt_t.private_sched_param; + + #if defined(CONFIG_SMP) + unpin_coop_task(tsk); + #endif + clear_coop_task(tsk); + +} /* do_policing */ + +void demote_task(struct bvtqueue *bq, struct task_struct *p, enum COOP_POLL_STATUS reason) +{ + int dom_id = task_domain(p); + + if (!is_coop_realtime(p)) + return; + + + remove_task_from_coop_queue(p,&(bq->cq[task_domain(p)]),0); + + /* Bug Fix: p->state is unreliable, running tasks may have state + * as sleeping. Use on_rq flag instead*/ + if (p->se.on_rq || p->cf.bvt_t.pseudo_sleep) { + bq->bvt_domains[dom_id].num_tasks--; + bq->bvt_domains[dom_id].num_weights -= p->se.load.weight; + + if (bq->bvt_domains[task_domain(p)].num_tasks < 0 ) { + spin_unlock(&(container_of(bq,struct rq, bq)->lock)); + printk(KERN_ERR "Number of task went under \n"); + BUG_ON(1); + } + + if(task_domain(p) == DOM_REALTIME_TEMP) + remove_task_from_bvt_queue(bq,p); + else if(bq->bvt_domains[task_domain(p)].num_tasks <= 0) + remove_task_from_bvt_queue(bq,p); + } + + bq->reason = reason; + bq->adj++; + bq->noadj=p->pid; + do_policing(bq,p); + + /* Bug Fix: p->state is unreliable, running tasks may have state + * as sleeping*/ + if (p->se.on_rq || p->cf.bvt_t.pseudo_sleep) { + /* insert task back into heap */ + insert_task_into_bvt_queue(bq,p); + bq->bvt_domains[DOM_BEST_EFFORT].num_tasks++; + bq->bvt_domains[DOM_BEST_EFFORT].num_weights += p->se.load.weight; + } + +} +/* bvt_timeout_gt: The comparison function that compares two nodes in + * the bvt heap. The first comparison parameter is the virtual + * time. The node having the smaller virtual time gets to run next. + * If the two nodes have the same virtual time, then I use a second + * key to sort the nodes, their preempt time. 
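do_policing() above demotes a coop task by pointing its scheduling parameters back at the per-task private copy, carrying the domain's virtual time over and clearing the stale heap backpointer (the bug fix noted in the code). A compact model of that pointer switch, with hypothetical stand-in types:

struct sketch_sched_param {
        long long vt;
        void *heap_backptr;
};

struct sketch_task {
        struct sketch_sched_param private_param;   /* per-task copy */
        struct sketch_sched_param *active_param;   /* may point at a shared domain param */
};

static void sketch_demote_to_best_effort(struct sketch_task *t)
{
        /* Inherit the domain's virtual time so fairness is preserved,
         * then detach from the shared parameter block. */
        t->private_param.vt = t->active_param->vt;
        t->private_param.heap_backptr = NULL;   /* stale pointer must not survive */
        t->active_param = &t->private_param;
}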
+ */ + +static gboolean bvt_timeout_gt(heap_key_t a, heap_key_t b) +{ + struct fairshare_sched_param *node1 = a; + struct fairshare_sched_param *node2 = b; + struct timespec vt1,vt2; + + vt1 = (node1->bvt_virtual_time); + vt2 = (node2->bvt_virtual_time); + + if (!timespec_compare(&vt1, &vt2)) { + + /* FIFO ordering, LIFO ordering results in a live deadlock with ping ponging tasks */ + return (timespec_compare(&node1->insertion_ts, + &node2->insertion_ts)>0); + } + else + { + return(timespec_compare(&vt1,&vt2) > 0); + } +} +/* bvt_timeout_gt */ + + +/* bvt_proc_init: + * initializes the process specific data structures for + * the bvt kernel process scheduler + */ +inline void bvt_proc_init(struct task_struct *p) +{ + memset(&p->cf.bvt_t,0, sizeof(struct bvt_struct)); + memset(&p->cf.bvt_t.private_sched_param, + 0, + sizeof(struct fairshare_sched_param)); + + p->cf.bvt_t.private_sched_param.dom_type = DOM_BEST_EFFORT; + p->cf.bvt_dom = NULL; + + p->cf.bvt_t.me = p; + p->cf.dom_id = -1; + INIT_LIST_HEAD(&p->cf.bvt_procs); +} +/* bvt_proc_init*/ + +static int show_bvtstat(struct seq_file *seq, void *v) { + int cpu; + seq_printf(seq, "timestamp %lu\n", jiffies); + seq_printf(seq, "global bvt period = %ld000 nsec\n", (long) bvt_sched_granularity); + + for_each_online_cpu(cpu) { + struct bvtqueue *bq; + bq = cpu_bq(cpu); + seq_printf(seq, "current bvt period (cpu %d) = %ld nsec\n", + cpu,(long) timespec_to_ns(&bq->curr_bvt_period)); + seq_printf(seq, "Policed: %ld\n", bq->adj); + seq_printf(seq, "Last policed pid = %lu\n",bq->noadj); + seq_printf(seq, "fudge: %ld\n", bq->fudge); + seq_printf(seq, "nofudge: %ld\n", bq->nofudge); + seq_printf(seq, "Current fudge threshold in nsecs = %ld000\n", bvt_sched_unfthreshold); + } + + seq_printf(seq, "minimum bvt period = %ld usec\n", (long) BVT_MIN_TIMESLICE); + return 0; +} + +void detach_coop_fairshare_sched(struct task_struct* tsk) +{ + if (!is_bvt(tsk)) return; +} + +static int bvtstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + + char *buf; + + struct seq_file *m; + int res; + + /* don't ask for more than the kmalloc() max size, currently 128 KB */ + if (size > 128 * 1024) + size = 128 * 1024; + + buf = kmalloc(size, GFP_KERNEL); + + if (!buf) + return -ENOMEM; + res = single_open(file, show_bvtstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} /* bvtstat_open */ + +struct file_operations proc_bvtstat_operations = { + .open = bvtstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void task_new_faircoop(struct rq *rq, struct task_struct *p) +{ +bvt_borrow(rq,p,0); +} + +static void dequeue_task_faircoop(struct rq *rq, struct task_struct *p,int sleep) +{ + + struct bvtqueue *bq = &rq->bq; + + if(!is_bvt(p)) + return; + + #ifdef CONFIG_SMP + /* Remove the task from the per cpu task list*/ + list_del(&p->cf.bvt_procs); + #endif + + /* Pseudo sleep special cases*/ + /* Don't change our pvt data structures in case of pseudo sleep*/ + if (p->cf.bvt_t.pseudo_sleep) { + p->se.on_rq = 0; + return; + } + /* End of pseudo sleep special cases*/ + + /* Decrement the domain count*/ + bq->bvt_domains[task_domain(p)].num_tasks--; + bq->bvt_domains[task_domain(p)].num_weights -= p->se.load.weight; + if (bq->bvt_domains[task_domain(p)].num_tasks < 0) { + spin_unlock(&rq->lock); + printk(KERN_ERR "Number of tasks went under %d %d\n",p->pid, + task_domain(p)); + BUG_ON(1); + } + + + 
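bvt_timeout_gt() above orders the heap primarily by virtual time and breaks ties FIFO by insertion timestamp; as the comment notes, LIFO tie-breaking can livelock with two tasks ping-ponging. The same ordering written as a conventional three-way comparator, usable with qsort in user space (illustrative only, not the kernel comparator):

#include <stdlib.h>

struct sketch_vt_key {
        long long vt_ns;            /* primary: virtual time */
        long long inserted_ns;      /* secondary: FIFO insertion time */
};

static int sketch_vt_key_cmp(const void *a, const void *b)
{
        const struct sketch_vt_key *x = a, *y = b;

        if (x->vt_ns != y->vt_ns)
                return x->vt_ns < y->vt_ns ? -1 : 1;
        /* Equal virtual times: earlier insertion wins (FIFO). */
        if (x->inserted_ns != y->inserted_ns)
                return x->inserted_ns < y->inserted_ns ? -1 : 1;
        return 0;
}

/* e.g. qsort(keys, n, sizeof(keys[0]), sketch_vt_key_cmp); */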
if (is_coop_realtime(p)) + { + /* note that policing has not yet taken place, so no + * matter whether we are cooperatively sleeping or + * not, we remove our nodes from coop heap because the task + * is getting deactivated. + */ + remove_task_from_coop_queue(p,&(bq->cq[task_domain(p)]),0); + + if(task_domain(p) == DOM_REALTIME_TEMP) + remove_task_from_bvt_queue(bq,p); + else if(bq->bvt_domains[task_domain(p)].num_tasks <= 0) + remove_task_from_bvt_queue(bq,p); + + /* Check for policing*/ + if(!p->cf.coop_t.is_well_behaved) { + bq->reason = COOP_POLL_SLEEP; + bq->adj++; + bq->noadj=p->pid; + #ifdef DEBUG_FAIRCOOP + oops_in_progress = 1; + printk(KERN_ERR "%d dem bad sleep\n",p->pid); + oops_in_progress = 0; + #endif + do_policing(bq,p); + } + else { + insert_task_into_sleep_queue(NULL,&(bq->cq[task_domain(p)]),p,0); + } /* else */ + } else { + remove_task_from_bvt_queue(bq,p); + } /* else */ + + p->se.on_rq = 0; +} + +#ifdef CONFIG_SMP +static struct task_struct *load_balance_next_faircoop(void *arg) +{ + struct list_head *curr_pos; + struct coop_fairshare_struct *curr_cf; + struct bvtqueue *bq = (struct bvtqueue*)arg; + + curr_pos = bq->list_pos; + bq->list_pos = curr_pos->next; + if(curr_pos != &bq->bvt_list) { + curr_cf = list_entry(curr_pos, struct coop_fairshare_struct,bvt_procs); + return container_of(curr_cf,struct task_struct, cf); + } + else + return NULL; +} + +static struct task_struct *load_balance_start_faircoop(void *arg) +{ + struct bvtqueue *bq = (struct bvtqueue*)arg; + /* Initialize the iterator */ +#if 0 + bq->list_pos = &(bq->bvt_list.next); +#else + bq->list_pos = bq->bvt_list.next; +#endif + return NULL; +} + + +static unsigned long +load_balance_faircoop(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + struct rq_iterator faircoop_rq_iterator; + long rem_load_move = max_load_move; + + faircoop_rq_iterator.start = load_balance_start_faircoop; + faircoop_rq_iterator.next = load_balance_next_faircoop; + faircoop_rq_iterator.arg = (void*)&(busiest->bq); + + rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, + &faircoop_rq_iterator); + + return max_load_move - rem_load_move; +} + +static int +move_one_task_faircoop(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct rq_iterator faircoop_rq_iterator; + faircoop_rq_iterator.start = load_balance_start_faircoop; + faircoop_rq_iterator.next = load_balance_next_faircoop; + faircoop_rq_iterator.arg = (void*)&(busiest->bq); + + if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, &faircoop_rq_iterator)) + return 1; + else + return 0; +} + +static int select_task_rq_faircoop(struct task_struct *p, int sync) +{ + /* No passive load balancing for now*/ + return task_cpu(p); +} + +static void join_domain_faircoop(struct rq* rq) +{ +} + +static void leave_domain_faircoop(struct rq* rq) +{ +} +#endif + +static void task_tick_faircoop(struct rq *rq, struct task_struct *p, int queued) +{ +} + +static void set_curr_task_faircoop(struct rq *rq) +{ + fairshare_now(&rq->curr->cf.bvt_t.bvt_timeslice_start); + rq->curr->se.exec_start = rq->clock; + //rq->curr->cf.bvt_t.bvt_timeslice_start = ns_to_timespec(sched_clock()); + resched_task(rq->curr); +} + +static void yield_task_faircoop(struct rq *rq) +{ + /* Dequeue, and then enqueue the task + * BUG FIX: Put on sleeping flag, a 
coop poll guy yielding the cpu + * outside of coop poll should be policed*/ + struct task_struct* temp; + temp = rq->curr; + dequeue_task_faircoop(rq,temp,1); + bvt_borrow(rq,temp,1); +} + +/* Called with runQ lock held */ +static void check_preempt_faircoop(struct rq* rq, struct task_struct *p) +{ + struct timespec ts_delta; + struct timespec ts_slack_io_lat; + struct timespec now; + struct timespec ts_zero; + struct timespec io_lat; + struct timespec ts_new_deadline; + struct timeval tv_now; + + memset(&ts_zero,0,sizeof(struct timespec)); + /* Preempt the current task if its running in 'Fair execution mode' + * Check if there is a task which just woke up and needs to be run */ + + /* Sanity check, don't try to re-schedule one's own self*/ + if (p == rq->curr) { + return; + } + + /* Allow realtime tasks to unconditionally preempt you*/ + if (unlikely(rt_prio(p->prio))) { + resched_task(rq->curr); + return; + } + + if (is_coop_realtime(p) && + (p->cf.bvt_t.about_to_sleep || p->cf.bvt_t.pseudo_sleep)) { + + + #ifdef DEBUG_FAIRCOOP + do_gettimeofday(&tv_now); + oops_in_progress = 1; + printk(KERN_ERR "[%u.%u] %d %d preempt\n",tv_now.tv_sec,tv_now.tv_usec,rq->curr->pid,p->pid); + oops_in_progress = 0; + #endif + + fairshare_now(&now); + + if (!is_coop_realtime(rq->curr)) { + /* Even for non cooprealtime tasks, allow them to run for the minimum timeslice first*/ + ts_delta = timespec_sub(now, p->cf.bvt_t.bvt_timeslice_start); + if (timespec_compare(&ts_delta,&ts_bvt_min_timeslice) > 0) + resched_task(rq->curr); + else { + /* Reprogram the timer to fire once ts_bvt_min_timeslice is done*/ + ts_delta = timespec_sub(ts_bvt_min_timeslice,ts_delta); + bvt_timer_cancel(&rq->bq.bvt_timer,rq); + set_normalized_timespec(&ts_delta,ts_delta.tv_sec + now.tv_sec, ts_delta.tv_nsec + now.tv_nsec); + rq->bq.bvt_timer.expires = timespec_to_ktime(ts_delta); + hrtimer_start(&rq->bq.bvt_timer,rq->bq.bvt_timer.expires,HRTIMER_MODE_ABS); + rq->curr->cf.coop_t.deadline = ts_delta; + } + } + else { + /* Do not directly preempt coop realtime tasks, give them a chance to voluntarily yield the cpu + Calculate, how long till the the bvt timer fires*/ + ts_delta = timespec_sub(ktime_to_timespec(rq->bq.bvt_timer.expires),now); + set_normalized_timespec(&ts_slack_io_lat, rq->bq.ts_io_lat.tv_sec, rq->bq.ts_io_lat.tv_nsec + COOP_DEAD_SLACK); + /* Re-program the timer, if ts_delta > io_lat*/ + if (timespec_compare(&ts_delta, &ts_slack_io_lat) > 0 ) { + #ifdef DEBUG_FAIRCOOP + do_gettimeofday(&tv_now); + oops_in_progress = 1; + printk(KERN_ERR "[%u.%u] Reprog timer %d %d\n",tv_now.tv_sec,tv_now.tv_usec,rq->curr->pid,p->pid); + oops_in_progress = 0; + #endif + + bvt_timer_cancel(&rq->bq.bvt_timer,rq); + set_normalized_timespec(&ts_slack_io_lat, ts_slack_io_lat.tv_sec + now.tv_sec, ts_slack_io_lat.tv_nsec + now.tv_nsec); + rq->bq.bvt_timer.expires = timespec_to_ktime(ts_slack_io_lat); + /* Use the irqsafe_no_softirq callback mode*/ + hrtimer_start(&rq->bq.bvt_timer,rq->bq.bvt_timer.expires,HRTIMER_MODE_ABS); + //rq->bq.bvt_timer_active = 1; + /* Reprogram the recorded deadline also*/ + set_normalized_timespec(&ts_new_deadline,rq->bq.ts_io_lat.tv_sec + now.tv_sec, rq->bq.ts_io_lat.tv_nsec + now.tv_nsec); + //if (timespec_compare(&ts_new_deadline, &rq->curr->cf.coop_t.deadline) < 0) + rq->curr->cf.coop_t.deadline = ts_new_deadline; + + /* This will cause current to yield in its next call to coop poll */ + rq->bq.rendezvous = 0; + /* stats */ + /* Waker records task that was + running when IO caused + another task to wakeup */ + 
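check_preempt_faircoop() above never preempts a running coop task outright on a wakeup; if the armed slice timer is further away than the expected I/O latency plus a small slack, it pulls the timer (and the recorded deadline) in to now plus that bound, giving the coop task one last chance to yield voluntarily. The decision reduces to roughly the following sketch (nanosecond arithmetic, names are assumptions):

/* Returns the new timer expiry, or the current one if no reprogramming
 * is needed. now, cur_expiry, io_lat and slack are nanoseconds. */
static long long sketch_maybe_pull_in_timer(long long now, long long cur_expiry,
                                            long long io_lat, long long slack)
{
        long long bound = io_lat + slack;

        if (cur_expiry - now > bound)
                return now + bound;     /* shorten the grace period */
        return cur_expiry;              /* already close enough; leave it */
}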
rq->bq.cq[task_domain(rq->curr)].num_rendezvous_waker++; + /* Wakee is the task that + waked due to IO, which will + cause the next rendezvous + to yield */ + rq->bq.cq[task_domain(p)].num_rendezvous_wakee++; + } + } + } +} + +static void prio_changed_faircoop(struct rq* this_rq, struct task_struct *task, int oldprio, int running) +{ +} + + +static void switched_from_faircoop(struct rq *this_rq, struct task_struct *task, int running) +{ +} + +static void switched_to_faircoop(struct rq *this_rq, struct task_struct *task, int running) +{ +} + + +#endif + + diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0f3c191..4abae1a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1319,7 +1319,11 @@ static void set_curr_task_rt(struct rq *rq) } static const struct sched_class rt_sched_class = { +#if defined(CONFIG_SCHED_COOPREALTIME) + .next =&faircoop_sched_class, +#else .next = &fair_sched_class, +#endif .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2911665..f5a39a6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -229,7 +229,44 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ #endif +#if defined(CONFIG_SCHED_COOPREALTIME) +static suseconds_t min_bvt_sched_granularity = 100; /* 100 usecs */ +static suseconds_t max_bvt_sched_granularity = 1000000; /* 1 second */ +#endif + static struct ctl_table kern_table[] = { +#if defined(CONFIG_SCHED_COOPREALTIME) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "bvt_sched_period_us", + .data = &bvt_sched_granularity, + .maxlen = sizeof(suseconds_t), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_bvt_sched_granularity, + .extra2 = &max_bvt_sched_granularity, + }, + { + .ctl_name = KERN_SCHED_TRACING, + .procname = "bvt_sched_tracing", + .data = &bvt_sched_tracing, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "bvt_sched_unfairness_us", + .data = &bvt_sched_unfthreshold, + .maxlen = sizeof(suseconds_t), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_bvt_sched_granularity, + .extra2 = &max_bvt_sched_granularity, + }, +#endif #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, @@ -299,14 +336,7 @@ static struct ctl_table kern_table[] = { #endif { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, .proc_handler = &sched_rt_handler, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_rt_runtime_us", .data = &sysctl_sched_rt_runtime, .maxlen = sizeof(int), diff --git a/lib/Makefile b/lib/Makefile index 74b0cfb..967ab57 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for some libs needed in the kernel. 
# -lib-y := ctype.o string.o vsprintf.o cmdline.o \ +lib-y := heap.o ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ diff --git a/lib/heap.c b/lib/heap.c new file mode 100644 index 0000000..3bd8ac0 --- /dev/null +++ b/lib/heap.c @@ -0,0 +1,464 @@ +/* Nov 2006: This is a generic heap library has been ported from the userlevel + * heap implementation in Qstream (http://www.qstream.org) + * Parts of the code uses userlevel glib library keywords that has + * been ported to traditional C syntax using glib.h definitions + * - Mayukh Saubhasik, mayukh@cs.ubc.ca + * - Anirban Sinha, anirbans@cs.ubc.ca + * - Charles Krasic, krasic@cs.ubc.ca + * + * Mar 2007: We make modifications so that: + * Now we have two variations of heap_insert() function. + * + * (a) normal heap_insert(): checks if heap has enough space + * and then grows heap if required provided we are not in atomic() or + * interrupt context. If we are in interrupt or atomic region,it panics! + * + * (b) heap_insert_nogrow(): checks to see if heap has enough space + * and then panics if the space is insufficient. Does not attempt to + * grow the heap in any situation. + * + * In both functions, if heap has enogh space, all memory + * allocations for heap is done during heap initialization, it is no + * longer required to allocate memory for heap node at every call. + * This makes the heap + * implementations scale to situations when heap_insert() is + * called from within atomic blocks. Since we prefer not to use per-CPU + * page cache for heap memory allocations, this is important. + * In atomic sections, essentially we would call the "nogrow" version of + * the heap_insert() function call. + * Similarly heap nodes are not actually deleted at every call to + * heap_delete(). The nodes are deleted once and for all when + * heap_destroy() is called. + * It is hoped that these reduced memory operations will make the + * heap implementation more efficient within the kernel. + * - Ani, anirbans@cs.ubc.ca + */ + +#include +#include +#include +#include + +#define HEAP_EXPANSION_FACTOR 2 + +#ifndef MAX +#define MAX(a, b) (((a) > (b)) ? 
(a) : (b)) +#endif + +//static inline void heap_is_correct(heap_t *heap) __attribute__((always_inline)); + +#if defined(FAIRCOOP_HEAP_DEBUG) + +#define heap_is_correct(heap) do { \ + gint __i; \ + if(bvt_sched_tracing == 3 && ( !irqs_disabled()) ) {\ + printk(KERN_ERR "Preemption/Irq enabled at %s line %d, \ + irq = %d, preempt = %d\n", __func__, __LINE__, irqs_disabled(), preempt_count()); \ + BUG(); \ + } \ + for(__i=1;__i<=heap->size;__i++) \ + if(!(heap->nodes[__i]->key)) { \ + printk(KERN_ERR "Heap got fucked at %s line %d, index with null entry = %d,heap size = %d\n",__func__,__LINE__, __i,heap->size); \ + BUG(); \ + } \ + } while(0) + +#else +#define heap_is_correct(heap) do { \ + gint i; \ + if(( !irqs_disabled()) ) {\ + printk(KERN_ERR "Preemption/Irq enabled at %s line %d, \ + irq = %d, preempt = %d\n", __func__, __LINE__, irqs_disabled(), preempt_count()); \ + BUG(); \ + } \ + for (i = 1; (i * 2 + 1) < heap->size; i++) { \ + if (heap->key_comp(heap->nodes[i]->key, heap->nodes[i * 2]->key) \ + || heap->key_comp(heap->nodes[i]->key, \ + heap->nodes[i * 2 + 1]->key)) { \ + printk(KERN_ERR "%s: heap has a problem at i = %d %d %d\n", \ + __func__, i,heap->size,smp_processor_id()); \ + printk(KERN_ERR "%u.%u\n",((struct fairshare_sched_param*)(heap->nodes[i]->key))->bvt_virtual_time.tv_sec,((struct fairshare_sched_param*)(heap->nodes[i]->key))->bvt_virtual_time.tv_nsec); \ + print_heap(heap); \ + BUG(); \ + } /* for */ \ + } /* for */ \ + } while(0) +/* heap_is_correct */ +#endif + +static inline struct task_struct* get_task(struct fairshare_sched_param* sp) +{ + struct bvt_struct *bs; + if (sp->dom_type == DOM_BEST_EFFORT) { + bs = container_of(sp,struct bvt_struct,private_sched_param); + return bs->me; + }else /* pointer to real coop sched_param. They are not specifically associated + * with any task */ + return NULL; +} + +void print_heap(heap_t *heap) +{ + gint j; + struct task_struct *tsk; + for(j=1;j<=heap->size;j++) { + tsk = get_task(heap->nodes[j]->key); + if (tsk) + printk(KERN_ERR "%d %d\n",tsk->pid,task_cpu(tsk)); + else + printk(KERN_ERR "Coop\n"); + printk(KERN_ERR "%d %u.%u %u.%u\n",j,((struct fairshare_sched_param*)(heap->nodes[j]->key))->bvt_virtual_time.tv_sec,((struct fairshare_sched_param*)(heap->nodes[j]->key))->bvt_virtual_time.tv_nsec,((struct fairshare_sched_param*)(heap->nodes[j]->key))->insertion_ts.tv_sec,((struct fairshare_sched_param*)(heap->nodes[j]->key))->insertion_ts.tv_nsec); + } +} + +/* Allocate new heap node */ +static inline heap_node *new_heap_node(heap_t *heap, heap_key_t key, heap_data_t data, gint index) +{ + /* The heap pointer is really not needed in this version of the code + * but is intentionally kept as a parameter so that later we may + * be able to optimise the memory allocation by allocating a chunk + * of memory using the slab allocator and using smaller chunks from + * there as and when new memory allocation is needed for the nodes. 
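The heap_is_correct() checks above verify the heap property over the 1-indexed node array: no parent may compare "greater" than either of its children under key_comp. For a plain array-backed min-heap the invariant checker is essentially the following (illustrative sketch, not the kernel macro):

/* 1-indexed array heap: keys[1..size]. Returns 1 if the min-heap property
 * holds for every parent/child pair, 0 otherwise. */
static int sketch_heap_is_valid(const long long *keys, int size)
{
        int i;

        for (i = 1; i * 2 <= size; i++) {
                if (keys[i] > keys[i * 2])
                        return 0;
                if (i * 2 + 1 <= size && keys[i] > keys[i * 2 + 1])
                        return 0;
        }
        return 1;
}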
+ */ + + heap_node *node; + + node = g_new_atomic(sizeof(heap_node) * 1); + + /* on failure to allocate memory for the new node */ + if (!node) { + /* handle memory allocation error in the caller */ + return NULL; + } + + node->key = key; + node->data = data; + node->index = index; + return(node); +} /* new_heap_node */ + + +/* Deallocate memory for the new node */ + +static inline void free_heap_node(heap_t *heap, heap_node *node) +{ + + /* The heap pointer is really not needed in this version of the code + * but is intentionally kept as a parameter so that later we may + * be able to optimise the memory allocation by allocating a chunk + * of memory using the slab allocator and using smaller chunks from + * there as and when new memory allocation is needed for the nodes. + */ + + g_assert(node); + g_free(node); + +} /* free_heap_node */ + +/* Make the heap bigger */ + +static inline int grow_heap(heap_t *heap, gint capacity) +{ + int i; + + g_assert(heap); + + if(heap->capacity > capacity) return 0; + + /* Note: I have modified the original g_realloc prototype. + * It now takes the old allocated size as a parameter. + * Is there a way to get the amount of memory allocated to the old + * pointer automatically? + */ + + heap->nodes = g_realloc(heap->nodes, (heap->capacity + 1) * sizeof(*heap->nodes), + (capacity + 1) * sizeof(*heap->nodes)); + + if(unlikely(!heap->nodes)) return 1; + + + for(i=heap->capacity+1; i<=capacity;i++) { + heap->nodes[i] = new_heap_node(heap, NULL, NULL, i); + if(!heap->nodes[i]) return 1; + } + + heap->capacity = capacity; + return 0; + +} /* grow_heap */ + +/* Create a new heap */ + +heap_t * create_heap(heap_key_comp_func key_comp, gint initial_capacity) +{ + int i; + heap_t *heap; + + g_assert(key_comp); + + heap = g_new0_atomic(heap_t, 1); + + if(likely(heap)) { + g_assert(key_comp); + heap->key_comp = key_comp; + heap->capacity = MAX(1,initial_capacity); + heap->size = 0; + + /* reserve one extra memory for the sentinel node */ + heap->nodes = g_new_atomic(sizeof (heap_node*) * (initial_capacity + 1)); + + if(unlikely (!heap->nodes)) return NULL; /* memory allocation failure */ + + for(i=1;i<=initial_capacity; i++) + { + heap->nodes[i] = new_heap_node(heap, NULL, NULL, i); + if (unlikely(!heap->nodes[i])) return NULL; + } + + return(heap); + } + else /* memory allocation failed */ + return NULL; /* It is the responsibility of the caller to + * take care of the allocation failure + */ + +} /* create_heap */ + +int heap_ensure_capacity(heap_t *heap, gint capacity) +{ + if (heap->capacity >= capacity) { + return 0; + } /* if */ + + return grow_heap(heap, capacity); +} /* heap_ensure_capacity */ + +/* Cannot use this, as this may call realloc, which may sleep, This code is run in interrupt context */ +#if 0 +heap_node *heap_insertt(heap_t *heap, heap_key_t key, heap_data_t data) +{ + gint i; + heap_node* node; + + heap_is_correct(heap); + + if (heap->size == heap->capacity) { + + if (unlikely (grow_heap(heap, heap->capacity * HEAP_EXPANSION_FACTOR))) + return NULL; + /* Error: memory allocation failure while expanding heap */ + + } /* if */ + + heap->size++; + + node = heap->nodes[heap->size]; + + for (i = heap->size; + i > 1 && heap->key_comp(heap->nodes[i/2]->key, key); + i /= 2) { + heap->nodes[i] = heap->nodes[i / 2]; + heap->nodes[i]->index = i; + + } /* for */ + + /* Notice: No memory allocations !!! 
*/ + + heap->nodes[i] = node; + heap->nodes[i]->key = key; + heap->nodes[i]->data = data; + heap->nodes[i]->index = i; + + heap_is_correct(heap); + + return(heap->nodes[i]); + + +} /* heap_insert */ +#endif + +heap_node *heap_insertt(heap_t *heap, heap_key_t key, heap_data_t data) +{ + gint i; + heap_node* node; + + g_assert(heap); + g_assert(key); + g_assert(data); + + heap_is_correct(heap); + + if (heap->size == heap->capacity) { + printk(KERN_ERR "insufficient space in heap"); + BUG(); + + } /* if */ + + heap->size++; + + node = heap->nodes[heap->size]; + + for (i = heap->size; + i > 1 && heap->key_comp(heap->nodes[i/2]->key, key); + i /= 2) { + heap->nodes[i] = heap->nodes[i / 2]; + heap->nodes[i]->index = i; + + } /* for */ + + /* Notice: No memory allocations !!! */ + + heap->nodes[i] = node; + heap->nodes[i]->key = key; + heap->nodes[i]->data = data; + heap->nodes[i]->index = i; + + heap_is_correct(heap); + + return(heap->nodes[i]); + + +} /* heap_insert_nogrow */ + + +/* Depending on whether the list is sorted in + * ascending or descending order, this function + * deletes minimum or maximum key. The key deleted is always + * the first key in the heap + */ + +heap_data_t heap_delete_min(heap_t *heap) +{ + heap_data_t result; + + if (heap_is_empty(heap)) { + return NULL; + } /* if */ + + result = heap->nodes[1]->data; + + heap_delete(heap, heap->nodes[1]); + + return(result); +} /* heap_delete_min */ + + +/* The following function does not necessarily return the + * smaller of the two children! It depends on the user of + * the function whether the smaller or the larger of the + * two children is obtained based on his implementation + * of the key_comp function. + * If key_comp(key1,key2)=true when key1 > key2 and + * =false otherwise + * the function actually returns the smaller of the two + * children. + * Otheriwise it returns the larger of the two children. + */ + +static __inline__ gint heap_smaller_child(heap_t *heap, gint i) +{ + gint child; + + child = i * 2; + if((child < heap->size) && + heap->key_comp(heap->nodes[child]->key, + heap->nodes[child + 1]->key) ) { + child++; + } /* if */ + return(child); +} +/* heap_smaller_child */ + +/* heap_delete: + * This function deletes an arbitrary node from + * the heap. + * Be careful about the polarity of the comparison + * function if you are trying to follow the algorithm. + */ + +void heap_delete(heap_t *heap, heap_node *node) +{ + gint i; + gint child; + gint parent; + heap_node *replacement; + heap_key_t key; + + g_assert(node); + g_assert(heap); + + heap_is_correct(heap); + + if (node->index > heap->size || node->index < 1) { + printk(KERN_ERR "Incorrect index for heap node %d %d\n",heap->size,node->index); + BUG(); + } + + /* Replace the deleted value with whatever was last */ + replacement = heap->nodes[heap->size]; + key = replacement->key; + + i = node->index; + + /* Replace the node with the previous last value, but find a + * spot that maintains the heap ordering rules. Only one of + * the following loops will execute (if at all). 
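heap_insertt() above (the no-grow variant) places the new key by sifting a hole up from the end of the array and reuses the preallocated node already sitting at index size+1, so nothing is allocated on this path. The core sift-up on a plain 1-indexed array looks like this (illustrative sketch under those assumptions):

/* Insert key into a 1-indexed min-heap of current size *size; the caller
 * guarantees capacity, mirroring the "nogrow" behaviour above. */
static void sketch_heap_sift_up_insert(long long *keys, int *size, long long key)
{
        int i = ++(*size);

        /* Walk the hole toward the root while the parent is larger. */
        while (i > 1 && keys[i / 2] > key) {
                keys[i] = keys[i / 2];
                i /= 2;
        }
        keys[i] = key;
}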
*/ + + parent = i/2; + + while((i > 1) && + heap->key_comp(heap->nodes[parent]->key, key)) { + + /* we need to demote some ancestors */ + + heap->nodes[i] = heap->nodes[parent]; + heap->nodes[i]->index = i; + i = parent; + parent /= 2; + } /* while */ + + /* Choose the smaller child */ + child = heap_smaller_child(heap, i); + + while((child <= heap->size) && + heap->key_comp(key, heap->nodes[child]->key)) { + + /* promote the smallest decendants */ + + heap->nodes[i] = heap->nodes[child]; + heap->nodes[i]->index = i; + i = child; + child = heap_smaller_child(heap, i); + } /* while */ + + heap->nodes[i] = replacement; + heap->nodes[i]->index = i; + + heap->size--; + + /* Instead of deallocating memory, we simply mark the + * old memory as available and update pointers + */ + + node->key = NULL; + node->data = NULL; + node->index = heap->size +1; + heap->nodes[heap->size+1] = node; + heap_is_correct(heap); + +} /* heap_delete */ + +void +destroy_heap(heap_t *heap) +{ + gint i; + + for (i = heap->capacity; i > 0; i--) { + free_heap_node(heap, heap->nodes[i]); + } /* for */ + + g_free(heap->nodes); + g_free(heap); +} /* destroy_heap */ + +
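heap_delete() above removes an arbitrary node by overwriting it with the last element and then restoring order in whichever single direction is violated: the replacement is promoted toward the root if it beats an ancestor, otherwise demoted past smaller children, and the freed node is parked at index size+1 for reuse rather than freed. The same shape on a plain 1-indexed array, as an illustrative sketch:

/* Delete the element at position pos (1-indexed) from a min-heap of *size keys. */
static void sketch_heap_delete_at(long long *keys, int *size, int pos)
{
        long long key = keys[*size];    /* replacement: last element */
        int i = pos, child;

        (*size)--;

        /* Either promote the replacement toward the root... */
        while (i > 1 && keys[i / 2] > key) {
                keys[i] = keys[i / 2];
                i /= 2;
        }
        /* ...or demote it toward the leaves; at most one loop actually moves. */
        for (;;) {
                child = i * 2;
                if (child > *size)
                        break;
                if (child + 1 <= *size && keys[child + 1] < keys[child])
                        child++;        /* pick the smaller child */
                if (keys[child] >= key)
                        break;
                keys[i] = keys[child];
                i = child;
        }
        keys[i] = key;
}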