/*
 *  arch/i386/kernel/mxt.c
 *
 *  Memory eXpansion Technology (MXT) support module.
 *
 *  MXT is hardware that doubles the effective size
 *  of the Linux memory: 256MB becomes 512MB, 1GB becomes 2GB, etc...
 *  MXT is transparent to all hardware and software including the
 *  Linux kernel, drivers, apps, peripherals etc etc.
 *
 *  MXT is implemented in the ServerWorks Inc. Pinnacle memory controller
 *  chipset.  This module manages the MXT memory.
 *  For detailed documentation see 
 *  http://oss.software.ibm.com/developerworks/opensource/mxt
 * 
 *  Copyright (C) 1999,2000,2001 IBM <mxt@us.ibm.com>
 *
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  version 2, or later. See /usr/src/linux/COPYING for more detail.
 *  This program is distributed WITHOUT any warranty, merchantability 
 *  or fitness for a particular purpose. See /usr/src/linux/COPYING
 *  for licensing details
 *
 *  Developers are welcome to comment and contribute to this project.  
 *  Please send email to <mxt@us.ibm.com> or visit the project web site
 *  http://oss.software.ibm.com/developerworks/opensource/mxt
 *
 *  Authors: 
 *  Bulent Abali <abali@us.ibm.com>
 *  Hubertus Franke <frankeh@us.ibm.com>
 */

/*  
 *  generic compression routines are usually prefixed with cmp_
 *  MXT hardware specific routines are usually prefixed with mxt_
 *
 *  Contents:
 *  ---------
 *  Section 1.  Performance counter support
 *  Section 2.  /proc/sys/mxt support
 *  Section 3.  Compression hardware specific routines
 *  Section 4.  MXT fast page operations
 *  Section 5.  Interrupt handling
 *  Section 6.  Generic memory compression support routines
 *  Section 7.  CPU grabbers help reduce compression pressure
 *  Section 8.  Timer routine calls compression manager periodically
 *  Section 9.  Page clearing threads reduce compression pressure
 *  Section 10. Scheduling priority of big memory tasks reduced
 *  Section 11. Kernel pages backed in the compressed memory to cover 
 *              the worst case compressibility.
 *  Section 100. Module load/unload
 *  
 */ 

#include <linux/config.h>
#include <linux/version.h>
#include <linux/module.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/swap.h>
#include <asm/mxt.h>
#include <asm/system.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/threads.h>
#include <linux/interrupt.h>

/*
 * Small arithmetic helpers.  These are plain function-like macros, so an
 * argument may be evaluated more than once: never pass an expression with
 * side effects (e.g. i++).  The operands are compared as written, so mixing
 * signed and unsigned operands follows the usual C conversions.
 */
#define _ABS(x)   (((x)>0)?(x):-(x))
#define _MIN(x,y) (((x)>(y))?(y):(x))
#define _MAX(x,y) (((x)>(y))?(x):(y))

/* NOTE(review): presumably enables the hidden kernel-page support
   (Section 11); its users are not visible in this part of the file. */
#define CONFIG_MXT_HIDDEN_PAGES

/* Zone watermark sums.  NOTE(review): they are not assigned anywhere in
   the visible part of this file - confirm where they are filled in. */
static int pages_min;  /*  pages_min sum in all zones; page_alloc.c */
static int pages_low;
static int pages_high;

/* Module-private copies of the kernel totals; mxt_setup_vm() fills them
   from si_meminfo() instead of referencing the kernel variables directly. */
/* extern unsigned long totalram_pages; in linux/arch/i386/mm/init.c */ 
static unsigned long totalram_pages;
/* extern unsigned long totalhigh_pages; in linux/arch/i386/mm/init.c */ 
static unsigned long totalhigh_pages;
extern void si_meminfo(struct sysinfo *val);

/*  
 * Physical memory utilization thresholds, in the same unit that
 * katsina_memutil() returns (fraction of 1000).  The whole structure is
 * exported read/write through /proc/sys/mxt/thresholds.
 *
 * release < acquire < danger < intr < panic is necessary.
 * Resetting max periodically to zero will not hurt. 
 * It tells you the highest utilization you have reached since 
 * boot time or since max was last written with 0.
 */
static struct _thresholds {
	unsigned long max;       /* maximum we have ever reached */
	unsigned long release;   /* start releasing pages */
	unsigned long acquire;   /* recall pages left in system */
	unsigned long danger;    /* do not return any pages */
	unsigned long intr;       /* interrupt notification occurs;
				     also programmed into the SUTLR register */
	unsigned long panic;     /* panic() */
} mc_th;

/* Upper bound for locked pages; mxt_scan_bios() sets it to half of the
   physical page count. */
unsigned long max_lockedpages;
/* Reservations in units of pages, computed in mxt_setup_vm() and
   adjustable through /proc/sys/mxt/swap_rsrv and kernel_rsrv. */
long swap_reserve=0;
long kernel_reserve=0;
unsigned long mxt_printk=0; /* != 0 prints debug information */

/* Page clearing selector used by __alloc_clear (not in this part of the
   file): per the original note, 1 selects mxt_clear_page() and
   0 selects clear_highpage().  Exposed as /proc/sys/mxt/fclr. */
static int fclr = 0;  

/* number of set-aside pages needed  */
volatile long nr_rsrv_pages = 0;
volatile long nr_pages_per_thread = 0;

/* and currently held by eater threads  */
static atomic_t total_pages_held  __cacheline_aligned = { 0 }; 
DECLARE_WAIT_QUEUE_HEAD(cmp_eatmem_wait);

/* pages hidden to back kernel pages in physical memory:
   nr_hide_pages / nr_hidden_pages are reported as cmphide / cmphidden
   by get_cmpmem_info() */
atomic_t nr_hide_pages __cacheline_aligned = { 0 }; 
atomic_t nr_hidden_pages __cacheline_aligned = { 0 };
DECLARE_WAIT_QUEUE_HEAD(cmp_hide_pages_wait);

/* Forward declarations for routines defined later in this file. */

/* current physical memory utilization (fraction of 1000); also tracks
   the maximum seen in mc_th.max */
static inline unsigned long get_memutil(void);

/* cmp_idle threads */
static int  cmp_idle(void *);
static void cmp_grab_cpus(void);
static void cmp_release_cpus(void);
DECLARE_WAIT_QUEUE_HEAD(cmp_idle_wait);

/* main routine which determines whether to grab pages or not */
int memcompress_check(void);

/* reclaim and zero thread */
static void wake_up_cmp_eatmem(void);
static inline long mc_decayed_release(long pages);

/* statistics, exported read/write through /proc/sys/mxt/stats */
static struct _mxt_stats {
	/* NOTE(review): interrupt, state, bumped and held are updated
	   outside the visible part of this file - meanings to be confirmed */
	volatile unsigned long interrupt;
	volatile unsigned long state;
	volatile unsigned long bumped;
	volatile unsigned long held;
	volatile unsigned long esr;         /* ESR value read by the last interrupt */
	volatile unsigned long esr_sticky;  /* OR of every ESR value seen by the ISR */
} mxt_stats;

/* tuning knobs, exported read/write through /proc/sys/mxt/tuning.
   The initializer below is positional: keep it in field order. */
static struct _mxt_tuning {
	long period;       /* Poll period in jiffies; min=1 max=HZ;
			      Do not set it outside the range.
			      1 slowest polling, HZ fastest polling
			      NOTE(review): the slowest/fastest wording
			      depends on how the timer code uses this
			      value - confirm against Section 8 */
	long high_decay;   /* highest memory release rate (KB per period) */
	long low_decay;    /* lowest memory release rate (KB per period) */
	long delay;        /* wait in seconds before releasing pages */
	long bump_rate;    /* arm the bumper if physmem util is 
			      increasing faster than this per period */
	long spin_period;  /* spin period in jiffies */
} tuning = { 
	10,                  /* period */
	1000,                /* high_decay: e.g. 1000 KB per 100 ms */
	100,                 /* low_decay: e.g. 100 KB per 100 ms   */
	10,                  /* delay: 10 seconds */
	30000,               /* bump_rate: 30,000 KB per 100 ms (an older
				comment here said 20,000) */
	5                    /* spin_period: e.g. 5 jiffies or 50 milliseconds */
};

/* -------------------------------------------------------------- */

/*
 *  Real and Physical memory usage exposed by /proc/sys/mxt/cmpmemi
 *  See Documentation/mxt.txt for how to use.
 *  The structure is refreshed by get_cmpmem_info() each time the sysctl
 *  handler wrap1_proc_dointvec() runs on it.
 */
struct cmpmem_info_t cmpmem_info;

/* --------------------- Section 1 ------------------------------ */

/*
 *  Pinnacle chip performance counter support
 *  Performance counters exposed from /proc/sys/mxt/cmpperf
 *  See Documentation/mxt.txt for how to use
 */
struct cmpperf_info_t perf_counters;  /* soft copies of the hardware counters */
unsigned long perf_counters_request;  /* ICR mode; written via /proc/sys/mxt/icr,
					 0 means "stop counting" */
int perf_counters_state;              /* started (1) or stopped (0) */

/* previous soft register values, used to detect wrap-around */
static unsigned long prev_ira;
static unsigned long prev_irb;

/*
 *  Fold the hardware IRA/IRB counter registers into the soft registers
 *  kept in perf_counters.
 *
 *  Each hardware reading is added to the matching soft register.  When a
 *  soft register ends up smaller than its previous value it has wrapped,
 *  and the companion irax/irbx word is bumped; each pair thereby forms
 *  one 64 bit soft register.
 */
inline void update_from_perf_counters(void)
{
	unsigned long reading;

	reading = READ_CTRL(IRA);
	perf_counters.ira += reading;

	reading = READ_CTRL(IRB);
	perf_counters.irb += reading;

	/* propagate a carry into the upper words on wrap-around */
	if ( perf_counters.ira < prev_ira )
		perf_counters.irax += 1;
	prev_ira = perf_counters.ira;

	if ( perf_counters.irb < prev_irb )
		perf_counters.irbx += 1;
	prev_irb = perf_counters.irb;

	/*
	 * For some ICR modes IRB is really two 16-bit halves.  The split
	 * sums below may be meaningless in the other modes, but they are
	 * computed and shown in /proc/sys/mxt/cmpperf regardless.
	 */
	perf_counters.irbl += (reading & 0xffff);
	perf_counters.irbh += ((reading >> 16) & 0xffff);
}

/*
 *  Called periodically to copy the hardware counters into the soft
 *  registers, and to act on the user command entered through
 *  /proc/sys/mxt/icr:
 *
 *    request != 0, stopped : reset everything and start counting
 *    request != 0, started : accumulate the hardware counters
 *    request == 0, started : reset everything and stop counting
 *    request == 0, stopped : nothing to do
 */
inline void perf_counters_check(void)
{
	unsigned long n;
	int start_requested = ( perf_counters_request != 0 );
	int counting = ( perf_counters_state != 0 );

	if ( start_requested == counting ) {
		/* no state change; just keep accumulating while running */
		if ( counting )
			update_from_perf_counters();
		return;
	}

	/* state change in either direction: reset the counters first */
	WRITE_CTRL( ICR, 0 );
	/* Reading while ICR=0 should clear the reg */
	n = READ_CTRL( IRA );
	n = READ_CTRL( IRB );
	WRITE_CTRL( IRA, 0 );
	WRITE_CTRL( IRB, 0 );

	/* clear the soft registers */
	perf_counters.mode = start_requested ? perf_counters_request : 0;
	perf_counters.ira  = 0;
	perf_counters.irb  = 0;
	perf_counters.irbl = 0;
	perf_counters.irbh = 0;
	perf_counters.irax = 0;
	perf_counters.irbx = 0;
	prev_ira = 0;
	prev_irb = 0;

	if ( start_requested ) {
		/* put hardware in counting mode */
		WRITE_CTRL( ICR, perf_counters_request );
		perf_counters_state = 1;
	} else {
		/* switch to stopped state */
		perf_counters_state = 0;
	}
}

/* --------------------- Section 2 ------------------------------ */

/*
 *   /proc/sys/mxt support
 */
/* sysctl identifiers for the entries of mxt_table[] (/proc/sys/mxt/...) */
enum {
	MXT_ICR=1,           /* performance counter mode request */
	MXT_SWAP_RSRV=7,     /* amount of phys mem reserved in swap sp */
	MXT_KERN_RSRV=8,     /* amount of phys mem reserved in kern sp */
	MXT_DEBUG=9,         /* turn on/off repeating printk() */
	MXT_CMPPERF=10,      /* performance counter soft registers */
	MXT_CMPMEMI=11,      /* real/physical memory usage */
	MXT_STATS=12,        /* mxt_stats */
	MXT_THRESHOLDS=13,   /* mc_th */
	MXT_TUNING=14,       /* tuning */
	MXT_FCLR=15          /* fclr */
};

void get_cmpmem_info(struct cmpmem_info_t *si)
{
	si->memfree  = nr_free_pages();
	si->usedmem  = get_used_pages();
	si->physused = katsina_phys_used();
	si->util     = get_memutil();
	si->cmphold  = nr_rsrv_pages;
	si->cmpheld  = atomic_read(&total_pages_held);
	si->cmphide  = atomic_read(&nr_hide_pages);
	si->cmphidden  = atomic_read(&nr_hidden_pages);
}

/* 
 * sysctl handler for /proc/sys/mxt/cmpmemi.
 * The exported data has to be computed by a few function calls (see
 * get_cmpmem_info()), so refresh cmpmem_info before handing the request
 * over to proc_dointvec().
 */
static int wrap1_proc_dointvec(ctl_table *table, 
			       int write, struct file *filp, 
			       void *buffer, size_t *lenp)
{
	int is_cmpmem_entry;

	is_cmpmem_entry = ( table->data == &cmpmem_info ) &&
		( table->maxlen == sizeof(cmpmem_info) );

	if ( is_cmpmem_entry )
		get_cmpmem_info(&cmpmem_info);

	return proc_dointvec(table, write, filp, buffer, lenp);
}

/* 
 * sysctl handler for /proc/sys/mxt/thresholds.
 * A user write must also reach the hardware: after proc_dointvec() has
 * stored the new values in mc_th, mc_th.intr is converted to sectors and
 * written to the SUTLR register.  Reads are passed straight through.
 *
 * NOTE(review): an out-of-range mc_th.intr (> 1000) deliberately panics
 * the machine, even though the value comes from a user write.
 */
static int wrap2_proc_dointvec(ctl_table *table, 
			       int write, struct file *filp,
			       void *buffer, size_t *lenp)
{
	unsigned long sectors;
	int rc;

	rc = proc_dointvec(table, write, filp, buffer, lenp);

	if ( write ) {
		/* by now the in-memory copy of mc_th.intr
		   has been updated by the user */
		if ( mc_th.intr > 1000 ) {
			panic("<MC>: illegal value in /proc/sys/mxt/thresholds");
			return rc;
		}

		if ( rc || table->data != &mc_th ) {
			printk("<MC>: error: /proc/sys/mxt/thresholds\n");
		}
		else {
			/* push the new interrupt threshold to the 
			   hardware register */
			sectors = memutil_to_sectors( mc_th.intr );
			WRITE_CTRL(SUTLR, sectors);
			printk("<MC>: Set SUTLR=%08lx\n", sectors);
		}
	}

	return rc;
}

/* 
 * sysctl handler for /proc/sys/mxt/cmpperf.
 * On a read of the perf_counters entry, perf_counters_check() is run
 * first so that the soft registers are current and any pending icr
 * request is applied; then proc_dointvec() does the actual transfer.
 *
 * CAVEAT: /proc/sys/mxt/cmpperf must be read periodically so that the
 * hardware counters on the memory controller are copied to the 64-bit
 * soft registers.  Otherwise the 32-bit hardware counters may roll over
 * and the readings will be incorrect.
 * Likewise, after setting /proc/sys/mxt/icr this file must be read once
 * so that the value written to icr is copied to the actual hardware
 * register.
 */
static int wrap3_proc_dointvec(ctl_table *table, 
			       int write, struct file *filp,
			       void *buffer, size_t *lenp)
{
	int reading_counters;

	reading_counters = !write &&
		( table->data == &perf_counters ) &&
		( table->maxlen == sizeof(perf_counters) );

	if ( reading_counters )
		perf_counters_check();

	return proc_dointvec(table, write, filp, buffer, lenp);
}


/* handle returned by register_sysctl_table(); needed to unregister */
static struct ctl_table_header *mxt_table_header;

/*
 * Entries of /proc/sys/mxt.  Each initializer is positional:
 * id, name, data pointer, data size, file mode, child table, handler.
 * Entries whose data must be computed or pushed to hardware use one of
 * the wrapN_proc_dointvec() handlers above; the rest use proc_dointvec.
 */
static ctl_table mxt_table[] = {
	{ MXT_ICR, "icr",
	  &perf_counters_request, sizeof(perf_counters_request), 
	  0644, NULL, &proc_dointvec },
	{ MXT_THRESHOLDS, "thresholds",
	  &mc_th, sizeof(mc_th), 
	  0644, NULL, &wrap2_proc_dointvec },
	{ MXT_SWAP_RSRV, "swap_rsrv",
	  &swap_reserve, sizeof(swap_reserve), 
	  0644, NULL, &proc_dointvec },
	{ MXT_KERN_RSRV, "kernel_rsrv",
	  &kernel_reserve, sizeof(kernel_reserve), 
	  0644, NULL, &proc_dointvec },
	{ MXT_DEBUG, "debug",
	  &mxt_printk, sizeof(mxt_printk), 
	  0644, NULL, &proc_dointvec },
	{ MXT_CMPPERF, "cmpperf", 
	  &perf_counters, sizeof(perf_counters), 
	  0444,NULL, &wrap3_proc_dointvec },
	{ MXT_CMPMEMI, "cmpmemi", 
	  &cmpmem_info, sizeof(cmpmem_info), 
	  0444,NULL, &wrap1_proc_dointvec },
	{ MXT_STATS, "stats", 
	  &mxt_stats, sizeof(mxt_stats), 
	  0644,NULL, &proc_dointvec },
	{ MXT_TUNING, "tuning", 
	  &tuning, sizeof(tuning), 
	  0644,NULL, &proc_dointvec },
	{ MXT_FCLR, "fclr", 
	  &fclr, sizeof(fclr), 
	  0644,NULL, &proc_dointvec },
	{0}
};

/* The "mxt" directory itself (id 254), with mxt_table as its children. */
static ctl_table mxt_root_table[] = {
	{254, "mxt", NULL, 0, 0555, mxt_table},
	{0}
};

void mxt_register_sysctl(void)
{
        mxt_table_header = register_sysctl_table(mxt_root_table, 0);
}

/*
 * Remove the /proc/sys/mxt directory registered by mxt_register_sysctl().
 *
 * Safe to call when registration failed or never happened (the handle is
 * NULL in that case) and safe to call more than once: the handle is
 * cleared after it has been handed back to the sysctl core.
 */
void mxt_unregister_sysctl(void)
{
	if ( mxt_table_header == NULL )
		return;

	unregister_sysctl_table(mxt_table_header);
	mxt_table_header = NULL;
}

/* --------------------- Section 3 ------------------------------ */

/* 
 *   Compression hardware specific stuff
 */

#include <linux/mm.h>
#include <linux/version.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/tqueue.h>
#include <asm/page.h>
#include <asm/fixmap.h>
#include <asm/mxt.h>
#include <asm/uaccess.h>
// #include <asm/spinlock.h> 
#include <asm/processor.h>
#include <asm/io.h>
#include <linux/pci.h>
#include <linux/init.h>

/* Detection results; the first three are set by mxt_scan_bios(), the
   last two by mxt_scan_bios()/mxt_setup_interrupts(). */
int mxt_bios_found;           /* MXT bios table present */
int mxt_memory_expanded;      /* bios reported at least one real region */
int mxt_compression_enabled;  /* compression turned on by the bios */
int mxt_device_found;         /* memory controller found on the PCI bus */
int mxt_irq;                  /* interrupt number used for the controller */

/*
 * If the compression ratio becomes 1:1 we may need to swap pages out.
 * Thus, swap_reserve is the amount reserved in the
 * swap space for compression purposes. Swap_reserve must be equal to the 
 * committed but missing amount of memory, which is the real minus physical
 * amount of memory.  For example,
 * we told the kernel that it has 1 GB of memory but in fact it has only 512MB 
 * in the box. Since real=2*physical, the swap_reserve amount must be equal
 * to the physical amount of memory (that is, the amount of installed memory
 * in the box).  This value may be overridden from /proc/sys/mxt.
 * 
 * Your total vm size is roughly ram size + swap size - swap_reserve.
 * One side effect of swap_reserve is that if you turn off swap or do not 
 * have a large enough swap, processes cannot allocate more memory than
 * ram size - swap_reserve.  
 *
 * When swap_reserve is 0, and if the process pages become incompressible
 * and there is no swap space left, then the process will be killed.
 */

/*  num_physpages is a Linux kernel variable.
 *  It is equal to the amount of "real memory" in MXT terminology
 *  e.g. If you have 512MB worth of DIMMs then real memory will be 1GB.
 *  cmp_num_physpages is the hardware "physical memory"
 *  available in the system.  Real is roughly two times physical due to
 *  compression.
 *  cmp_num_physpages is in units of pages and is computed by
 *  mxt_scan_bios() (bios physical size minus the inhibited regions).
 */
unsigned long cmp_num_physpages=0;

static unsigned long L3_size=0;    /* L3 cache size from the bios table */
static spinlock_t fast_page_lock;  /* serializes fast page operations;
				      NOTE(review): no initializer is visible
				      here - confirm it is initialized at
				      module load */
void * katsina_vaddr;              /* ioremap'ed controller registers */
extern long mxt_bios_table[16];    /* copy of the MXT bios data from EBDA */

/* byte offsets of various fields in the mxt_bios_table
   copied from EBDA; sizes are in units of 4Kbytes.
   _L3_size_ and _mxt_enabled_ intentionally share offset 0xc: the same
   word carries the enable flag (bit 31) and the L3 size (low 21 bits),
   see mxt_scan_bios(). */
enum {         
        _magic_=0,
        _version_=0x4, 
        _nr_real_,          /* 0x5: number of real regions */
        _nr_inhibit_,       /* 0x6: number of compression inhibited regions */
        _nr_interrupt_,     /* 0x7: PIC mode interrupt number */
        _mmio_base_,        /* 0x8 */
        _L3_size_=0xc,
        _mxt_enabled_=0xc,
        _physical_size_=0x10,
        _real_start_=0x14   /* start of the address region array */
};

/* One address region entry of the bios table. */
struct _mxtbios_addr {
        unsigned long address;
        unsigned long len;
};

/* Pointer to the nr-th region entry of the array starting at base
   (base is a byte address; entries are packed back to back). */
#define address_region(nr,base) \
((struct _mxtbios_addr *)((nr)*sizeof(struct _mxtbios_addr)+(char *)(base)))

/*
 *  arch/i386/kernel/setup.c::setup_mxt_memory() function had
 *  copied the MXT bios data from EBDA to mxt_bios_table.  
 *  Now we scan this table to determine hardware characteristics.
 */
static int mxt_scan_bios(void)
{
	unsigned long *bp = mxt_bios_table;
	unsigned char *p;
	int nr_real;
	int nr_inhibit;
	int i;

	printk("mxt_scan_bios: enter\n");

	if( ! mxt_bios_table[0] )
		return -ENODEV;

	mxt_bios_found = 1;

	p = (unsigned char *)bp;

	/* PIC mode interrupt nr */
	mxt_irq = (unsigned int) p[_nr_interrupt_];

	/* real and inhibit regions of the memory */
	nr_real = p[_nr_real_];
	nr_inhibit = p[_nr_inhibit_];
	
	if ( *((unsigned long *)(p+_L3_size_)) & 0x80000000 ) {
		mxt_compression_enabled = 1;
		printk("<MC> Compression is enabled\n");
	}
	else {
		mxt_compression_enabled = 0;
		printk("<MC> Compression is disabled\n");
	}

	if (nr_real) {
		/* E820 reported physical mem size (1X) but
		 * additional memory was added to address
		 * space (in linux/arch/i386/kernel/setup.c)
		 */
		mxt_memory_expanded = 1;
	}
	else {
		/* No real mem regions were reported */
		mxt_memory_expanded = 0;
	}
	
	cmp_num_physpages = *((unsigned long *) (p+_physical_size_));
	/* take out the compression inhibited region(s), 
	   typically the lower 1MB 
	*/
	for(i=0; i<nr_inhibit; i++) 
		cmp_num_physpages -= 
			address_region(0+nr_real, p+_real_start_)->len;
	printk("<MC> cmp_num_physpages: 0x%08lx\n", cmp_num_physpages);

	/* This is for the testing guy who want to locks his 
	   pages mm/mlock.c */
	if ( cmp_num_physpages )
		max_lockedpages = cmp_num_physpages/2; 

	/* Parse the L3 cache size */
	L3_size = *((unsigned long *)(p+_L3_size_)) & 0x1fffff;
	printk("<MC> L3 pages: 0x%08lx\n", L3_size);

	return 0;
}

/*
 * - make the memory controller registers addressable.
 * - calculate swap_reserve (and optionally kernel_reserve) amount.
 * - calculate compression management thresholds (mc_th).
 *
 * Panics when the controller registers cannot be mapped.
 * Always returns 0.
 */
static int mxt_setup_vm(void)
{
	struct sysinfo val;

	/* map the memory mapped registers of the chip
	 * to kernel virtual space 
	 */
	katsina_vaddr = ioremap_nocache( KATSINA_PHYS, KATSINA_LENGTH );
	if( ! katsina_vaddr )
		panic("<MC> couldn't virtual map MXT chip!");
	
	printk("<MC>: MXT chip virtual address: 0x%08lx\n", 
	       (unsigned long) katsina_vaddr);

	si_meminfo(&val);
	totalram_pages = val.totalram;
	totalhigh_pages = val.totalhigh;

	/* Reserve swap space for MXT purposes.
	 * swap_reserve and kernel_reserve will be in units of pages
	 * (note: this was in units of kilobytes in 2.2 kernel support 
	 * patch.) Due to the uncompressed 
	 * 1MB region I am reserving more than necessary, but 1MB of disk
	 * space is small enough that I will keep it like this.
	 */
	if ( mxt_compression_enabled && 
	     totalram_pages > cmp_num_physpages ) {
#ifdef SWAP_RESERVATION
		swap_reserve = totalram_pages - cmp_num_physpages;
#else
		swap_reserve = 0;
#endif
		if ( swap_reserve < 0 ) 
			swap_reserve = 0;

#ifdef MXT_KERNEL_RESERVATION
		/* Undef'ing this section makes kernel_reserve=0 the default.
		   In the Beta 2 kernel, changes in the alloc_page() API
		   busted the add_hidden_pages() call which I fixed.
		   To double the safety, I force kernel_reserve=0 here.
		*/
		{
			/* free low memory, as a signed quantity so that the
			   comparisons below are done in signed arithmetic */
			long free_low = (long)(val.freeram - val.freehigh);

			/* this is the overcommitted memory, 
			   i.e. missing amount */
			kernel_reserve = totalram_pages - cmp_num_physpages; 

			/* relax the restrictions to account for the memory
			   already allocated or reserved by kernel at 
			   boot time 
			*/
			kernel_reserve -= (val.totalram - val.totalhigh) - 
				(val.freeram - val.freehigh);

			/* A negative reserve must end up as 0.  Comparing
			   it against an unsigned value (as _MIN would)
			   turns it into a huge positive number instead. */
			if ( kernel_reserve > free_low )
				kernel_reserve = free_low;
			if ( kernel_reserve < 0 ) 
				kernel_reserve = 0;
		}
#endif

		printk("<MC>: %ld pages reserved in swap space\n", 
		       swap_reserve);
		printk("<MC>: %ld pages may be hidden for kernel pages\n",
		       kernel_reserve);
		printk("<MC>: %lu compressed physical pages\n", 
		       cmp_num_physpages);
		printk("<MC>: %lu totalram_pages\n", 
		       totalram_pages);
		printk("<MC>: %lu totalhigh_pages\n", 
		       totalhigh_pages);
        }

	/* Setup physical memory pressure thresholds.
	 * This is mostly guess work but conservative.
         * We know that we do not want physical to exceed 
	 * (1000-L3_fraction)/1000 utilization.  
	 * We add some fat to cover worst possible expansion.
	 *
	 * The thresholds are fractions of cmp_num_physpages, so they can
	 * only be computed when that count is non-zero.
         */
	if ( mxt_compression_enabled && cmp_num_physpages ) {
		unsigned long L3_fraction;
		unsigned long th;

		/* +1 for truncating to the highest integer */
		L3_fraction = ( (1000 * L3_size)/cmp_num_physpages )+1;

		/* th is about 4MB physical; +1 is needed
		 * for very large memory so that th!=0 */
		th = ((1000 * 1024) / cmp_num_physpages) + 1;

		/* dont use less than 1 percent */
		if (th < 10) th = 10;

		mc_th.intr    = 1000 - L3_fraction - th;
		mc_th.danger  = mc_th.intr - 2*th;
		mc_th.acquire = mc_th.danger - th;
		// mc_th.release = mc_th.acquire - th;
		// this is a temp workaround for kswapd burning too much cpu
		mc_th.release = mc_th.acquire - 1;
		mc_th.panic   = MC_TH_PANIC;
	} 
	else {
		if ( mxt_compression_enabled )
			printk("<MC>: error: no compressed physical pages "
			       "reported; thresholds disabled\n");

		mc_th.panic = mc_th.intr = mc_th.danger = 
			mc_th.acquire = mc_th.release = 1000;
	}
	return 0;
}

/* Undo mxt_setup_vm(): drop the mapping of the controller registers. */
static void mxt_cleanup_vm(void)
{
	void *regs = katsina_vaddr;

	iounmap(regs);
}

/*
 *  Physical memory utilization as a fraction of 1000, computed from the
 *  SUR (sectors used) hardware register.
 *
 *  Returns 1000 when compression is disabled, and never returns 0, so
 *  that callers can safely divide by the result.
 *
 *  NOTE(review): cmp_num_physpages/8 is used as a divisor; a physical
 *  page count below 8 would divide by zero.
 */
unsigned long katsina_memutil(void)
{
	unsigned long used_sectors;
	unsigned long used_pages;
	unsigned long permille;

	/* dummy value so that people's divisions don't fail */
	if ( ! mxt_compression_enabled )
		return 1000; 

	used_sectors = READ_CTRL(SUR);
	used_pages   = used_sectors / (PAGE_SIZE/SECTOR_SIZE);

	/* The MXT box has 64GB max real address space, i.e. at most
	   16 million pages, which makes 32-bit arithmetic tricky.
	   Scaling both page counts down by 8 keeps the product
	   1000 * pages below 2**32.
	*/
	permille = ( 1000 * (used_pages/8) ) / (cmp_num_physpages/8);

	return permille ? permille : 1;
}

/*
 *   Convert a utilization (between 0 and 1000) into the equivalent
 *   number of sectors of physical memory.  The page count is divided by
 *   1000 before the multiplication, so the result is rounded down.
 */
unsigned long memutil_to_sectors(unsigned long util) 
{
	unsigned long pages_per_unit = cmp_num_physpages/1000;
	unsigned long pages = util * pages_per_unit;

	return pages * (PAGE_SIZE/SECTOR_SIZE);
}

/*
 *   Convert a utilization (fraction of 1000) into the equivalent number
 *   of physical pages; rounded down like memutil_to_sectors().
 */
static unsigned long memutil_to_pages(unsigned long util) 
{
	unsigned long pages_per_unit = cmp_num_physpages/1000;

	return util * pages_per_unit;
}

/*
 *  Physical memory in use, in units of pages, derived from the SUR
 *  (sectors used) hardware register.  When compression is disabled the
 *  total physical page count is returned as a stand-in so that callers'
 *  divisions don't fail.
 */
unsigned long katsina_phys_used(void)
{
	long used_sectors;
	long used_pages;

	if ( mxt_compression_enabled ) {
		used_sectors = READ_CTRL(SUR);
		used_pages   = used_sectors / (PAGE_SIZE/SECTOR_SIZE);
		return (unsigned long)used_pages;
	}

	return cmp_num_physpages; 
}

/* --------------------- Section 4 ------------------------------ */

/*
 * Fast page operation is a method for manipulating pages using the
 * Katsina chip "Fast Page Operation".  These merely update pointers 
 * in the chip instead of doing 4KB bulk transfers.  
 * These clear a page about 10 times faster than processor can.
 * See CNB 3.0 spec RMPR, RMPCR register pairs.
 *
 * FIXME: if compression is disabled or this is not an MXT box 
 * this function will silently return.  It will not do what it is
 * expected to do.  For this reason, we will not export it for 
 * others to use this nifty function.  We should put an emulator
 * in so that it performs its functions regardless if 
 * this is MXT hardware or not.
 */

/* there is a coherency bug for zero page op in pass-3 chips 
 * we will use "move page and clear source" operation instead.
 * Defining this selects the cmpmem_zero_page() variant that goes
 * through fast_page_op() with a scratch destination page. */
#define FAST_ZERO_PAGE_BUG

/*
 * Issue one Fast Page Operation to the memory controller.
 *
 *   cmd        - one of the RMPR_CMD_* commands
 *   src_page   - page number placed in the RMPR request
 *   dst_page   - page number written to RMPCR; only used by the
 *                two-operand commands (move/swap)
 *   class_code - masked with RMPR_CLASSCODE_BITMASK before use
 *   wait       - non-zero: poll RMPR until its command bits read 0
 *
 * Silently does nothing when compression is disabled.  The register
 * writes are done with fast_page_lock held and interrupts disabled.
 * An unknown cmd only prints an error; no request is written.
 */
inline 
void fast_page_op( unsigned long cmd,
		   unsigned long src_page, /* page number in RMPR reg */
		   unsigned long dst_page, /* page number in RMPCR reg */
		   unsigned long class_code,
		   int wait)
{
	unsigned long flags;
	unsigned int tmp;   /* scratch output for cpuid() below */

	if (! mxt_compression_enabled ) 
		return;

	spin_lock_irqsave(&fast_page_lock,flags);

	class_code &=  RMPR_CLASSCODE_BITMASK;
	
	switch (cmd) {
	/* single operand commands: only the RMPR request is written */
	case RMPR_CMD_NOOP:
	case RMPR_CMD_ZEROPAGE:
	case RMPR_CMD_FLUSHINVALIDATE: 
	case RMPR_CMD_TRANSFERCC_CTT:
	case RMPR_CMD_MOVECTT2LAST:
	case RMPR_CMD_INVALIDATE_XFERCC:
		WRITE_CTRL( RMPR, RMPR_BUILD_REQUEST(cmd, class_code, 
						     src_page) );
		break;
	/* two operand commands: the destination goes to RMPCR first,
	   then the request is written to RMPR */
	case RMPR_CMD_MOVEZEROSRC_CC:
	case RMPR_CMD_MOVEZEROSRC:
	case RMPR_CMD_SWAPPAGE_CC:
	case RMPR_CMD_SWAPPAGE:
		WRITE_CTRL( RMPCR, RMPCR_LOCK_BIT | dst_page); 
		WRITE_CTRL( RMPR, RMPR_BUILD_REQUEST(cmd, class_code, 
						     src_page) );
		break;
	default:
		printk("<MC>: error: fast page op undefined\n");
	}
	
	if (wait) {
		/* note: this tmp shadows the outer one on purpose-built
		   volatile storage for the polling loop */
		volatile unsigned long tmp;
		do {
			tmp = READ_CTRL( RMPR );
		} while ( (tmp & RMPR_CMD_MASK) );
	}

	/* Serialize instruction execution */
	cpuid(0,&tmp,&tmp,&tmp,&tmp);

	spin_unlock_irqrestore(&fast_page_lock, flags); 
}	

#ifdef FAST_ZERO_PAGE_BUG
/* don't touch this memory at all; helps fix the hardware bug.
   It is the destination page of the "move and zero source" operation
   used by cmpmem_zero_page() below. */
static char scratch[PAGE_SIZE] __attribute__ ((aligned(PAGE_SIZE)));
#endif

#ifndef FAST_ZERO_PAGE_BUG
/*
 * Zero page operations is a method for clearing a page using a Katsina
 * chip "Fast Page Operation".  Clearing a page is merely updating a 
 * pointer instead of a 4KB bulk transfer.  It also invalidates the 
 * corresponding lines in L3 cache. See CNB 3.0 spec, RMPR register.
 *
 * page_no is the page frame number to clear; wait != 0 polls RMPR until
 * the command bits read 0.  Unlike fast_page_op() this variant does not
 * check mxt_compression_enabled.
 */
inline 
static void cmpmem_zero_page(unsigned long page_no, int wait)
{
	unsigned long flags;
	unsigned int tmp;   /* scratch output for cpuid() below */

	spin_lock_irqsave(&fast_page_lock,flags);
	
	/* zero a single physical page with page frame number <page_no>*/
	WRITE_CTRL( RMPR, RMPR_BUILD_REQUEST(RMPR_CMD_ZEROPAGE,
					     0, page_no) );

	if (wait) {
		volatile unsigned long tmp;
		do {
			tmp = READ_CTRL( RMPR );
		} while ( (tmp & RMPR_CMD_MASK) );
	}

	/* Serialize instruction execution */
	cpuid(0,&tmp,&tmp,&tmp,&tmp);

	spin_unlock_irqrestore(&fast_page_lock, flags); 
}	
#else
/*
 * Workaround variant: clear page frame <page_no> with a "move page and
 * clear source" operation whose destination is the scratch page.
 * The wait argument is ignored here; the operation always waits.
 */
inline
static void cmpmem_zero_page(unsigned long page_no, int wait)
{
	unsigned long va = PAGE_ALIGN((unsigned long)scratch);
	fast_page_op( RMPR_CMD_MOVEZEROSRC_CC, page_no, 
		      __pa(va)>>PAGE_SHIFT, 0, 1);
}
#endif  /* FAST_ZERO_PAGE_BUG */


#include <asm/page.h>
#include <linux/mm.h>
#include <linux/highmem.h>

/*
 * Clear the page at kernel virtual address vaddr and return vaddr.
 * The fast page operation is used when compression is enabled and the
 * page lies at or above MIN_ADDR_FASTCLEAR (the first 1 Meg does not
 * compress on Katsina); otherwise the page is cleared with memset().
 */
void *mxt_clear_virtpage(unsigned long vaddr)
{
	unsigned long phys = __pa(vaddr);

	if ( mxt_compression_enabled && phys >= MIN_ADDR_FASTCLEAR )
		cmpmem_zero_page(phys>>PAGE_SHIFT, 1);
	else
		memset((void*)vaddr, 0, PAGE_SIZE);

	return (void *)vaddr;
}

/*
 * Clear the page described by *page.
 * BUG()s on an invalid page.  The fast page operation is used when
 * compression is enabled and the page frame number is above
 * MIN_ADDR_FASTCLEAR >> PAGE_SHIFT; all other pages are cleared by the
 * processor (clear_page() for low memory, clear_highpage() otherwise).
 */
void mxt_clear_page(struct page *page)
{
	unsigned long pfn;

	if ( !VALID_PAGE(page) )
		BUG();

	pfn = (unsigned long)(page - mem_map);

	if ( !mxt_compression_enabled || 
	     pfn <= (MIN_ADDR_FASTCLEAR >> PAGE_SHIFT) ) {
		/* slow path: let the processor do the clearing */
		if ( PageHighMem(page) )
			clear_highpage(page);
		else
			clear_page( page_address(page) );
		return;
	}

	cmpmem_zero_page(pfn, 1);
}

/*
 * High memory flavour of mxt_clear_page(); mxt_clear_page() already
 * copes with both low and high memory pages, so simply forward to it.
 */
void mxt_clear_highpage(struct page *page)
{
	struct page *target = page;

	mxt_clear_page(target);
}

/* arch/i386/kernel/mxt.c */
/* hook through which the rest of the kernel reaches the fast clear */
extern void (*mxt_fast_clear)(struct page *);

/*
 * Make the function available for general use: point the
 * mxt_fast_clear hook at mxt_clear_page().  The update is done under
 * fast_page_lock with interrupts disabled.
 */
static void mxt_hook_clear_page(void)
{
	unsigned long irq_state;

	spin_lock_irqsave(&fast_page_lock, irq_state);
	mxt_fast_clear = mxt_clear_page;
	spin_unlock_irqrestore(&fast_page_lock, irq_state);
}

/*
 * Counterpart of mxt_hook_clear_page(): reset the mxt_fast_clear hook
 * to NULL under fast_page_lock.
 */
static void mxt_unhook_clear_page(void)
{
	unsigned long irq_state;

	MCPRINTK("<MC> mxt_unhook_clear_page: enter\n");

	spin_lock_irqsave(&fast_page_lock, irq_state);
	mxt_fast_clear = NULL;
	spin_unlock_irqrestore(&fast_page_lock, irq_state);
}

/* --------------------- Section 5 ------------------------------ */

/*
 * Interrupt support: if SUR register exceeds SUTLR register controller 
 * will fire an interrupt which is an indication of physical memory 
 * pressure.  
 * We're already tracking this by polling; but interrupts will catch the 
 * pressure in between two polls if pressure is increasing fast. 
 */

/* immediate task queued by the ISR when the SUTLR threshold fires */
static struct tq_struct sutlr_task;

/*
 * Bottom half of the SUTLR interrupt.  Panics when the utilization is
 * already above mc_th.panic; otherwise grabs the cpus and runs the
 * compression manager.  The argument is unused.
 */
static void execute_sutlr_task(void * n)
{
	unsigned long util = katsina_memutil();

	if ( util > mc_th.panic )
		panic("<MC> cannot reduce physical memory pressure!\n");

	cmp_grab_cpus();
	memcompress_check();
}

/*
 * Interrupt service routine of the memory controller.
 * Records the ESR (status) register in mxt_stats.  On a SUTLR event
 * (physical memory pressure) the SUTLR interrupt is disabled and
 * cleared, and the real work is deferred to execute_sutlr_task() via
 * the immediate task queue.  Any other event is simply cleared.
 */
static void cmpintr_isr(int irq, void *dev_instance, struct pt_regs *regs)
{
	volatile unsigned long status;

	MCPRINTK("cmpintr_isr: enter\n");

	status = READ_CTRL(ESR);

	mxt_stats.esr = status;            /* most recent status */
	mxt_stats.esr_sticky |= status;    /* every bit ever seen */

	if ( status & ESR_SUTLR_MASK ) {  
		/* there is physical memory pressure */
		mxt_disable_sutlr_interrupt();
		mxt_clear_sutlr_interrupt();
		/* run execute_sutlr_task() from the immediate bottom half */
		queue_task(&sutlr_task, &tq_immediate);
		mark_bh(IMMEDIATE_BH);
	} 
	else {
		mxt_clear_all_interrupts();
	}
}

/* PCI device of the memory controller; also the dev_id of our irq */
static struct pci_dev *pdev;

/*
 * Locate the memory controller on the PCI bus, program its interrupt
 * mask/threshold registers and connect cmpintr_isr() to its interrupt.
 *
 * Returns 0 when the interrupt is connected and armed, 1 when interrupts
 * cannot be used (no MXT bios, compression disabled, no PCI, device not
 * found, or request_irq() failure); the caller then relies on polling.
 */
static int  mxt_setup_interrupts(void)
{
	unsigned char pci_bus, pci_device_fn, pci_intr_pin, pci_intr_line;
	volatile unsigned long status;

	if ( ! mxt_bios_found ) {
		printk("<MC>: No MXT bios found\n");
		return 1;
	}

	if ( ! mxt_compression_enabled ) {
		printk("<MC>: Compression disabled\n");
		return 1;
	}

	if ( ! pcibios_present() ) {
		printk("<MC>: No pci bios, no interrupts! Polling only\n");
		return 1;
	}

	if(pcibios_find_device( IBM_PCI_VENDORID_COMPMEM,
				IBM_PCI_DEVICEID_COMPMEM,
				0,
				&pci_bus,
				&pci_device_fn) ) {
		printk("<MC>: no PCI; no interrupts!\n");
		return 1;
	}

	mxt_device_found = 1;

	pdev = pci_find_slot(pci_bus, pci_device_fn);
	if ( ! pdev ) {
		/* pci_find_slot() found no device structure for the slot;
		   pdev is dereferenced below, so give up on interrupts */
		printk("<MC>: no PCI device structure; no interrupts!\n");
		return 1;
	}

	pcibios_read_config_byte(pci_bus, pci_device_fn ,
				 PCI_INTERRUPT_PIN, &pci_intr_pin);
	pcibios_read_config_byte(pci_bus, pci_device_fn, 
				 PCI_INTERRUPT_LINE, &pci_intr_line);

	printk("<MC>: bus=%x, devfn=%x, vid=%04x, did=%04x\n"
	       "      class=%x, irq=%d, pin=%x, line=%x\n",
	       pci_bus,
	       pci_device_fn,
	       pdev->vendor,
	       pdev->device,
	       pdev->class,
	       pdev->irq,
	       pci_intr_pin,
	       pci_intr_line);

	/* 
	 * Initialize for interrupts 
	 */
	// WRITE_CTRL(ESMR0, 0); /* disable interrupts connected to NMI */

	/* physical memory fatal UE; Brett says it works. Other 
	   ServerWorks bits occasionally will light up */
	WRITE_CTRL(ESMR0, 1<<5); 

	/* Driver is supposed to handle SUTLR interrupts
	 * this will turn off SUTLR interrupts from reaching NMI.
	 * NOTE(review): the ESMR1_SUTLR_INT bit is applied to ESMR0 here;
	 * presumably both mask registers share the bit layout - confirm. */
	WRITE_CTRL(ESMR0, (READ_CTRL(ESMR0) & 
			   ~((unsigned long)ESMR1_SUTLR_INT)));
	WRITE_CTRL(ESMR1, 0); /* disable interrupts connected to INTx */
	mxt_clear_all_interrupts(); /* clear existing ones */
	WRITE_CTRL(SUTLR, 0xffffffff);  /* set SUTLR to a high value */
	WRITE_CTRL(SUTHR, 0xffffffff);  /* set SUTHR to a high value */

	/* 
	 * Initialize the immediate tasks that ISR will dispatch 
	 */
	/* sutlr_task.next = NULL; */
	sutlr_task.sync = 0;
	sutlr_task.routine = execute_sutlr_task;  
	sutlr_task.data = 0;

	/*
	 * Try connecting to the interrupt
	 */

	if ( pdev->irq ) {
		/* Note: in uniprocessor build of Linux
		 * pdev->irq comes back as zero. No clue why. 
		 * In SMP build with APICs on pdev->irq comes back as a 
		 * valid number. Must have to do with PIC vs APIC modes.
		 * Thus, in uni build I pick the IRQ from $MXT table in
		 * EBDA.  In smp build I use pdev->irq */
		mxt_irq = pdev->irq;
	}
		
	if (request_irq(mxt_irq, &cmpintr_isr, 
			0, "MXT Memory Controller", pdev)) {
		printk("<MC>: request_irq() failed\n");
		return 1;
	}

	printk("<MC>: MXT Memory Controller interrupt connected IRQ=%d\n", 
	       mxt_irq);

	/*
	 * set the interrupt threshold which when exceeded
	 * memory controller will send an interrupt
	 */
	status = memutil_to_sectors(mc_th.intr);
	WRITE_CTRL(SUTLR, status);
	printk("<MC>: Set SUTLR=%08lx\n", status);

	mxt_enable_sutlr_interrupt();
	status = READ_CTRL(ESMR1);
	printk("<MC>: Set ESMR1=%08lx\n", status);

	return 0;
}                         

/*
 * Counterpart of mxt_setup_interrupts(): quiesce the SUTLR interrupt
 * (disable, then clear anything pending) before releasing the irq that
 * was requested with pdev as its dev_id.  Always returns 0.
 */
static int mxt_disconnect_interrupts(void)
{
	/* stop the source first so nothing fires while we let go */
	mxt_disable_sutlr_interrupt();
	mxt_clear_sutlr_interrupt();

	free_irq(mxt_irq, pdev);

	return 0;
}

/* --------------------- Section 6 ------------------------------ */

/*
 *  Generic Memory Compression Support
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/version.h>
#include <linux/highmem.h>
#include <linux/cache.h>

#include <asm/pgtable.h>
#include <asm/mxt.h>
// #include <asm/spinlock.h>

/* cmp_idle thread related */
#include <linux/delay.h>
#include <linux/sched.h>

/* cmptimer related */
#include <linux/tqueue.h>

/* cmp_idle thread related */
#define CMP_NMAX_TASKS  32	/* capacity of the per-thread task tables below */
#define CMP_IDLE_RUN    1	/* cmp_idle threads should spin */
#define CMP_IDLE_SLEEP  0	/* cmp_idle threads should sleep */

/* task_struct of each cmp_idle thread, indexed by its thread id */
static struct task_struct *cmp_idle_task[CMP_NMAX_TASKS];
/* run/sleep command polled by the cmp_idle threads */
static volatile int cmp_idle_run=CMP_IDLE_SLEEP;

/* static struct task_struct *cmp_zerod_task[CMP_NMAX_TASKS]; */

/* task_struct of each cmp_eatmem thread, indexed by its thread id */
static struct task_struct *cmp_eatmem_task[CMP_NMAX_TASKS];

/* cmptimer related */
static void start_periodic_task(void);
static void stop_periodic_task(void);

/* for stalling processes */
static void zap_bad_processes(void);

/* taken around updates of the reservation targets and CPU grab/release */
static spinlock_t mc_lock __cacheline_aligned;

/* 
 * We basically check for 5 different ranges of memory utilization 
 * between 0 and 100% of memory utilization. 
 * Consider the 100 to be some max number ..
 *
 *    [  1   |   2   |   3   |   4   |  5  ]
 *    0   th_rel  th_acq   th_dan  th_int 100
 * 
 * we always try to keep the system operating in 1-3
 * in phase 1 there is nothing to do...
 * in phase 2 we are willing to release a few pages if necessary/possible
 * in phase 3 we are trying to acquire pages if possible
 * in phase 4 we are bringing the utilization back to 3 threshold
 * in phase 5 we are in high danger mode, we should basically block CPUs
 */

extern unsigned long totalreserved;

/*
 * Sample the current memory utilization via katsina_memutil()
 * (normalized to 1000) and keep the high-water mark in mc_th.max.
 * A new high-water mark above mc_th.intr is logged; one above
 * mc_th.panic panics the kernel.  Returns the sampled utilization.
 */
static inline unsigned long get_memutil(void)
{
	unsigned long util = katsina_memutil();

	/* nothing to record unless this is a new maximum */
	if (util <= mc_th.max)
		return util;

	mc_th.max = util;
	if (mc_th.max > mc_th.intr)
		MCPRINTK("mc_th.max = %ld\n", mc_th.max);
	if (mc_th.max > mc_th.panic )
		panic("<MC> memory pressure!\n");

	return util;
}

/*
 * Number of pages currently in use: totalram_pages minus the
 * current free page count.
 */
unsigned long get_used_pages(void)
{
	unsigned long in_use = totalram_pages;

	in_use -= nr_free_pages();
	return in_use;
}

/*
 * Classify a memory utilization value into one of five states by
 * comparing it against the ascending thresholds in mc_th:
 *   1: below release, 2: below acquire, 3: below danger,
 *   4: below intr,    5: at or above intr.
 */
static inline int memutil_2_state(unsigned long memutil)
{
	if (memutil < mc_th.release)
		return 1;
	if (memutil < mc_th.acquire)
		return 2;
	if (memutil < mc_th.danger)
		return 3;
	if (memutil < mc_th.intr)
		return 4;
	return 5;
}

/* 
 * mu:     current physical memory utilization
 * maxmu:  maximum desired physical utilization
 * if mu exceeds maxmu we will create a free page pool deficiency
 * so that kswapd and friends can bring some pages back to the pool.
 *
 * algorithm: 
 *    assume compression rate piecewise linear and stable in time
 *    then    
 *            maxmu       maxpages
 *            -----   =   --------
 *            mu          usedpages
 *
 * this equation drives how many pages we have to take out of the system.
 */


#define HIGH_DECAY  ((tuning.high_decay*1024)/PAGE_SIZE)
#define LOW_DECAY   ((tuning.low_decay*1024)/PAGE_SIZE)
/*
 * Compute the reduced number of pages that should remain held after
 * one decay step.
 *
 * Each call gives up 1/500th of the held pages, clamped to the range
 * [LOW_DECAY, HIGH_DECAY] (the tuning values scaled by 1024/PAGE_SIZE).
 * With the 100ms call period mentioned by the original authors this is
 * roughly 2 percent per second.
 *
 * pages:   number of pages currently held
 * returns: new target count, never negative; 0 if pages <= 0
 */
static inline long mc_decayed_release(long pages)
{
	long step;
	long remaining;

	if ( pages <= 0 )
		return 0;

	step = pages/500;

	/* keep the step within the tunable limits */
	if ( step > HIGH_DECAY )
		step = HIGH_DECAY;
	else if ( step < LOW_DECAY )
		step = LOW_DECAY;

	remaining = pages - step;
	if ( remaining < 0 )
		remaining = 0;

	return remaining;
}

/*
 * Calculate how many pages should be taken away from the system to
 * reduce physical memory pressure.
 *
 * mu:    current physical memory utilization
 * maxmu: target physical memory utilization
 *
 * On a sufficiently large positive adjustment this updates the global
 * targets nr_rsrv_pages and nr_pages_per_thread and returns the
 * adjustment (in pages).  It returns 0, leaving the targets untouched,
 * when the adjustment is below L3_size/32 or when mu is 0.
 */
static inline long mc_adjust_check(unsigned long mu, unsigned long maxmu)
{
	long adj;
	long usedpages;
	long maxpages;
	long nrfree;
	long tmp;
	long actual_reserved; 
	long ceil;
	long hidden;

	/* mu is used as a divisor below; a zero utilization means there
	 * is no pressure to relieve, so request no adjustment instead of
	 * dividing by zero */
	if ( mu == 0 )
		return 0;
	
	/* in steady state nr_rsrv_pages == actual_reserved.
	 * in transient state nr_rsrv_pages might be much larger
	 * than actual_reserved because eatmem threads take
	 * time to allocate and clear memory */

	nrfree = nr_free_pages();
	usedpages = totalram_pages - nrfree;

	/*
	 * Maxpages is max number of used pages we should allow for.
	 * We are not going to count the pages we reserved 
	 * because they are zeroed and therefore they do 
	 * not contribute to the physical utilization mu 
	 */
	hidden = atomic_read(&nr_hidden_pages);
	hidden = (hidden/NR_CPUS)*NR_CPUS;  /* round down to a multiple of NR_CPUS */

	actual_reserved = atomic_read(&total_pages_held) + hidden;

	/* maxpages/usedpages = maxmu/mu; the division is done first,
	 * as in the original code */
	maxpages = (long)maxmu * ((usedpages-actual_reserved) / (long)mu);
	
	/* pages we should take out from the system 
	 *  to meet (mu <= maxmu) target  
	 */
	tmp  = totalram_pages - maxpages; 
	tmp  = ( (tmp+NR_CPUS-1)/NR_CPUS )*NR_CPUS;  /* round up to a multiple of NR_CPUS */

	/* 
	 * sanity check: no need to reserve more than the missing 
	 * amount of physical memory.  But the reason for this equation 
	 * (as opposed to (usedpages+nrfree)/2) is because,
	 * a) the user might have limited memory size to less than 
	 *    the real memory size.  For example, 512MB physical
	 *    and 1024MB real is in the system.  But user specifies 
	 *    mem=700M in /etc/lilo.conf.  
	 * b) we need to pressure the utilization down below mc_th.danger
	 * c) we add 10% to cover any slop we might have forgotten
	 *    such as SetPageReserved() pages.
	 * d) L3 contents need to be flushed out to the physical memory; 
	 *    (although fast page clear does that.)
	 */
	ceil = (long)totalram_pages 
		- ( (long)mc_th.release * ((long)cmp_num_physpages / 1000))
		+ (long)cmp_num_physpages / 10L + (long)L3_size;
	if ( tmp > ceil )
		tmp = ceil;

	/* sanity check: cannot reserve negative count of pages  */
	if ( tmp < 0) 
		tmp = 0;

	adj = tmp - actual_reserved;

	/* positive means grab more pages than previously;
	 * negative means release some pages */

	/* let the decay routine release the pages; we are not going 
	 * to do it here (adj < 0 case).  And if the positive 
	 * adjustment is too small don't bother (adj > 0 case) */
	/* NOTE(review): if L3_size is an unsigned type this comparison is
	 * performed unsigned and a negative adj would not take this early
	 * return -- confirm the declared type of L3_size */
	if ( adj < (L3_size/32) )  
		return 0;

	/* nr_rsrv_pages is what memeater threads should grab
	 * hidden is what page hiding threads are hiding 
	 */
	nr_rsrv_pages = tmp - hidden;
	nr_pages_per_thread = nr_rsrv_pages/smp_num_cpus;

	return adj;
}


/*
 * Compute a new hold target when utilization is climbing quickly.
 *
 * The utilization increase (curr_util - prev_util) is converted to
 * pages with memutil_to_pages() and added to the number of pages
 * already held (curr_held).  The result is capped at
 * totalram_pages - cmp_num_physpages and floored at 0.
 * The caller uses this even in the safe state 1.
 */
static long bump_fast(long curr_util, long prev_util, long curr_held)
{
	long target;

	target = memutil_to_pages (curr_util - prev_util);
	target = _MIN( curr_held + target, totalram_pages - cmp_num_physpages );

	return ( _MAX( target, 0 ) );
}

/* periods left before state 1 may start releasing held pages */
static int release_delay    __cacheline_aligned;
/* state computed by the most recent memcompress_check() call */
static volatile int cur_state __cacheline_aligned;
/* last state that was logged (only updated when mxt_printk is set) */
static atomic_t prev_state  __cacheline_aligned = { 0 }; 
/* utilization seen by the previous memcompress_check() call */
static atomic_t prev_util   __cacheline_aligned = { 0 }; 

/*
 * Periodic compression manager entry point (called from periodic_task).
 *
 * Samples the memory utilization, classifies it into state 1..5 and
 * adjusts the page reservation targets (nr_rsrv_pages and
 * nr_pages_per_thread) accordingly, waking the page eater threads when
 * the targets change.  In state 5 it additionally wakes the cpu
 * blocker threads.  Returns the state.
 */
int memcompress_check(void)
{
	long memutil;
	unsigned long flags;
	long th_bump;
	int bumped=0;
	int state;

	memutil = get_memutil();
	cur_state = state = memutil_2_state(memutil);

	if ( mxt_printk )
		if ( state != atomic_read(&prev_state) ) {
			/* eliminate excessive printing */
			MCPRINTK("memcompress_check mu=%ld state=%d\n", 
				 memutil, state );
			atomic_set (&prev_state, state);
		}

	/* bump the processes that are running up the
	 * compressed mem util 
	 */
	th_bump = tuning.bump_rate/(PAGE_SIZE/1024); /*pages per period*/
	/* express the threshold on the same 0..1000 scale as memutil */
	th_bump = (1000 * th_bump) / (long) cmp_num_physpages;

	/* utilization rose by more than th_bump since the last call:
	 * raise the reservation target right away */
	if (((long)memutil - (long)atomic_read(&prev_util) ) > th_bump) {
		spin_lock_irqsave(&mc_lock,flags);
		nr_rsrv_pages = bump_fast(memutil, atomic_read(&prev_util),
					  atomic_read(&total_pages_held) );
		nr_pages_per_thread = nr_rsrv_pages/smp_num_cpus;
		bumped=1;
		spin_unlock_irqrestore(&mc_lock, flags);
	}
	atomic_set (&prev_util, memutil);

	/* 
	 * This is main decision point to determine if
	 * there is compressed memory pressure (or not),
	 * and then allocate/clear pages (or free pages)
	 */
	switch (state) {
		
	case 1: /* safe state */

		if ( bumped )
			wake_up_cmp_eatmem();

		/* release pages back slowly  */
		if ( !bumped && (atomic_read(&total_pages_held) > 0) ) {
			spin_lock_irqsave(&mc_lock,flags);

			/* only start releasing once release_delay, armed in
			 * states 3-5, has counted down */
			if ( --release_delay < 0 ) {
				release_delay = 0;
				nr_rsrv_pages = mc_decayed_release( 
					atomic_read(&total_pages_held) );
				nr_pages_per_thread = 
					nr_rsrv_pages/smp_num_cpus;
				wake_up_cmp_eatmem();
			}
			spin_unlock_irqrestore(&mc_lock, flags);
		}
		break;

	case 2: /* mc_th.release */

		/* hold steady: only act on a fast bump */
		if ( bumped )
			wake_up_cmp_eatmem();

		break;

	case 5: /* mc_th.intr: we are near edge; start blocking CPUs */

		spin_lock_irqsave(&mc_lock,flags);

		cmp_grab_cpus();

		zap_bad_processes();

		/* bring it down to acquire threshold */
		mc_adjust_check (memutil, mc_th.acquire);

		/* unlike states 3 and 4, wake the eaters unconditionally */
		wake_up_cmp_eatmem();

		release_delay = (tuning.delay * HZ)/tuning.period;

		spin_unlock_irqrestore(&mc_lock, flags);

		break;

	case 4: /* mc_th.danger */

		spin_lock_irqsave(&mc_lock,flags);

		zap_bad_processes();

		if ( bumped )
			wake_up_cmp_eatmem();

		/* bring it down to acquire threshold */
		if ( mc_adjust_check (memutil, mc_th.acquire) > 0 ) {
			wake_up_cmp_eatmem();
		}

		release_delay = (tuning.delay * HZ)/tuning.period;

		spin_unlock_irqrestore(&mc_lock, flags);

		break;

	case 3: /* mc_th.acquire */

		spin_lock_irqsave(&mc_lock,flags);

		if ( bumped )
			wake_up_cmp_eatmem();

		/* bring it down to acquire threshold */
		if ( mc_adjust_check (memutil, mc_th.acquire) > 0 ) {
			wake_up_cmp_eatmem();
		}

		release_delay = (tuning.delay * HZ)/tuning.period;

		spin_unlock_irqrestore(&mc_lock, flags);

		break;

	}

	return state;
}

/* --------------------- Section 7 ------------------------------ */

/*
 * cmp_idle thread routines prevent compressed memory pressure
 * by grabbing cpus. These routines
 * 1) start cmp_idle threads one per cpu, 
 * 2) increase or decrease priority of big tasks, kswapd, 
 *    and cmp_idle threads as needed
 * 3) grab cpus at higher than big task priority levels
 * 4) grab cpus at higher than default priority level
 */

/*
 * Start one cmp_idle (cpu blocker) kernel thread per CPU, passing each
 * its index as the thread argument.  Does nothing when compression is
 * not enabled.  Always returns 0.
 */
static int cmp_idle_init(void)
{
	int cpu = 0;

	if ( mxt_compression_enabled ) {
		printk("<MC> Starting cpu blocker threads: %d\n", smp_num_cpus);  
		while ( cpu < smp_num_cpus ) {
			kernel_thread(cmp_idle, (void*)cpu, 
				      CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
			cpu++;
		}
	}
	return 0;
}

/*
 * Body of a cpu blocker thread; cmp_idle_init() starts one per CPU.
 *
 * Under extreme physical memory pressure caused by low compressibility
 * these threads do nothing but burn CPU cycles, keeping other tasks
 * from running (and degrading compressibility further) until kswapd
 * has cleaned up.  This was chosen over modifying the scheduler
 * because it needs fewer changes to the original kernel; the threads
 * are meant for compression emergencies only.
 *
 * cpu_num carries the thread index.  Thread 0 also starts the periodic
 * task and is the one that decides when spinning may stop.
 * The function never returns.
 */
int cmp_idle(void *cpu_num)
{
	struct task_struct *self;
	int id = (int) cpu_num;
	char banner[] = "mxtspn0"; 

	daemonize();

	/* detach from the creator and name the thread "mxtspn<id>" */
	lock_kernel();
	self = current;
	self->session = 1;
	self->pgrp = 1;
	sigfillset(&self->blocked);
	self->tty = NULL;    /* get rid of controlling tty */
	self->policy = SCHED_OTHER;
	set_user_nice(self, -18);
	banner[ sizeof("mxtspn0") - 2 ] += id;
	strcpy(self->comm, banner );
	cmp_idle_task[id] = self;
	unlock_kernel();

	if ( id == 0 ) 
		start_periodic_task(); /* start one periodic task only */

	for (;;) {
		if ( cmp_idle_run == CMP_IDLE_SLEEP ) {
			/* nothing to do: sleep until cmp_grab_cpus() */
			interruptible_sleep_on(&cmp_idle_wait);
			continue;
		}

		/* we're told to wake up and begin wasting cpu cycles */
		MCPRINTK("%s up\n", banner);
		do {
			int tick;

			/* waste cpu cycles */
			for (tick = 0; tick < tuning.spin_period; tick++)
				udelay(1000000/HZ); 

			if ( current->need_resched ) {	
				set_current_state(TASK_RUNNING);
				schedule();
			}

			if ( id == 0 ) {
				/*
				 * The first thread watches the utilization;
				 * once it has dropped below state 4 it tells
				 * all threads to quit spinning.  Holding on
				 * until then (rather than until state 5 is
				 * left) adds some hysteresis.
				 */
				unsigned long memutil = get_memutil();

				if ( memutil_2_state(memutil) < 4 ) {
					unsigned long flags;

					spin_lock_irqsave(&mc_lock,flags);
					cmp_release_cpus();
					spin_unlock_irqrestore(&mc_lock,
							       flags);
					mxt_clear_sutlr_interrupt();
					mxt_enable_sutlr_interrupt();
				}
			}
		} while ( cmp_idle_run != CMP_IDLE_SLEEP );
		MCPRINTK("%s down\n", banner);
	}
}

/*
 * Switch the cmp_idle threads to spinning mode and wake any of them
 * that are sleeping on cmp_idle_wait.
 */
static void cmp_grab_cpus(void)
{
	cmp_idle_run = CMP_IDLE_RUN;

	if ( !waitqueue_active( &cmp_idle_wait ) )
		return;

	wake_up_interruptible_all(&cmp_idle_wait);
}

/*
 * Tell spinning cmp_idle threads to go back to sleep.  The flag is
 * only written when it currently says CMP_IDLE_RUN.
 */
static void cmp_release_cpus(void)
{
	if ( cmp_idle_run != CMP_IDLE_RUN )
		return;

	cmp_idle_run = CMP_IDLE_SLEEP;  
}


/* --------------------- Section 8 ------------------------------ */

/*
 * Timer routine calls kernel compression service periodically
 */

#ifdef MXT_OLD_TIMERS
/* task queue entry that keeps re-queueing itself on tq_timer */
static struct tq_struct  tq; 
/* invocations left before the next memcompress_check() call */
static int tq_counter=2000; /* any nonzero initial value will do */

/* This routine is called each time tq_timer queue is run
 * The routine will enqueue itself again.
 *
 * run: the tq.data value; zero asks the task to stop, in which case
 *      tq.routine is cleared (stop_periodic_task() waits for that)
 *      and the task is not re-queued.
 */
static inline void periodic_task(void * run)
{

	if ( ! run ) {
		tq.routine = NULL;
		return;
	}

	/* run the check once every HZ/tuning.period invocations */
	if ( --tq_counter <= 0 ) {
		memcompress_check();
		tq_counter = HZ/tuning.period;
	}

	/* enqueue the task to the timer queue which runs HZ/second */
	queue_task( &tq, &tq_timer ); 
}

/*
 * Arm the self re-queueing periodic task on the tq_timer queue.
 * A nonzero tq.data tells periodic_task() to keep running.
 */
static void start_periodic_task(void)
{
	tq.routine = periodic_task; /* function to execute */
	tq.data = (void*) 1;        /* argument of the function */
	queue_task(&tq, &tq_timer);
}

/*
 * Ask the periodic task to stop and wait until it has done so.
 * Loops, yielding the CPU, until periodic_task() clears tq.routine.
 */
static void stop_periodic_task(void)
{
	unsigned long flags;
	
	/* this tells periodic task to stop during its last execution */
	spin_lock_irqsave(&tqueue_lock, flags);
	tq.data = (void*) 0; 
	spin_unlock_irqrestore(&tqueue_lock, flags);

	/* wait until the task stops
	 * tq.routine == NULL is an indication that it stopped 
	 */
	while ( tq.routine ) {
		set_current_state(TASK_RUNNING);
		yield();
	}
}
#else  /* MXT_OLD_TIMERS */

#include <linux/timer.h>

#define RUN_AT(x)   (jiffies + (x))
static struct timer_list tq;


/*
 * Timer callback: run one memcompress_check() pass and re-arm the
 * timer so the callback fires again HZ/tuning.period ticks from now.
 *
 * data: the timer's data word; when zero the callback does nothing
 *       and does not re-arm the timer.
 */
static inline void periodic_task(unsigned long data)
{
	if ( !data )
		return;

	memcompress_check();	

	/* repeat so many ticks a second */ 
	tq.expires = RUN_AT( HZ/tuning.period );
	add_timer(&tq);
}

/*
 * Initialize the timer and schedule the first periodic_task() run
 * ten seconds from now.  The nonzero data word tells the callback
 * to keep re-arming itself.
 */
static void start_periodic_task(void)
{
	init_timer(&tq);

	tq.expires = RUN_AT(10*HZ);   /* wait 10 seconds initially */
	tq.data = 1;                  /* argument of the function */
	tq.function = &periodic_task; /* function to execute */

	add_timer(&tq);
}

/*
 * Stop the periodic timer.  The data word is cleared first (a zero
 * argument makes periodic_task() return without re-arming), then the
 * timer is removed with del_timer_sync().
 */
static void stop_periodic_task(void)
{
	tq.data = 0; 
	del_timer_sync( &tq );
}
#endif  /* MXT_OLD_TIMERS */


/* --------------------- Section 9 ------------------------------ */
/*
 *  Page clearing threads reduce compression pressure by allocating
 *  pages from the system and clearing them with zeros.
 *  A 4KB page filled with zeros occupies only 64 bytes in physical
 *  memory. 
 */

/* #define EATSIZE  ((long)freepages.high) */
/* upper bound on pages requested per alloc_clear() iteration */
#define EATSIZE  256

/*
 * Allocate up to nr_pages single pages with gfp_mask, clear each one
 * and add it to the list at head.
 *
 * Pages are cleared with mxt_clear_page() when fclr is set or the
 * system is in state 5, otherwise with clear_highpage().
 * When an allocation fails, kswapd and bdflush are woken, the tq_disk
 * queue is run and the loop stops early.
 *
 * Returns the number of pages actually allocated and cleared.
 */
static long __alloc_clear(long nr_pages,
			  unsigned int gfp_mask, struct list_head * head)
{
	long count=0;    /* pages handled since the last shortage check */
	long cleared=0;
	struct page * page;

	while ( cleared < nr_pages ) {

		page = alloc_pages(gfp_mask,0);
		if ( ! page ) {
			MCPRINTK("failed allocation %s %d %8x\n", 
				 __FILE__, __LINE__, gfp_mask);
			if (waitqueue_active(&kswapd_wait))
				wake_up_interruptible(&kswapd_wait);
			wakeup_bdflush();
			run_task_queue(&tq_disk);
			break;
		}
		
		if ( fclr || cur_state == 5)
			mxt_clear_page(page);
		else
			clear_highpage(page);
		++cleared;
		list_add( &page->list, head );

		/* give way if a reschedule has been requested */
		if ( current->need_resched ) {
			set_current_state(TASK_RUNNING);
			schedule();
		}

		/* check once in a while if there is shortage
		 * (roughly every pages_min/4 pages) */
		if ( count++ < pages_min/4  )
			continue;

		count=0;

		/* free pages fell below the pages_high watermark: kick the
		 * reclaim machinery and yield so it can make progress */
		if ( nr_free_pages() < pages_high ) {

			if (waitqueue_active(&kswapd_wait))
				wake_up_interruptible(&kswapd_wait);
			
			wakeup_bdflush();
			run_task_queue(&tq_disk);

			set_current_state(TASK_RUNNING);
			yield();
		}

	}
	return cleared;
}

/*
 * Recompute the global pages_min, pages_low and pages_high watermarks
 * as the sum of the corresponding per-zone watermarks over every zone
 * of every node on pgdat_list.
 */
void low_mem_watermarks(void)
{
	pg_data_t *pgdat;
	zone_t *zone;

	pages_min = 0;
	pages_low = 0;
	pages_high = 0;

	for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) {
		zone_t *end = pgdat->node_zones + pgdat->nr_zones;

		for (zone = pgdat->node_zones; zone < end; zone++) {
			pages_min += zone->pages_min;
			pages_low += zone->pages_low;
			pages_high += zone->pages_high;
		}
	}
}

/* 
 * Allocate, clear and add pages to the list.
 * Use hardware fast clear to clear & invalidate L3 cache. 
 * This helps reduce compression pressure.
 *
 * nr_pages:  number of additional pages this thread should hold
 * held_list: list the new pages are added to
 *
 * Works in chunks of at most EATSIZE pages and adds every chunk to
 * total_pages_held.  Stops when nr_pages have been obtained, or when
 * nr_pages_per_thread drops below the value it had on entry.
 * Returns the number of pages processed.
 */
static long alloc_clear(long nr_pages, struct list_head * held_list)
{
	int total;
	int ask;
	int remainder=nr_pages;
	long cleared;
	/* per-thread target at entry, to detect a lowered target */
	volatile long initial = nr_pages_per_thread;

	while (1) {

		total = nr_free_pages() + nr_inactive_clean_pages();

		/* this will try pushing usage down to pages_min;
		 * the surplus is split between the per-cpu threads */
		ask = _MAX( total - pages_min, 0L ) / smp_num_cpus;

		/* don't alloc more than requested or more than
		 * EATSIZE per iteration */
		ask = _MIN( ask, remainder );
		ask = _MIN( ask, EATSIZE );

		/* ZONE_HIGHMEM inclusive of ZONE_NORMAL and ZONE_DMA
		 * see mm/page_alloc.c::build_zonelists() */
		cleared = __alloc_clear (ask, GFP_HIGHUSER, held_list);

		atomic_add (cleared, &total_pages_held);
		remainder -= cleared;
		
		if ( remainder <= 0 )
			return (nr_pages-remainder);

		/* nr pages to be held has been reduced; quit now */
		if ( nr_pages_per_thread < initial ) {
			return (nr_pages-remainder);
		}

		/* yield in anticipation of kswapd and friends making
		   new free pages */
		if ( !cleared ) {
			set_current_state(TASK_RUNNING);
			schedule();
		}

	}
}


/*
 * Free pages from the front of held_list.
 *
 * nr_pages:  number of pages to free (ignored when all is set)
 * held_list: list of pages held by the calling thread
 * all:       nonzero means free everything on the list
 *
 * Freeing stops early when nr_pages_per_thread rises above the value
 * it had on entry, i.e. someone wants more pages to be held.
 * total_pages_held is reduced in batches rather than per page to
 * avoid a flurry of atomic updates on the bus.
 * Returns the number of pages freed.
 */
static long release_cleared (long nr_pages,
			     struct list_head * held_list,
			     int all)
{
	struct page * pg;
	long freed=0;
	long pending=0;   /* freed but not yet subtracted from the total */
	volatile long start_target = nr_pages_per_thread;

	for (;;) {
		if ( list_empty(held_list) )
			break;
		if ( !all && (freed >= nr_pages) )
			break;

		pg = list_entry(held_list->next, struct page, list);
		list_del_init( &pg->list );
		__free_page(pg);
		++freed;
		++pending;

		/* target was raised meanwhile: stop freeing right away */
		if ( nr_pages_per_thread > start_target )
			break;

		/* flush the batch before giving up the CPU */
		if ( current->need_resched && !all ) {
			atomic_sub (pending, &total_pages_held);
			pending = 0;
			schedule();
		}
	}

	atomic_sub (pending, &total_pages_held);

	return freed;
}

/* 
 * use before module unload 
 *
 * Sets both reservation targets to zero, wakes the page eater threads
 * (which free everything they hold when their target is zero) and
 * yields the CPU once.
 * NOTE(review): nothing here waits for the threads to finish
 * releasing -- confirm a single yield() is sufficient for the caller.
 */
static void release_held_pages(void)
{
	printk("<MC> release_held_pages: enter\n");

	nr_rsrv_pages = 0;
	nr_pages_per_thread = 0;
	wake_up_cmp_eatmem();
	set_current_state(TASK_RUNNING);
	yield();
}


/*
 * Body of a page eater thread; cmp_eatmem_init() starts one per CPU.
 *
 * The thread sleeps on cmp_eatmem_wait.  Each time it is woken it
 * moves the number of pages it holds towards nr_pages_per_thread:
 * allocating and clearing pages when it holds too few, freeing pages
 * when it holds too many, and freeing everything when the target is
 * zero.  cpu_num carries the thread index.  Never returns.
 */
static int cmp_eatmem(void *cpu_num)
{
	struct task_struct *tsk = current;
	int myid = (int) cpu_num;
	char banner[] = "mxtclr0"; 
	LIST_HEAD(held_list);   /* pages held by this thread */
	long held=0;            /* number of pages on held_list */
	long count=0;

	/* NOTE(review): LIST_HEAD() above already initializes the list */
	INIT_LIST_HEAD(&held_list);

	daemonize();

	/* detach from the creator and name the thread "mxtclr<id>" */
	lock_kernel();
	tsk->session = 1;
	tsk->pgrp = 1;
        sigfillset(&tsk->blocked);
        tsk->tty = NULL;    /* get rid of controlling tty */
	tsk->policy = SCHED_OTHER;
	set_user_nice(tsk, -19);
	banner[ sizeof("mxtclr0") - 2 ] += myid;
	strcpy(tsk->comm, banner );
	cmp_eatmem_task[myid] = tsk;
	/* bind to a processor */
	/* NOTE(review): cpu_logical_map() is applied to the bit mask
	 * (1<<myid) rather than to the cpu index -- confirm this yields
	 * the intended single-cpu mask */
	tsk->cpus_allowed &= cpu_logical_map(1<<myid); 
	unlock_kernel();

	while(1) {
		long nr;

		interruptible_sleep_on(&cmp_eatmem_wait);

		if ( held < 0 ) 
			MCPRINTK("<MC> sanity check: held = %ld\n", held);

	more_pages_to_alloc:
		
		/* snapshot of the target; it may change while we work */
		nr = nr_pages_per_thread;
		
		if ( ! nr ) {
			/* free all of them */
			count = release_cleared(0, &held_list, 1);
			held -= count;
		} else if ( nr > held ) {
			/* allocate and clear pages */
			count=alloc_clear(nr-held, &held_list);
			held += count;
		}
		else {
			/* free some of them */
			count = release_cleared(held-nr, &held_list, 0);
			held -= count;
		} 

		/* nr_pages_per_thread changed while we were releasing 
		   or allocating.  or we didn't get enough
		   go at it again */
		if (nr != nr_pages_per_thread || held < nr_pages_per_thread)
			goto more_pages_to_alloc;
	}
}

/*
 * Refresh and print the memory watermarks, then start one cmp_eatmem
 * (page eater) kernel thread per CPU, passing each its index.
 * Does nothing when compression is not enabled.  Always returns 0.
 */
static int cmp_eatmem_init(void)
{
	int cpu;

	if ( mxt_compression_enabled ) {
		low_mem_watermarks();
		printk("<MC> Memory watermarks min,low,high=%d,%d,%d\n", 
		       pages_min, pages_low, pages_high);

		printk("<MC> Starting page eater threads: %d\n", smp_num_cpus);  
		cpu = 0;
		while ( cpu < smp_num_cpus ) {
			kernel_thread(cmp_eatmem, (void*)cpu, 
				      CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
			cpu++;
		}
	}
	return 0;
}


/*
 * Wake every page eater thread sleeping on cmp_eatmem_wait.
 * Nothing is done when the wait queue has no waiters.
 */
static void wake_up_cmp_eatmem(void)
{
	if ( !waitqueue_active( &cmp_eatmem_wait ) )
		return;

	wake_up_interruptible_all( &cmp_eatmem_wait );
}


/* --------------------- Section 10 ------------------------------ */
/*
 * Predict processes causing compression pressure.
 * Based on oom_kill.c
 */

/*
 * Decide whether task p is big enough to be stalled.
 *
 * Tasks without an mm (kernel threads) are never heavy.  Otherwise a
 * task is heavy when its total_vm exceeds L3_size: L3 contents are
 * not accounted for in the compressed memory utilization, so a worst
 * case L3 flush shows up as a spike of that size.
 *
 * Returns 1 for a heavy process, 0 otherwise.
 */
static inline int is_heavy_process(struct task_struct *p)
{
	unsigned long limit = L3_size;

	/* kernel processes should not be stalled */
	if ( ! p->mm )
		return 0;

	return ( p->mm->total_vm > limit ) ? 1 : 0;
}

/*
 * Take away whatever is left of the task's time slice so that it is
 * not considered in the next round of scheduling.  The field is only
 * written when it is currently nonzero.
 */
static inline void force_reschedule(struct task_struct *tsk)
{
	if ( !tsk->time_slice )
		return;

	tsk->time_slice = 0;
}


/*
 * Steal scheduling ticks from heavy processes.
 *
 * Walks the task list under tasklist_lock and clears the time slice
 * of every task with a nonzero pid that is_heavy_process() flags.
 * This has to be repeated on every poll; that looks wasteful, but in
 * the danger zone CPU cycles are being wasted on purpose by the
 * cmp_idle threads anyway.  Suspending processes instead would mean
 * remembering which ones were suspended and coping with them dying
 * in the meantime; this method is stateless.
 */
static void zap_bad_processes(void)
{
	struct task_struct *task = NULL;

	read_lock(&tasklist_lock);
	for_each_task(task) {
		if ( task->pid && is_heavy_process(task) )
			force_reschedule(task);
	}
	read_unlock(&tasklist_lock);
}

/* --------------------- Section 11 ------------------------------ */

/* These routines are for covering the worst case condition of 
 * incompressible kernel pages.  Discussion per Alan Cox on 5/22/2001. 
 * For every non-user page (!= GFP_HIGHUSER or GFP_USER) we will allocate
 * and hide away one page.  The hidden page will be cleared.
 * This ensures that there is a matching cleared page for every
 * incompressible page.  Therefore maintaining 2 to 1 compression ratio.
 */
#ifdef CONFIG_MXT_HIDDEN_PAGES
/*
 * Body of the kernel page reserving thread ("mxtrsrv").
 *
 * After an initial 60 second delay the thread sleeps on
 * cmp_hide_pages_wait.  Each time it is woken it moves the number of
 * hidden pages (nr_hidden_pages) towards the requested number
 * (nr_hide_pages, capped at kernel_reserve): missing pages are
 * allocated, cleared and kept on a private list; surplus pages are
 * freed from that list.
 *
 * cpu_num is unused.  Never returns.
 */
static int cmp_hide_pages(void *cpu_num)
{
	struct task_struct *tsk = current;
	struct page *page;
	char banner[] = "mxtrsrv"; 
	LIST_HEAD(held_list);   /* pages hidden by this thread */

	daemonize();

	lock_kernel();
	tsk->session = 1;
	tsk->pgrp = 1;
        sigfillset(&tsk->blocked);
        tsk->tty = NULL;    /* get rid of controlling tty */
	tsk->policy = SCHED_OTHER;
	set_user_nice(tsk, 0);
	strcpy(tsk->comm, banner );
	unlock_kernel();

	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule_timeout( HZ * 60 );  

	while(1) {
		int m,n;

		interruptible_sleep_on(&cmp_hide_pages_wait);

		m = atomic_read(&nr_hide_pages);    /* requested */
		n = atomic_read(&nr_hidden_pages);  /* currently hidden */

		m = _MIN(m, kernel_reserve);

		if ( m > n ) {
			int i;
			for(i=0; i<(m-n); i++) {
				page = alloc_pages(GFP_HIGHUSER,0);
				if ( ! page ) {
					/* out of memory: stop here instead
					 * of dereferencing NULL; the rest is
					 * retried on the next wake up */
					break;
				}
				list_add( &page->list, &held_list );
				atomic_inc(&nr_hidden_pages);

				if ( fclr )
					mxt_clear_page(page);
				else
					clear_highpage(page);
				
				if ( current->need_resched ) {
					set_current_state(TASK_RUNNING);
					yield();
				}
			}
		}
		else {
			int i;
			for(i=0; i<(n-m); i++) {
				
				if ( list_empty(&held_list) )
					break;

				page = list_entry(held_list.next, 
						  struct page, list);
				list_del_init( &page->list );
				__free_page(page);
				atomic_dec(&nr_hidden_pages);
				
				if ( current->need_resched ) {
					set_current_state(TASK_RUNNING);
					yield();
				}
			}
		}
	}
}

/*
 * Start the single kernel page reserving thread (cmp_hide_pages).
 * Does nothing when compression is not enabled.  Always returns 0.
 */
static int cmp_hide_pages_init(void)
{
	if ( mxt_compression_enabled ) {
		printk("<MC> Starting kernel page reserving thread\n");

		kernel_thread(cmp_hide_pages, 0, 
			      CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	}
	return 0;
}

/* 
 * use before module unload 
 *
 * Sets the requested number of hidden pages to zero, wakes the page
 * hiding thread via wake_up_hide_pages() and yields the CPU once.
 * NOTE(review): nothing here waits for the thread to finish freeing
 * its pages -- confirm a single yield() is sufficient for the caller.
 */
static void release_hidden_pages(void)
{
	printk("<MC> release_hidden_pages: enter\n");

	atomic_set(&nr_hide_pages, 0);
	wake_up_hide_pages();
	set_current_state(TASK_RUNNING);
	yield();
}
#else
/* No-op stand-ins used when CONFIG_MXT_HIDDEN_PAGES is not configured. */
static void release_hidden_pages(void) {}
/* Returns 0, like the real initializer. */
static int cmp_hide_pages_init(void) { return 0; }
#endif

/*
 * Wake the page hiding thread if it is sleeping on
 * cmp_hide_pages_wait.  Compiles to an empty function when
 * CONFIG_MXT_HIDDEN_PAGES is not configured.
 */
void wake_up_hide_pages(void)
{
#ifdef CONFIG_MXT_HIDDEN_PAGES
	if ( !waitqueue_active(  &cmp_hide_pages_wait ) )
		return;

	wake_up_interruptible_all( &cmp_hide_pages_wait );
#endif
}


/* --------------------- Section 100 ------------------------------ */

/*
 *   Module initialization
 */
static const char copyrite[] = "MXT Driver, Copyright (C) 1999-2001, IBM";

/*
 * Module entry point.
 *
 * Initializes the locks and scans the BIOS for MXT support; a nonzero
 * status from mxt_scan_bios() is returned as is and nothing else is
 * set up.  Otherwise the VM hooks, sysctl entries, interrupts, page
 * clear hook and the helper threads are set up in that order and 0 is
 * returned.
 *
 * NOTE(review): the results of the setup calls after mxt_scan_bios()
 * are not checked here -- confirm whether any of them can fail in a
 * way that should abort initialization.
 */
static int __init mxt_init(void)
{
	int status;

	spin_lock_init(&fast_page_lock);
	spin_lock_init(&mc_lock);

	status = mxt_scan_bios();
	if ( status ) 
		return status;

	printk("%s\n", copyrite);
	mxt_setup_vm();
	mxt_register_sysctl();
	mxt_setup_interrupts();
	mxt_hook_clear_page();
	cmp_idle_init();
	cmp_eatmem_init();
	cmp_hide_pages_init();

	return 0;
}

/*
 * Module exit point.
 *
 * Returns right after the entry message when no MXT BIOS was found.
 * Otherwise it unregisters the sysctl entries, unhooks the page clear
 * routine, stops the periodic task, disconnects the interrupt, asks
 * the helper threads to release their pages, tells the cpu blocker
 * threads to stop spinning and cleans up the VM hooks.
 *
 * NOTE(review): the kernel threads started at init time are not
 * terminated here -- confirm this is safe for module unload.
 */
static void __exit mxt_exit(void)
{
	printk("<MC> mxt_exit: enter\n");
	if ( ! mxt_bios_found )
		return;
	mxt_unregister_sysctl();
	mxt_unhook_clear_page();
	stop_periodic_task();
	mxt_disconnect_interrupts();
	release_held_pages();
	release_hidden_pages();
	cmp_release_cpus();
	mxt_cleanup_vm();
	printk("<MC> mxt_exit: exit\n");   
}

module_init(mxt_init);
module_exit(mxt_exit);

MODULE_DESCRIPTION("MXT Compressed Memory Manager");
MODULE_AUTHOR("Bulent Abali <abali@us.ibm.com>");
MODULE_PARM(mxt_printk, "i");
MODULE_PARM_DESC(mxt_printk, "Enable debug mode");
