/*
 * Implement the manual drop-all-pagecache function
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
#include <linux/vmstat.h>
#include <linux/swap.h>
#include <linux/mmzone.h>
#include <linux/timex.h>
#include "internal.h"

/*
 * A global variable is a bit ugly, but it keeps the code simple.
 *
 * drop_caches_sysctl_handler() interprets this value as a bit mask after a
 * write: bit 0 (1) drops the page cache, bit 1 (2) drops slab objects and
 * bit 2 (4) silences the informational log message for later writes.
 * NOTE(review): presumably this is the storage the sysctl table entry points
 * at; the table itself is not defined in this file - confirm.
 */
int sysctl_drop_caches;

static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
	struct inode *inode, *toput_inode = NULL;

	spin_lock(&inode_sb_list_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		spin_lock(&inode->i_lock);
		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
		    (inode->i_mapping->nrpages == 0)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_sb_list_lock);
		invalidate_mapping_pages(inode->i_mapping, 0, -1);
		iput(toput_inode);
		toput_inode = inode;
		spin_lock(&inode_sb_list_lock);
	}
	spin_unlock(&inode_sb_list_lock);
	iput(toput_inode);
}

static void drop_slab(void)
{
	int nr_objects;
	struct shrink_control shrink = {
		.gfp_mask = GFP_KERNEL,
	};

	nodes_setall(shrink.nodes_to_scan);
	do {
		nr_objects = shrink_slab(&shrink, 1000, 1000);
	} while (nr_objects > 10);
}

/*
 * drop_caches_sysctl_handler - sysctl handler for the drop_caches knob
 * @table:  sysctl table entry, forwarded to proc_dointvec_minmax()
 * @write:  non-zero when user space is writing the value
 * @buffer: user buffer
 * @length: in/out length of @buffer
 * @ppos:   file position
 *
 * After a successful write, sysctl_drop_caches is treated as a bit mask:
 * 1 drops the page cache of every superblock, 2 drops slab objects, and
 * 4 suppresses the log message for all subsequent writes.
 *
 * Returns the proc_dointvec_minmax() error if there is one, otherwise 0.
 */
int drop_caches_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	static int stfu;	/* non-zero once a value with bit 2 was written */
	int err;

	err = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (err)
		return err;
	if (!write)
		return 0;

	if (sysctl_drop_caches & 1) {
		iterate_supers(drop_pagecache_sb, NULL);
		count_vm_event(DROP_PAGECACHE);
	}
	if (sysctl_drop_caches & 2) {
		drop_slab();
		count_vm_event(DROP_SLAB);
	}

	/* The write that sets bit 2 is itself still logged. */
	if (!stfu)
		pr_info("%s (%d): drop_caches: %d\n",
			current->comm, task_pid_nr(current),
			sysctl_drop_caches);
	stfu |= sysctl_drop_caches & 4;

	return 0;
}

/* sGW "weak drop-cache" support */
/*
 * This sysctl is meant to limit memory fragmentation caused by the file
 * cache: it purges *clean* file-cache pages at the requested rate.
 * 1) Dirty file-cache pages are never purged.
 *    Purging them could make a user process sleep unintentionally.
 */
/* A global variable is a bit ugly, but it keeps the code simple */
/* Purge rate in percent; __sgw_refrain_fcache() stores the decoded value. */
int sgw_refrain_fcache_rate = 0;
/* Non-zero enables the DEBUG_PRINT*() messages. */
static int debug_sgw_refrain_fcache = 0;
/* Non-zero selects the emergency route in sgw_purge_clean_fcache(). */
static int sgw_emergency_refrain_fcache = 0;
/* Pages invalidated so far by the current emergency-route run. */
static int total_release_count = 0;
/* Page target of the current emergency-route run. */
static int total_should_release = 0;

/*
 * Return codes compared against the result of zone_reclaim().
 * The original note says these are defined in mm/internal.h.
 * NOTE(review): this file already includes "internal.h", so these look like
 * duplicate definitions - confirm and drop them if so.
 */
#define ZONE_RECLAIM_NOSCAN	-2
#define ZONE_RECLAIM_FULL	-1
#define ZONE_RECLAIM_SOME	0
#define ZONE_RECLAIM_SUCCESS	1

/*
 * Debug logging helpers: print at KERN_EMERG level, but only while
 * debug_sgw_refrain_fcache is non-zero. The digit in the name is the number
 * of arguments following the format string; the format must be a string
 * literal because KERN_EMERG is prepended by literal concatenation.
 *
 * Each body is wrapped in do { } while (0) so that an invocation followed by
 * ';' is exactly one statement and can be used safely in an unbraced
 * if/else.
 */
#define DEBUG_PRINT(a) \
	do { \
		if (debug_sgw_refrain_fcache) \
			printk(KERN_EMERG a); \
	} while (0)
#define DEBUG_PRINT1(a, b) \
	do { \
		if (debug_sgw_refrain_fcache) \
			printk(KERN_EMERG a, b); \
	} while (0)
#define DEBUG_PRINT2(a, b, c) \
	do { \
		if (debug_sgw_refrain_fcache) \
			printk(KERN_EMERG a, b, c); \
	} while (0)
#define DEBUG_PRINT3(a, b, c, d) \
	do { \
		if (debug_sgw_refrain_fcache) \
			printk(KERN_EMERG a, b, c, d); \
	} while (0)
#define DEBUG_PRINT4(a, b, c, d, e) \
	do { \
		if (debug_sgw_refrain_fcache) \
			printk(KERN_EMERG a, b, c, d, e); \
	} while (0)


/*
 * get_purge_page_num - number of file-cache pages a purge should release
 *
 * Adds up NR_ACTIVE_FILE and NR_INACTIVE_FILE over every zone that is not
 * highmem and returns sgw_refrain_fcache_rate percent of that total.
 * Outside emergency mode the effective rate is capped at 70 so that a mild
 * purge never removes too much of the cache.
 *
 * Returns 0 when sgw_refrain_fcache_rate is 0.
 */
static unsigned long get_purge_page_num(void)
{
	unsigned long rate;
	unsigned long nr, inact, act;
	struct zone *zone;

	if (sgw_refrain_fcache_rate == 0) {
		DEBUG_PRINT("get_purge_page_num: sgw_refrain_fcache == 0\n");
		return 0;
	}

	/* get current file cache size */
	nr = 0;
	for_each_zone(zone) {
		if (is_highmem(zone))
			continue;
		act = zone_page_state(zone, NR_ACTIVE_FILE);
		inact = zone_page_state(zone, NR_INACTIVE_FILE);
		/* the counters are unsigned long, so print them with %lu */
		DEBUG_PRINT3("get_purge_page_num: zone:%s act:%lu inact:%lu\n",
			     zone->name, act, inact);
		nr += act + inact;
	}
	DEBUG_PRINT1("get_purge_page_num: nr: %lu\n", nr);

	/* don't purge too much cache in non-emergency mode */
	if (!sgw_emergency_refrain_fcache && sgw_refrain_fcache_rate > 70) {
		rate = 70;
		DEBUG_PRINT1("Non-emerg mode: upper rate is 70: %lu\n", rate);
	} else {
		rate = sgw_refrain_fcache_rate;
	}
	DEBUG_PRINT2("get_purge_page_num: rate: %lu sgw_refrain_fcach: %d\n",
		     rate, sgw_refrain_fcache_rate);

	return (nr * rate) / 100;
}

/*
 * sgw_emerg_drop_pagecache - emergency-route callback for iterate_supers()
 * @sb:     superblock whose inode list is walked
 * @unused: ignored; present to match the iterate_supers() callback signature
 *
 * Based on drop_pagecache_sb(), but bounded: it stops as soon as
 * total_release_count reaches total_should_release, and returns immediately
 * for superblocks visited after the target has been met.
 *
 * invalidate_mapping_pages() performs no write-back; its return value is
 * the number of pages it invalidated.
 *
 * The walk deliberately runs with preemption enabled. iput() may sleep when
 * it drops the last reference, and invalidate_mapping_pages() calls
 * cond_resched(); neither is legal inside a preempt_disable() section, so
 * no such section is used here.
 */
static void sgw_emerg_drop_pagecache(struct super_block *sb, void *unused)
{
	struct inode *inode, *toput_inode = NULL;
	unsigned long ret;
	unsigned long rest;
	pgoff_t end;

	if (total_release_count >= total_should_release) {
		DEBUG_PRINT("emerg route: end\n");
		return;
	}

	DEBUG_PRINT2("emerg route: release:%d should:%d\n",
		     total_release_count, total_should_release);

	spin_lock(&inode_sb_list_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		spin_lock(&inode->i_lock);
		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
		    (inode->i_mapping->nrpages == 0)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_sb_list_lock);

		/*
		 * rest is at least 1 here: the function returns early, and
		 * the loop breaks, once the target has been reached.
		 */
		rest = total_should_release - total_release_count;
		if (inode->i_mapping->nrpages > rest) {
			/*
			 * The end offset is inclusive, so [0, rest - 1]
			 * spans exactly rest page offsets.
			 */
			end = rest - 1;
		} else {
			/* -1 means: invalidate all pages of this mapping */
			end = -1;
		}
		DEBUG_PRINT1("emerg route: call invalidate_mapping_pages() end=%ld\n", end);
		ret = invalidate_mapping_pages(inode->i_mapping, 0, end);
		/* release the previous inode only while no spinlock is held */
		iput(toput_inode);
		toput_inode = inode;
		DEBUG_PRINT2("emerg route: count: %d ret: %ld\n", total_release_count, ret);
		spin_lock(&inode_sb_list_lock);
		total_release_count += ret;
		/* complete? */
		if (total_release_count >= total_should_release) {
			DEBUG_PRINT("emerg route break\n");
			break;
		}
	}
	spin_unlock(&inode_sb_list_lock);
	iput(toput_inode);
}

/* 
 * inspired reclaim_clean_pages_from_list(), 
 * see mm/vmscan.c
 */
static int sgw_purge_clean_fcache(void)
{
    struct zone *zone;
    unsigned int o, order;
    int nr, ret;
    int count = 0;

    /* emergency! don't sleep, but busy! */
    if( sgw_emergency_refrain_fcache ){
        nr = get_purge_page_num();
        if( nr == 0 ){
            DEBUG_PRINT("return nr == 0 in emergency route\n");
            return 0;
        }
        DEBUG_PRINT1("emergency route:nr=%d\n", nr);
        total_release_count = 0;
        total_should_release = nr;
        DEBUG_PRINT2("goto iterate_supers(%d:%d)\n", sgw_refrain_fcache_rate,nr);
		iterate_supers(sgw_emerg_drop_pagecache, NULL);
    }
    /* use more mild means, but get possibility to sleep */
    else{
        nr = get_purge_page_num();
        if( nr == 0 ){
            DEBUG_PRINT("return nr == 0 in mild route\n");
            return 0;
        }
        DEBUG_PRINT1("mild route:nr=%d\n", nr);
        /* page num -> page order */
        order = fls(nr);
        order++;
	    for_each_zone(zone){
            DEBUG_PRINT2("zone %s: order:%d\n", zone->name, order);
            if(is_highmem(zone)){
                continue;
            }
            for( o = order; o > 0; o--){
            /* example of use zone_reclaim is contained in mm/page_alloc.c */
                ret = zone_reclaim(zone, GFP_KERNEL, o);
                DEBUG_PRINT3("zone %s: order:%d ret:%d\n", zone->name, o, ret);
                if( ret == ZONE_RECLAIM_SUCCESS	) {
                    count += (0x00000001 << o);
                    DEBUG_PRINT1("add count(%d)\n", count);
                }
                if( count > nr ){
                    DEBUG_PRINT("goto mild_purge_done\n");
                    goto mild_purge_done;
                }
            }
        }
    }
mild_purge_done:
    return 0;
}

/*
 * __sgw_refrain_fcache - decode a rate/mode value and optionally purge
 * @rate:  encoded request, see the table below
 * @write: non-zero to run the purge after decoding
 *
 * @rate means [(purge file cache num / current file cache num) * 100],
 * with the mode encoded in the hundreds:
 *   0 -  99: mild mode (may sleep)
 * 100 - 199: emergency mode (not sleep)
 * 200 - 299: mild mode + debug (print logs)
 * 300 - 399: emergency mode + debug (print logs)
 *
 * The decoded percentage is left in sgw_refrain_fcache_rate and the two mode
 * flags are rewritten on every call, including calls with @write == 0.
 *
 * Returns 0 on success, or -EINVAL (with the stored rate reset to 0) when
 * @rate is outside 0..399.
 */
static int __sgw_refrain_fcache(int rate, int write)
{
	unsigned long start, end;

	sgw_refrain_fcache_rate = rate;
	debug_sgw_refrain_fcache = 0;
	sgw_emergency_refrain_fcache = 0;

	/* param is out of range */
	if (sgw_refrain_fcache_rate < 0 || sgw_refrain_fcache_rate >= 400) {
		sgw_refrain_fcache_rate = 0;
		return -EINVAL;
	}
	/* debug mode */
	if (sgw_refrain_fcache_rate >= 200) {
		debug_sgw_refrain_fcache = 1;
		sgw_refrain_fcache_rate -= 200;
	}
	/* emergency mode */
	if (sgw_refrain_fcache_rate >= 100) {
		sgw_emergency_refrain_fcache = 1;
		sgw_refrain_fcache_rate -= 100;
	}

	if (write) {
		start = get_cycles();
		sgw_purge_clean_fcache();
		end = get_cycles();
		/*
		 * DEBUG_PRINT3() already prepends KERN_EMERG; passing it
		 * again would emit a second log-level prefix.
		 */
		DEBUG_PRINT3("%s start: 0x%lx end: 0x%lx\n", __func__, start, end);
	}
	return 0;
}

/*
 * sgw_refrain_fcache - exported entry point for purging the file cache
 * @rate: encoded rate/mode value, decoded by __sgw_refrain_fcache()
 *
 * Always passes write = 1, so a valid @rate triggers a purge right away.
 * Returns the result of __sgw_refrain_fcache().
 */
int sgw_refrain_fcache(int rate)
{
    return __sgw_refrain_fcache(rate, 1);
}
EXPORT_SYMBOL_GPL(sgw_refrain_fcache);

/*
 * sgw_refrain_fcache_handler - sysctl handler for the sGW purge knob
 * @table:  sysctl table entry, forwarded to proc_dointvec_minmax()
 * @write:  non-zero when user space is writing the value
 * @buffer: user buffer
 * @length: in/out length of @buffer
 * @ppos:   file position
 *
 * proc_dointvec_minmax() converts the ASCII argument received through procfs
 * to an int. The current sgw_refrain_fcache_rate is then handed to
 * __sgw_refrain_fcache(), which decodes it and purges only when @write is set.
 * NOTE(review): presumably the table entry stores into
 * sgw_refrain_fcache_rate; the table is not defined in this file - confirm.
 *
 * Returns the proc_dointvec_minmax() error if there is one, otherwise the
 * result of __sgw_refrain_fcache().
 */
int sgw_refrain_fcache_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	int err = proc_dointvec_minmax(table, write, buffer, length, ppos);

	return err ? err : __sgw_refrain_fcache(sgw_refrain_fcache_rate, write);
}

