From f5609896320ac4fee6a69870bb844bdbea74903f Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Mon, 15 Dec 2014 13:25:39 +0200 Subject: [PATCH 001/420] BACKPORT: nl80211: Stop scheduled scan if netlink client disappears (cherry pick from commit 93a1e86ce10e4898f9ca9cd09d659a8a7780ee5e) An attribute NL80211_ATTR_SOCKET_OWNER can be set by the scan initiator. If present, the attribute will cause the scan to be stopped if the client dies. Signed-off-by: Jukka Rissanen Signed-off-by: Johannes Berg Bug: 25561044 Change-Id: I2343e494d2f4b4154ce945b61b97a68384fa92c1 --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 5 ++++- net/wireless/core.c | 16 ++++++++++++++++ net/wireless/core.h | 2 ++ net/wireless/nl80211.c | 16 ++++++++++++++++ 5 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a2ddcf2398fdad..4bbf53b6b8c36d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1465,6 +1465,8 @@ struct cfg80211_match_set { * @channels: channels to scan * @min_rssi_thold: for drivers only supporting a single threshold, this * contains the minimum over all matchsets + * @owner_nlportid: netlink portid of owner (if this should is a request + * owned by a particular socket) */ struct cfg80211_sched_scan_request { struct cfg80211_ssid *ssids; @@ -1483,6 +1485,7 @@ struct cfg80211_sched_scan_request { struct wiphy *wiphy; struct net_device *dev; unsigned long scan_start; + u32 owner_nlportid; /* keep last */ struct ieee80211_channel *channels[0]; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4b28dc07bcb1fd..ed0bcc82fad18a 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1608,7 +1608,10 @@ enum nl80211_commands { * * @NL80211_ATTR_IFACE_SOCKET_OWNER: flag attribute, if set during interface * creation then the new interface will be owned by the netlink socket - * that created it and will be destroyed when the socket is closed + * that created it and will be destroyed when the socket is closed. + * If set during scheduled scan start then the new scan req will be + * owned by the netlink socket that created it and the scheduled scan will + * be stopped when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. 
diff --git a/net/wireless/core.c b/net/wireless/core.c index f52a4cd7017c85..9d2d8923f10fb9 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -307,6 +307,20 @@ static void cfg80211_destroy_iface_wk(struct work_struct *work) rtnl_unlock(); } +static void cfg80211_sched_scan_stop_wk(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(work, struct cfg80211_registered_device, + sched_scan_stop_wk); + + rtnl_lock(); + + __cfg80211_stop_sched_scan(rdev, false); + + rtnl_unlock(); +} + /* exported functions */ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) @@ -368,6 +382,7 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) INIT_LIST_HEAD(&rdev->destroy_list); spin_lock_init(&rdev->destroy_list_lock); INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk); + INIT_WORK(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk); #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; @@ -722,6 +737,7 @@ void wiphy_unregister(struct wiphy *wiphy) flush_work(&rdev->event_work); cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); flush_work(&rdev->destroy_work); + flush_work(&rdev->sched_scan_stop_wk); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) diff --git a/net/wireless/core.h b/net/wireless/core.h index 7e3a3cef7df93b..8177b2d9c3f16c 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -84,6 +84,8 @@ struct cfg80211_registered_device { struct list_head destroy_list; struct work_struct destroy_work; + struct work_struct sched_scan_stop_wk; + /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wiphy wiphy __aligned(NETDEV_ALIGN); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5839c85075f154..024bf7b3d29618 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5835,6 +5835,9 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, err = rdev_sched_scan_start(rdev, dev, request); if (!err) { + if (info->attrs[NL80211_ATTR_IFACE_SOCKET_OWNER]) + request->owner_nlportid = info->snd_portid; + rdev->sched_scan_req = request; nl80211_send_sched_scan(rdev, dev, NL80211_CMD_START_SCHED_SCAN); @@ -11949,6 +11952,13 @@ static int nl80211_netlink_notify(struct notifier_block * nb, list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) { bool schedule_destroy_work = false; + bool schedule_scan_stop = false; + struct cfg80211_sched_scan_request *sched_scan_req = + rcu_dereference(rdev->sched_scan_req); + + if (sched_scan_req && notify->portid && + sched_scan_req->owner_nlportid == notify->portid) + schedule_scan_stop = true; list_for_each_entry_rcu(wdev, &rdev->wdev_list, list) { cfg80211_mlme_unregister_socket(wdev, notify->portid); @@ -11979,6 +11989,12 @@ static int nl80211_netlink_notify(struct notifier_block * nb, spin_unlock(&rdev->destroy_list_lock); schedule_work(&rdev->destroy_work); } + } else if (schedule_scan_stop) { + sched_scan_req->owner_nlportid = 0; + + if (rdev->ops->sched_scan_stop && + rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + schedule_work(&rdev->sched_scan_stop_wk); } } From a5450400a5e9230cf23639aad92dea54f32a527d Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Mon, 21 Sep 2015 17:59:37 +0800 Subject: [PATCH 002/420] CHROMIUM: usb: gadget: f_mtp: Declare unused functions as maybe_unused Without this, we get these 2 warnings: v3.18/drivers/usb/gadget/function/f_mtp.c:1289:12: error: 'mtp_bind_config' 
defined but not used [-Werror=unused-function] static int mtp_bind_config(struct usb_configuration *c, bool ptp_config) ^ v3.18/drivers/usb/gadget/function/f_mtp.c:1372:12: error: 'mtp_setup' defined but not used [-Werror=unused-function] static int mtp_setup(void) ^ cc1: all warnings being treated as errors Arguably, the functions could be removed, but it looks like the intent of drivers/usb/gadget/android.c is to include these, so leave them be. BUG=chromium:534254 TEST=emerge-smaug chromeos-kernel-3_18 Change-Id: Ie1b10d91b69b75ab2543e939e51fa9d74a64b950 Signed-off-by: Nicolas Boichat Reviewed-on: https://chromium-review.googlesource.com/300705 Reviewed-by: Benson Leung --- drivers/usb/gadget/function/f_mtp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index 436c12c1d881f4..8dea1b3c278dcf 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -1303,7 +1303,8 @@ static void mtp_function_disable(struct usb_function *f) VDBG(cdev, "%s disabled\n", dev->function.name); } -static int mtp_bind_config(struct usb_configuration *c, bool ptp_config) +static __maybe_unused int mtp_bind_config(struct usb_configuration *c, + bool ptp_config) { struct mtp_dev *dev = _mtp_dev; int ret = 0; @@ -1386,7 +1387,7 @@ static int __mtp_setup(struct mtp_instance *fi_mtp) return ret; } -static int mtp_setup(void) +static __maybe_unused int mtp_setup(void) { return __mtp_setup(NULL); } From dd5d4309cc78b375592a8e1ca4914ee3906df0d0 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 21 Mar 2016 10:55:23 -0700 Subject: [PATCH 003/420] ANDROID: dm: Mounting root as linear device when verity disabled This CL makes android-verity target to be added as linear dm device if when bootloader is unlocked and verity is disabled. 
Bug: 27175947 Change-Id: Ic41ca4b8908fb2777263799cf3a3e25934d70f18 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 128 +++++++++++++++++++++++++++------ drivers/md/dm-android-verity.h | 5 ++ drivers/md/dm-linear.c | 2 +- 3 files changed, 112 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index aeb5045830d960..f6ddbee5e2d356 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -43,6 +44,25 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static bool target_added; +static bool verity_enabled = true; +struct dentry *debug_dir; +static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv); + +static struct target_type android_verity_target = { + .name = "android-verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = android_verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + static int __init verified_boot_state_param(char *line) { strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate)); @@ -549,6 +569,32 @@ static inline bool test_mult_overflow(sector_t a, u32 b) return a > r; } +static int add_as_linear_device(struct dm_target *ti, char *dev) +{ + /*Move to linear mapping defines*/ + char *linear_table_args[DM_LINEAR_ARGS] = {dev, + DM_LINEAR_TARGET_OFFSET}; + int err = 0; + + android_verity_target.dtr = linear_target.dtr, + android_verity_target.map = linear_target.map, + android_verity_target.status = linear_target.status, + android_verity_target.ioctl = linear_target.ioctl, + android_verity_target.merge = linear_target.merge, + android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.io_hints = NULL; + + err = linear_target.ctr(ti, DM_LINEAR_ARGS, linear_table_args); + + if (!err) { + DMINFO("Added android-verity as a linear target"); + target_added = true; + } else + DMERR("Failed to add android-verity as linear target"); + + return err; +} + /* * Target parameters: * Key id of the public key in the system keyring. @@ -613,21 +659,27 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (err == VERITY_STATE_DISABLE) { DMERR("Mounting root with verity disabled"); - return -EINVAL; + verity_enabled = false; + /* we would still have to parse the args to figure out + * the data blocks size. Or may be could map the entire + * partition similar to mounting the device. 
+ */ } else if (err) { DMERR("Verity header handle error"); handle_error(); goto free_metadata; } - err = verify_verity_signature(key_id, metadata); + if (!verity_enabled) { + err = verify_verity_signature(key_id, metadata); - if (err) { - DMERR("Signature verification failed"); - handle_error(); - goto free_metadata; - } else - DMINFO("Signature verification success"); + if (err) { + DMERR("Signature verification failed"); + handle_error(); + goto free_metadata; + } else + DMINFO("Signature verification success"); + } table_ptr = metadata->verity_table; @@ -683,6 +735,12 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* update target length */ ti->len = data_sectors; + /* Setup linear target and free */ + if (!verity_enabled) { + err = add_as_linear_device(ti, argv[1]); + goto free_metadata; + } + /*substitute data_dev and hash_dev*/ verity_table_args[1] = argv[1]; verity_table_args[2] = argv[1]; @@ -730,6 +788,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) err = verity_ctr(ti, no_of_args, verity_table_args); + if (err) + DMERR("android-verity failed to mount as verity target"); + else { + target_added = true; + DMINFO("android-verity mounted as verity target"); + } + free_metadata: kfree(metadata->header); kfree(metadata->verity_table); @@ -737,33 +802,52 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return err; } -static struct target_type android_verity_target = { - .name = "android-verity", - .version = {1, 0, 0}, - .module = THIS_MODULE, - .ctr = android_verity_ctr, - .dtr = verity_dtr, - .map = verity_map, - .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, - .iterate_devices = verity_iterate_devices, - .io_hints = verity_io_hints, -}; - static int __init dm_android_verity_init(void) { int r; + struct dentry *file; r = dm_register_target(&android_verity_target); if (r < 0) DMERR("register failed %d", r); + /* Tracks the status of the last added target */ + debug_dir = debugfs_create_dir("android_verity", NULL); + + if (IS_ERR_OR_NULL(debug_dir)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + goto end; + } + + file = debugfs_create_bool("target_added", S_IRUGO, debug_dir, + (u32 *)&target_added); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + goto end; + } + + file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir, + (u32 *)&verity_enabled); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + } + +end: return r; } static void __exit dm_android_verity_exit(void) { + if (!IS_ERR_OR_NULL(debug_dir)) + debugfs_remove_recursive(debug_dir); + dm_unregister_target(&android_verity_target); } diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 11477ffd2243e5..fe53863c664b46 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -44,6 +44,10 @@ #define VERITY_DEBUG 0 #define DM_MSG_PREFIX "android-verity" + +#define DM_LINEAR_ARGS 2 +#define DM_LINEAR_TARGET_OFFSET "0" + /* * There can be two formats. 
* if fec is present @@ -89,4 +93,5 @@ struct bio_read { int number_of_pages; }; +extern struct target_type linear_target; #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 53e848c1093936..73285561627a64 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -153,7 +153,7 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -static struct target_type linear_target = { +struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, From 7a3b0016087499ded89983eb670ca57d5c502ba2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 28 Mar 2016 15:00:20 -0700 Subject: [PATCH 004/420] sdcardfs: Remove unused code Change-Id: Ie97cba27ce44818ac56cfe40954f164ad44eccf6 --- fs/sdcardfs/main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index fa11a0458b8441..a6522286d7314b 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -54,7 +54,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, char *p; substring_t args[MAX_OPT_ARGS]; int option; - char *string_option; /* by default, we use AID_MEDIA_RW as uid, gid */ opts->fs_low_uid = AID_MEDIA_RW; @@ -117,7 +116,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, break; /* unknown option */ default: -invalid_option: if (!silent) { printk( KERN_ERR "Unrecognized mount option \"%s\" " "or missing value", p); From 7149555b52f14cab21e2bbb966ca13a103a859fd Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 28 Mar 2016 16:00:34 -0700 Subject: [PATCH 005/420] sdcardfs: remove unneeded __init and __exit Change-Id: I2a2d45d52f891332174c3000e8681c5167c1564f --- fs/sdcardfs/packagelist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 368f257663065f..fead71eac95f29 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -395,7 +395,7 @@ static struct configfs_subsystem sdcardfs_packages_subsys = { }, }; -static int __init configfs_sdcardfs_init(void) +static int configfs_sdcardfs_init(void) { int ret; struct configfs_subsystem *subsys = &sdcardfs_packages_subsys; @@ -411,7 +411,7 @@ static int __init configfs_sdcardfs_init(void) return ret; } -static void __exit configfs_sdcardfs_exit(void) +static void configfs_sdcardfs_exit(void) { configfs_unregister_subsystem(&sdcardfs_packages_subsys); } From ec61069bb2d5e07515d670736bc4bcdb651896b1 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 30 Mar 2016 14:10:13 -0700 Subject: [PATCH 006/420] ANDROID: dm verity fec: add sysfs attribute fec/corrected Add a sysfs entry that allows user space to determine whether dm-verity has come across correctable errors on the underlying block device. 
Bug: 22655252 Bug: 27928374 Change-Id: I80547a2aa944af2fb9ffde002650482877ade31b Signed-off-by: Sami Tolvanen (cherry picked from commit 7911fad5f0a2cf5afc2215657219a21e6630e001) --- drivers/md/dm-verity-fec.c | 45 +++++++++++++++++++++++++++++++++++++- drivers/md/dm-verity-fec.h | 3 +++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 1cc10c4de70101..ad10d6d8ed28dd 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -11,6 +11,7 @@ #include "dm-verity-fec.h" #include +#include #define DM_MSG_PREFIX "verity-fec" @@ -175,9 +176,11 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, (unsigned long long)rsb, r); - else if (r > 0) + else if (r > 0) { DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", v->data_dev->name, (unsigned long long)rsb, r); + atomic_add_unless(&v->fec->corrected, 1, INT_MAX); + } return r; } @@ -548,6 +551,7 @@ unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, void verity_fec_dtr(struct dm_verity *v) { struct dm_verity_fec *f = v->fec; + struct kobject *kobj = &f->kobj_holder.kobj; if (!verity_fec_is_enabled(v)) goto out; @@ -564,6 +568,12 @@ void verity_fec_dtr(struct dm_verity *v) if (f->dev) dm_put_device(v->ti, f->dev); + + if (kobj->state_initialized) { + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); + } + out: kfree(f); v->fec = NULL; @@ -652,6 +662,27 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, return 0; } +static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct dm_verity_fec *f = container_of(kobj, struct dm_verity_fec, + kobj_holder.kobj); + + return sprintf(buf, "%d\n", atomic_read(&f->corrected)); +} + +static struct kobj_attribute attr_corrected = __ATTR_RO(corrected); + +static struct attribute *fec_attrs[] = { + &attr_corrected.attr, + NULL +}; + +static struct kobj_type fec_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = fec_attrs +}; + /* * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. */ @@ -675,8 +706,10 @@ int verity_fec_ctr_alloc(struct dm_verity *v) */ int verity_fec_ctr(struct dm_verity *v) { + int r; struct dm_verity_fec *f = v->fec; struct dm_target *ti = v->ti; + struct mapped_device *md = dm_table_get_md(ti->table); u64 hash_blocks; if (!verity_fec_is_enabled(v)) { @@ -684,6 +717,16 @@ int verity_fec_ctr(struct dm_verity *v) return 0; } + /* Create a kobject and sysfs attributes */ + init_completion(&f->kobj_holder.completion); + + r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype, + &disk_to_dev(dm_disk(md))->kobj, "%s", "fec"); + if (r) { + ti->error = "Cannot create kobject"; + return r; + } + /* * FEC is computed over data blocks, possible metadata, and * hash blocks. 
In other words, FEC covers total of fec_blocks diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 7fa0298b995e9e..8c4bee052a7354 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -12,6 +12,7 @@ #ifndef DM_VERITY_FEC_H #define DM_VERITY_FEC_H +#include "dm.h" #include "dm-verity.h" #include @@ -48,6 +49,8 @@ struct dm_verity_fec { mempool_t *extra_pool; /* mempool for extra buffers */ mempool_t *output_pool; /* mempool for output */ struct kmem_cache *cache; /* cache for buffers */ + atomic_t corrected; /* corrected errors */ + struct dm_kobject_holder kobj_holder; /* for sysfs attributes */ }; /* per-bio data */ From d207a9861dc3e25c51a55595dbeee0bb58ab24f1 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Thu, 28 Jan 2016 11:12:25 -0800 Subject: [PATCH 007/420] ANDROID: mmc: Add CONFIG_MMC_SIMULATE_MAX_SPEED When CONFIG_MMC_SIMULATE_MAX_SPEED is enabled, Expose max_read_speed, max_write_speed and cache_size default module parameters and sysfs controls to simulate a slow eMMC device. Default values are 0 (off), 0 (off) and 4 MB respectively. Signed-off-by: Mark Salyzyn Bug: 26976972 Change-Id: I342bfbd8b85f9b790e3f0e1e4e51a900ae07e05d --- Documentation/block/00-INDEX | 6 + Documentation/block/mmc-max-speed.txt | 38 ++++ drivers/mmc/card/Kconfig | 12 ++ drivers/mmc/card/block.c | 300 ++++++++++++++++++++++++++ drivers/mmc/card/queue.h | 8 + 5 files changed, 364 insertions(+) create mode 100644 Documentation/block/mmc-max-speed.txt diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index e840b47613f78f..bc5148757edb5d 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -26,3 +26,9 @@ switching-sched.txt - Switching I/O schedulers at runtime writeback_cache_control.txt - Control of volatile write back caches +mmc-max-speed.txt + - eMMC layer speed simulation, related to /sys/block/mmcblk*/ + attributes: + max_read_speed + max_write_speed + cache_size diff --git a/Documentation/block/mmc-max-speed.txt b/Documentation/block/mmc-max-speed.txt new file mode 100644 index 00000000000000..3f052b9fb99996 --- /dev/null +++ b/Documentation/block/mmc-max-speed.txt @@ -0,0 +1,38 @@ +eMMC Block layer simulation speed controls in /sys/block/mmcblk*/ +=============================================== + +Turned on with CONFIG_MMC_SIMULATE_MAX_SPEED which enables MMC device speed +limiting. Used to test and simulate the behavior of the system when +confronted with a slow MMC. + +Enables max_read_speed, max_write_speed and cache_size attributes and module +default parameters to control the write or read maximum KB/second speed +behaviors. + +NB: There is room for improving the algorithm for aspects tied directly to +eMMC specific behavior. For instance, wear leveling and stalls from an +exhausted erase pool. We would expect that if there was a need to provide +similar speed simulation controls to other types of block devices, aspects of +their behavior are modelled separately (e.g. head seek times, heat assist, +shingling and rotational latency). + +/sys/block/mmcblk0/max_read_speed: + +Number of KB/second reads allowed to the block device. Used to test and +simulate the behavior of the system when confronted with a slow reading MMC. +Set to 0 or "off" to place no speed limit. + +/sys/block/mmcblk0/max_write_speed: + +Number of KB/second writes allowed to the block device. Used to test and +simulate the behavior of the system when confronted with a slow writing MMC. +Set to 0 or "off" to place no speed limit. 
+ +/sys/block/mmcblk0/cache_size: + +Number of MB of high speed memory or high speed SLC cache expected on the +eMMC device being simulated. Used to help simulate the write-back behavior +more accurately. The assumption is the cache has no delay, but draws down +in the background to the MLC/TLC primary store at the max_write_speed rate. +Any write speed delays will show up when the cache is full, or when an I/O +request to flush is issued. diff --git a/drivers/mmc/card/Kconfig b/drivers/mmc/card/Kconfig index 5562308699bc29..6142ec1b9dfbbc 100644 --- a/drivers/mmc/card/Kconfig +++ b/drivers/mmc/card/Kconfig @@ -68,3 +68,15 @@ config MMC_TEST This driver is only of interest to those developing or testing a host driver. Most people should say N here. + +config MMC_SIMULATE_MAX_SPEED + bool "Turn on maximum speed control per block device" + depends on MMC_BLOCK + help + Say Y here to enable MMC device speed limiting. Used to test and + simulate the behavior of the system when confronted with a slow MMC. + + Enables max_read_speed, max_write_speed and cache_size attributes to + control the write or read maximum KB/second speed behaviors. + + If unsure, say N here. diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 30a4ac58e8c824..7b3152d621fbd0 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -285,6 +285,250 @@ static ssize_t force_ro_store(struct device *dev, struct device_attribute *attr, return ret; } +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + +static int max_read_speed, max_write_speed, cache_size = 4; + +module_param(max_read_speed, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(max_read_speed, "maximum KB/s read speed 0=off"); +module_param(max_write_speed, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(max_write_speed, "maximum KB/s write speed 0=off"); +module_param(cache_size, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(cache_size, "MB high speed memory or SLC cache"); + +/* + * helper macros and expectations: + * size - unsigned long number of bytes + * jiffies - unsigned long HZ timestamp difference + * speed - unsigned KB/s transfer rate + */ +#define size_and_speed_to_jiffies(size, speed) \ + ((size) * HZ / (speed) / 1024UL) +#define jiffies_and_speed_to_size(jiffies, speed) \ + (((speed) * (jiffies) * 1024UL) / HZ) +#define jiffies_and_size_to_speed(jiffies, size) \ + ((size) * HZ / (jiffies) / 1024UL) + +/* Limits to report warning */ +/* jiffies_and_size_to_speed(10*HZ, queue_max_hw_sectors(q) * 512UL) ~ 25 */ +#define MIN_SPEED(q) 250 /* 10 times faster than a floppy disk */ +#define MAX_SPEED(q) jiffies_and_size_to_speed(1, queue_max_sectors(q) * 512UL) + +#define speed_valid(speed) ((speed) > 0) + +static const char off[] = "off\n"; + +static int max_speed_show(int speed, char *buf) +{ + if (speed) + return scnprintf(buf, PAGE_SIZE, "%uKB/s\n", speed); + else + return scnprintf(buf, PAGE_SIZE, off); +} + +static int max_speed_store(const char *buf, struct request_queue *q) +{ + unsigned int limit, set = 0; + + if (!strncasecmp(off, buf, sizeof(off) - 2)) + return set; + if (kstrtouint(buf, 0, &set) || (set > INT_MAX)) + return -EINVAL; + if (set == 0) + return set; + limit = MAX_SPEED(q); + if (set > limit) + pr_warn("max speed %u ineffective above %u\n", set, limit); + limit = MIN_SPEED(q); + if (set < limit) + pr_warn("max speed %u painful below %u\n", set, limit); + return set; +} + +static ssize_t max_write_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = 
mmc_blk_get(dev_to_disk(dev)); + int ret = max_speed_show(atomic_read(&md->queue.max_write_speed), buf); + + mmc_blk_put(md); + return ret; +} + +static ssize_t max_write_speed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int set = max_speed_store(buf, md->queue.queue); + + if (set < 0) { + mmc_blk_put(md); + return set; + } + + atomic_set(&md->queue.max_write_speed, set); + mmc_blk_put(md); + return count; +} + +static const DEVICE_ATTR(max_write_speed, S_IRUGO | S_IWUSR, + max_write_speed_show, max_write_speed_store); + +static ssize_t max_read_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int ret = max_speed_show(atomic_read(&md->queue.max_read_speed), buf); + + mmc_blk_put(md); + return ret; +} + +static ssize_t max_read_speed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int set = max_speed_store(buf, md->queue.queue); + + if (set < 0) { + mmc_blk_put(md); + return set; + } + + atomic_set(&md->queue.max_read_speed, set); + mmc_blk_put(md); + return count; +} + +static const DEVICE_ATTR(max_read_speed, S_IRUGO | S_IWUSR, + max_read_speed_show, max_read_speed_store); + +static ssize_t cache_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + struct mmc_queue *mq = &md->queue; + int cache_size = atomic_read(&mq->cache_size); + int ret; + + if (!cache_size) + ret = scnprintf(buf, PAGE_SIZE, off); + else { + int speed = atomic_read(&mq->max_write_speed); + + if (!speed_valid(speed)) + ret = scnprintf(buf, PAGE_SIZE, "%uMB\n", cache_size); + else { /* We accept race between cache_jiffies and cache_used */ + unsigned long size = jiffies_and_speed_to_size( + jiffies - mq->cache_jiffies, speed); + long used = atomic_long_read(&mq->cache_used); + + if (size >= used) + size = 0; + else + size = (used - size) * 100 / cache_size + / 1024UL / 1024UL; + + ret = scnprintf(buf, PAGE_SIZE, "%uMB %lu%% used\n", + cache_size, size); + } + } + + mmc_blk_put(md); + return ret; +} + +static ssize_t cache_size_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md; + unsigned int set = 0; + + if (strncasecmp(off, buf, sizeof(off) - 2) + && (kstrtouint(buf, 0, &set) || (set > INT_MAX))) + return -EINVAL; + + md = mmc_blk_get(dev_to_disk(dev)); + atomic_set(&md->queue.cache_size, set); + mmc_blk_put(md); + return count; +} + +static const DEVICE_ATTR(cache_size, S_IRUGO | S_IWUSR, + cache_size_show, cache_size_store); + +/* correct for write-back */ +static long mmc_blk_cache_used(struct mmc_queue *mq, unsigned long waitfor) +{ + long used = 0; + int speed = atomic_read(&mq->max_write_speed); + + if (speed_valid(speed)) { + unsigned long size = jiffies_and_speed_to_size( + waitfor - mq->cache_jiffies, speed); + used = atomic_long_read(&mq->cache_used); + + if (size >= used) + used = 0; + else + used -= size; + } + + atomic_long_set(&mq->cache_used, used); + mq->cache_jiffies = waitfor; + + return used; +} + +static void mmc_blk_simulate_delay( + struct mmc_queue *mq, + struct request *req, + unsigned long waitfor) +{ + int max_speed; + + if (!req) + return; + + max_speed = (rq_data_dir(req) == READ) + ? 
atomic_read(&mq->max_read_speed) + : atomic_read(&mq->max_write_speed); + if (speed_valid(max_speed)) { + unsigned long bytes = blk_rq_bytes(req); + + if (rq_data_dir(req) != READ) { + int cache_size = atomic_read(&mq->cache_size); + + if (cache_size) { + unsigned long size = cache_size * 1024L * 1024L; + long used = mmc_blk_cache_used(mq, waitfor); + + used += bytes; + atomic_long_set(&mq->cache_used, used); + bytes = 0; + if (used > size) + bytes = used - size; + } + } + waitfor += size_and_speed_to_jiffies(bytes, max_speed); + if (time_is_after_jiffies(waitfor)) { + long msecs = jiffies_to_msecs(waitfor - jiffies); + + if (likely(msecs > 0)) + msleep(msecs); + } + } +} + +#else + +#define mmc_blk_simulate_delay(mq, req, waitfor) + +#endif + static int mmc_blk_open(struct block_device *bdev, fmode_t mode) { struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk); @@ -1150,6 +1394,23 @@ static int mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req) if (ret) ret = -EIO; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + else if (atomic_read(&mq->cache_size)) { + long used = mmc_blk_cache_used(mq, jiffies); + + if (used) { + int speed = atomic_read(&mq->max_write_speed); + + if (speed_valid(speed)) { + unsigned long msecs = jiffies_to_msecs( + size_and_speed_to_jiffies( + used, speed)); + if (msecs) + msleep(msecs); + } + } + } +#endif blk_end_request_all(req, ret); return ret ? 0 : 1; @@ -1835,6 +2096,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) struct mmc_async_req *areq; const u8 packed_nr = 2; u8 reqs = 0; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + unsigned long waitfor = jiffies; +#endif if (!rqc && !mq->mqrq_prev->req) return 0; @@ -1885,6 +2149,8 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) */ mmc_blk_reset_success(md, type); + mmc_blk_simulate_delay(mq, rqc, waitfor); + if (mmc_packed_cmd(mq_rq->cmd_type)) { ret = mmc_blk_end_packed_req(mq_rq); break; @@ -2300,6 +2566,14 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md) card->ext_csd.boot_ro_lockable) device_remove_file(disk_to_dev(md->disk), &md->power_ro_lock); +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + device_remove_file(disk_to_dev(md->disk), + &dev_attr_max_write_speed); + device_remove_file(disk_to_dev(md->disk), + &dev_attr_max_read_speed); + device_remove_file(disk_to_dev(md->disk), + &dev_attr_cache_size); +#endif del_gendisk(md->disk); } @@ -2335,6 +2609,24 @@ static int mmc_add_disk(struct mmc_blk_data *md) ret = device_create_file(disk_to_dev(md->disk), &md->force_ro); if (ret) goto force_ro_fail; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + atomic_set(&md->queue.max_write_speed, max_write_speed); + ret = device_create_file(disk_to_dev(md->disk), + &dev_attr_max_write_speed); + if (ret) + goto max_write_speed_fail; + atomic_set(&md->queue.max_read_speed, max_read_speed); + ret = device_create_file(disk_to_dev(md->disk), + &dev_attr_max_read_speed); + if (ret) + goto max_read_speed_fail; + atomic_set(&md->queue.cache_size, cache_size); + atomic_long_set(&md->queue.cache_used, 0); + md->queue.cache_jiffies = jiffies; + ret = device_create_file(disk_to_dev(md->disk), &dev_attr_cache_size); + if (ret) + goto cache_size_fail; +#endif if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && card->ext_csd.boot_ro_lockable) { @@ -2359,6 +2651,14 @@ static int mmc_add_disk(struct mmc_blk_data *md) return ret; power_ro_lock_fail: +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + device_remove_file(disk_to_dev(md->disk), &dev_attr_cache_size); +cache_size_fail: + 
device_remove_file(disk_to_dev(md->disk), &dev_attr_max_read_speed); +max_read_speed_fail: + device_remove_file(disk_to_dev(md->disk), &dev_attr_max_write_speed); +max_write_speed_fail: +#endif device_remove_file(disk_to_dev(md->disk), &md->force_ro); force_ro_fail: del_gendisk(md->disk); diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h index 5752d50049a34c..e78dffc53c96c3 100644 --- a/drivers/mmc/card/queue.h +++ b/drivers/mmc/card/queue.h @@ -57,6 +57,14 @@ struct mmc_queue { struct mmc_queue_req mqrq[2]; struct mmc_queue_req *mqrq_cur; struct mmc_queue_req *mqrq_prev; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + atomic_t max_write_speed; + atomic_t max_read_speed; + atomic_t cache_size; + /* i/o tracking */ + atomic_long_t cache_used; + unsigned long cache_jiffies; +#endif }; extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, From a6d1b091f40b25d97849487e29ec097bc5f568dd Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 5 Apr 2016 11:18:16 -0700 Subject: [PATCH 008/420] ANDROID: dm: rename dm-linear methods for dm-android-verity This keeps linear_target as static variable and just exposes the linear target methods for android-verity Cherry-picked: https://android-review.googlesource.com/#/c/212858 Change-Id: I4a377e417b00afd9ecccdb3e605fea31a7df112e Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 14 +++++++------- drivers/md/dm-android-verity.h | 12 ++++++++++++ drivers/md/dm-linear.c | 30 +++++++++++++++--------------- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index f6ddbee5e2d356..b7e059595f75d0 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -576,15 +576,15 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) DM_LINEAR_TARGET_OFFSET}; int err = 0; - android_verity_target.dtr = linear_target.dtr, - android_verity_target.map = linear_target.map, - android_verity_target.status = linear_target.status, - android_verity_target.ioctl = linear_target.ioctl, - android_verity_target.merge = linear_target.merge, - android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.dtr = dm_linear_dtr, + android_verity_target.map = dm_linear_map, + android_verity_target.status = dm_linear_status, + android_verity_target.ioctl = dm_linear_ioctl, + android_verity_target.merge = dm_linear_merge, + android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; - err = linear_target.ctr(ti, DM_LINEAR_ARGS, linear_table_args); + err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args); if (!err) { DMINFO("Added android-verity as a linear target"); diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index fe53863c664b46..efb7965248968c 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -94,4 +94,16 @@ struct bio_read { }; extern struct target_type linear_target; + +extern void dm_linear_dtr(struct dm_target *ti); +extern int dm_linear_map(struct dm_target *ti, struct bio *bio); +extern void dm_linear_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen); +extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, + unsigned long arg); +extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size); +extern int dm_linear_iterate_devices(struct 
dm_target *ti, + iterate_devices_callout_fn fn, void *data); +extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv); #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 73285561627a64..d42f1093ad60fc 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -25,7 +25,7 @@ struct linear_c { /* * Construct a linear mapping: */ -static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct linear_c *lc; unsigned long long tmp; @@ -64,7 +64,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } -static void linear_dtr(struct dm_target *ti) +void dm_linear_dtr(struct dm_target *ti) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -89,14 +89,14 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) linear_map_sector(ti, bio->bi_iter.bi_sector); } -static int linear_map(struct dm_target *ti, struct bio *bio) +int dm_linear_map(struct dm_target *ti, struct bio *bio) { linear_map_bio(ti, bio); return DM_MAPIO_REMAPPED; } -static void linear_status(struct dm_target *ti, status_type_t type, +void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -113,7 +113,7 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_ioctl(struct dm_target *ti, unsigned int cmd, +int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -130,7 +130,7 @@ static int linear_ioctl(struct dm_target *ti, unsigned int cmd, return r ? 
: __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); } -static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, +int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, struct bio_vec *biovec, int max_size) { struct linear_c *lc = ti->private; @@ -145,7 +145,7 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } -static int linear_iterate_devices(struct dm_target *ti, +int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct linear_c *lc = ti->private; @@ -153,17 +153,17 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -struct target_type linear_target = { +static struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, - .ctr = linear_ctr, - .dtr = linear_dtr, - .map = linear_map, - .status = linear_status, - .ioctl = linear_ioctl, - .merge = linear_merge, - .iterate_devices = linear_iterate_devices, + .ctr = dm_linear_ctr, + .dtr = dm_linear_dtr, + .map = dm_linear_map, + .status = dm_linear_status, + .ioctl = dm_linear_ioctl, + .merge = dm_linear_merge, + .iterate_devices = dm_linear_iterate_devices, }; int __init dm_linear_init(void) From db273e68b4fbc11f2c7e36aea2d1ac8ff2fb54de Mon Sep 17 00:00:00 2001 From: Rom Lemarchand Date: Thu, 7 Apr 2016 07:19:34 -0700 Subject: [PATCH 009/420] android: base-cfg: enable CONFIG_QUOTA Bug: 28032718 Change-Id: I7cb6b641f72085e69b90dca11d2ea68adcd02390 (cherry picked from commit e1b53a388e9cfcf870520a6899a37456cf1ae2c6) --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 2e4756c83e2555..85e4a932f5ef67 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -137,6 +137,7 @@ CONFIG_PPP_BSDCOMP=y CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y +CONFIG_QUOTA=y CONFIG_RESOURCE_COUNTERS=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y From 93790618c1050fa103a97fe04b0d665e060ca9ff Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Tue, 5 Apr 2016 13:06:27 -0700 Subject: [PATCH 010/420] BACKPORT: selinux: restrict kernel module loading Backport notes: Backport uses kernel_module_from_file not kernel_read_file hook. kernel_read_file replaced kernel_module_from_file in the 4.6 kernel. There are no inode_security_() helper functions (also introduced in 4.6) so the inode lookup is done using the file_inode() helper which is standard for kernel version < 4.6. (Cherry picked from commit 61d612ea731e57dc510472fb746b55cdc017f371) Utilize existing kernel_read_file hook on kernel module load. Add module_load permission to the system class. Enforces restrictions on kernel module origin when calling the finit_module syscall. The hook checks that source type has permission module_load for the target type. Example for finit_module: allow foo bar_file:system module_load; Similarly restrictions are enforced on kernel module loading when calling the init_module syscall. The hook checks that source type has permission module_load with itself as the target object because the kernel module is sourced from the calling process. 
Example for init_module: allow foo foo:system module_load; Bug: 27824855 Change-Id: I64bf3bd1ab2dc735321160642dc6bbfa996f8068 Signed-off-by: Jeff Vander Stoep Signed-off-by: Paul Moore --- security/selinux/hooks.c | 33 +++++++++++++++++++++++++++++ security/selinux/include/classmap.h | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 16d0ebb948435b..670a9b35eab4b3 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3663,6 +3663,38 @@ static int selinux_kernel_module_request(char *kmod_name) SYSTEM__MODULE_REQUEST, &ad); } +static int selinux_kernel_module_from_file(struct file *file) +{ + struct common_audit_data ad; + struct inode_security_struct *isec; + struct file_security_struct *fsec; + struct inode *inode; + u32 sid = current_sid(); + int rc; + + /* init_module */ + if (file == NULL) + return avc_has_perm(sid, sid, SECCLASS_SYSTEM, + SYSTEM__MODULE_LOAD, NULL); + + /* finit_module */ + ad.type = LSM_AUDIT_DATA_PATH; + ad.u.path = file->f_path; + + inode = file_inode(file); + isec = inode->i_security; + fsec = file->f_security; + + if (sid != fsec->sid) { + rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); + if (rc) + return rc; + } + + return avc_has_perm(sid, isec->sid, SECCLASS_SYSTEM, + SYSTEM__MODULE_LOAD, &ad); +} + static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return current_has_perm(p, PROCESS__SETPGID); @@ -5990,6 +6022,7 @@ static struct security_operations selinux_ops = { .kernel_act_as = selinux_kernel_act_as, .kernel_create_files_as = selinux_kernel_create_files_as, .kernel_module_request = selinux_kernel_module_request, + .kernel_module_from_file = selinux_kernel_module_from_file, .task_setpgid = selinux_task_setpgid, .task_getpgid = selinux_task_getpgid, .task_getsid = selinux_task_getsid, diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index bdc9325a465910..3c7b236005e896 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -32,7 +32,7 @@ struct security_class_mapping secclass_map[] = { "setsockcreate", NULL } }, { "system", { "ipc_info", "syslog_read", "syslog_mod", - "syslog_console", "module_request", NULL } }, + "syslog_console", "module_request", "module_load", NULL } }, { "capability", { "chown", "dac_override", "dac_read_search", "fowner", "fsetid", "kill", "setgid", "setuid", "setpcap", From c37b1603e6beff9a831f0470c2cc4a1522f4fdfb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 24 Feb 2015 16:30:21 +0000 Subject: [PATCH 011/420] UPSTREAM: arm64: Fix text patching logic when using fixmap Patch 2f896d586610 ("arm64: use fixmap for text patching") changed the way we patch the kernel text, using a fixmap when the kernel or modules are flagged as read only. Unfortunately, a flaw in the logic makes it fall over when patching modules without CONFIG_DEBUG_SET_MODULE_RONX enabled: [...] [ 32.032636] Call trace: [ 32.032716] [] __copy_to_user+0x2c/0x60 [ 32.032837] [] __aarch64_insn_write+0x94/0xf8 [ 32.033027] [] aarch64_insn_patch_text_nosync+0x18/0x58 [ 32.033200] [] ftrace_modify_code+0x58/0x84 [ 32.033363] [] ftrace_make_nop+0x3c/0x58 [ 32.033532] [] ftrace_process_locs+0x3d0/0x5c8 [ 32.033709] [] ftrace_module_init+0x28/0x34 [ 32.033882] [] load_module+0xbb8/0xfc4 [ 32.034044] [] SyS_finit_module+0x94/0xc4 [...] 
This is triggered by the use of virt_to_page() on a module address, which ends to pointing to Nowhereland if you're lucky, or corrupt your precious data if not. This patch fixes the logic by mimicking what is done on arm: - If we're patching a module and CONFIG_DEBUG_SET_MODULE_RONX is set, use vmalloc_to_page(). - If we're patching the kernel and CONFIG_DEBUG_RODATA is set, use virt_to_page(). - Otherwise, use the provided address, as we can write to it directly. Tested on 4.0-rc1 as a KVM guest. Reported-by: Richard W.M. Jones Reviewed-by: Kees Cook Acked-by: Mark Rutland Acked-by: Laura Abbott Tested-by: Richard W.M. Jones Cc: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Catalin Marinas (cherry picked from commit f6242cac10427c546271050b31c891a078e490cd) Reported-by: Jon Medhurst Signed-off-by: Amit Pundir --- arch/arm64/kernel/insn.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 27d4864577e5d4..c8eca88f12e6b2 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -87,8 +87,10 @@ static void __kprobes *patch_map(void *addr, int fixmap) if (module && IS_ENABLED(CONFIG_DEBUG_SET_MODULE_RONX)) page = vmalloc_to_page(addr); - else + else if (!module && IS_ENABLED(CONFIG_DEBUG_RODATA)) page = virt_to_page(addr); + else + return addr; BUG_ON(!page); set_fixmap(fixmap, page_to_phys(page)); From 0fc44626ac8f4ae37d9f2e7c44687b261320d274 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 22 Apr 2016 00:00:14 -0700 Subject: [PATCH 012/420] vfs: change d_canonical_path to take two paths bug: 23904372 Change-Id: I4a686d64b6de37decf60019be1718e1d820193e6 Signed-off-by: Daniel Rosenberg --- fs/notify/inotify/inotify_user.c | 2 +- fs/sdcardfs/dentry.c | 6 +++++- include/linux/dcache.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 8637f33b826d56..ed833baf73db96 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -735,7 +735,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, /* support stacked filesystems */ if(path.dentry && path.dentry->d_op) { if (path.dentry->d_op->d_canonical_path) { - path.dentry->d_op->d_canonical_path(path.dentry, &alteredpath); + path.dentry->d_op->d_canonical_path(&path, &alteredpath); canonical_path = &alteredpath; path_put(&path); } diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index ba165ef11e272b..971928ab6c216e 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -172,11 +172,15 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, return 1; } +static void sdcardfs_canonical_path(const struct path *path, struct path *actual_path) { + sdcardfs_get_real_lower(path->dentry, actual_path); +} + const struct dentry_operations sdcardfs_ci_dops = { .d_revalidate = sdcardfs_d_revalidate, .d_release = sdcardfs_d_release, .d_hash = sdcardfs_hash_ci, .d_compare = sdcardfs_cmp_ci, - .d_canonical_path = sdcardfs_get_real_lower, + .d_canonical_path = sdcardfs_canonical_path, }; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 654b4db7f68034..f62a1ce6fae1be 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -160,7 +160,7 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); - void (*d_canonical_path)(const struct dentry *, struct path *); + void 
(*d_canonical_path)(const struct path *, struct path *); } ____cacheline_aligned; /* From e780170a456178a676720f1b846332cd0cf0bd00 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 22 Apr 2016 00:00:48 -0700 Subject: [PATCH 013/420] fuse: Add support for d_canonical_path Allows FUSE to report to inotify that it is acting as a layered filesystem. The userspace component returns a string representing the location of the underlying file. If the string cannot be resolved into a path, the top level path is returned instead. bug: 23904372 Change-Id: Iabdca0bbedfbff59e9c820c58636a68ef9683d9f Signed-off-by: Daniel Rosenberg --- fs/fuse/dev.c | 5 +++++ fs/fuse/dir.c | 45 +++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 1 + 4 files changed, 54 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 54792719c7ea3f..fcbefeb3b52e86 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1865,6 +1866,10 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, spin_unlock(&fc->lock); err = copy_out_args(cs, &req->out, nbytes); + if (req->in.h.opcode == FUSE_CANONICAL_PATH) { + req->out.h.error = kern_path((char *)req->out.args[0].value, 0, + req->canonical_path); + } fuse_copy_finish(cs); spin_lock(&fc->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dbab798f5cafef..2f1513573c6c34 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -277,6 +277,50 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto out; } +/* + * Get the canonical path. Since we must translate to a path, this must be done + * in the context of the userspace daemon, however, the userspace daemon cannot + * look up paths on its own. Instead, we handle the lookup as a special case + * inside of the write request. 
+ */ +static void fuse_dentry_canonical_path(const struct path *path, struct path *canonical_path) { + struct inode *inode = path->dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int err; + char *path_name; + + req = fuse_get_req(fc, 1); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto default_path; + + path_name = (char*)__get_free_page(GFP_KERNEL); + if (!path_name) { + fuse_put_request(fc, req); + goto default_path; + } + + req->in.h.opcode = FUSE_CANONICAL_PATH; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 0; + req->out.numargs = 1; + req->out.args[0].size = PATH_MAX; + req->out.args[0].value = path_name; + req->canonical_path = canonical_path; + req->out.argvar = 1; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + free_page((unsigned long)path_name); + if (!err) + return; +default_path: + canonical_path->dentry = path->dentry; + canonical_path->mnt = path->mnt; + path_get(canonical_path); +} + static int invalid_nodeid(u64 nodeid) { return !nodeid || nodeid == FUSE_ROOT_ID; @@ -284,6 +328,7 @@ static int invalid_nodeid(u64 nodeid) const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_canonical_path = fuse_dentry_canonical_path, }; int fuse_valid_type(int m) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e8e47a6ab5186b..84c341c518b4e1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -351,6 +351,9 @@ struct fuse_req { /** Inode used in the request or NULL */ struct inode *inode; + /** Path used for completing d_canonical_path */ + struct path *canonical_path; + /** AIO control block */ struct fuse_io_priv *io; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 25084a052a1eff..7f6b325e1dd56f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -358,6 +358,7 @@ enum fuse_opcode { FUSE_FALLOCATE = 43, FUSE_READDIRPLUS = 44, FUSE_RENAME2 = 45, + FUSE_CANONICAL_PATH= 2016, /* CUSE specific operations */ CUSE_INIT = 4096, From 8ad66cafaa10e6ba94ff79a8dbc2cc437c6bfe93 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Wed, 23 Mar 2016 13:18:03 -0700 Subject: [PATCH 014/420] usb: dual-role: make stub functions inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_DUAL_ROLE_USB_INTF is disabled but the exported functions are referenced, the build will result in warnings such as: In file included from include/linux/usb/class-dual-role.h:112:13: warning: ‘dual_role_instance_changed’ defined but not used [-Wunused-function] These stub functions should be static inline. 
Change-Id: I5a9ef58dca32306fac5a4c7f28cdaa36fa8ae078 Signed-off-by: Jack Pham (cherry picked from commit 2d152dbb0743526b21d6bbefe097f874c027f860) --- include/linux/usb/class-dual-role.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/usb/class-dual-role.h b/include/linux/usb/class-dual-role.h index af42ed34944a85..c6df2238012e4b 100644 --- a/include/linux/usb/class-dual-role.h +++ b/include/linux/usb/class-dual-role.h @@ -109,18 +109,19 @@ extern int dual_role_property_is_writeable(struct dual_role_phy_instance enum dual_role_property prop); extern void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role); #else /* CONFIG_DUAL_ROLE_USB_INTF */ -static void dual_role_instance_changed(struct dual_role_phy_instance +static inline void dual_role_instance_changed(struct dual_role_phy_instance *dual_role){} -static struct dual_role_phy_instance *__must_check +static inline struct dual_role_phy_instance *__must_check devm_dual_role_instance_register(struct device *parent, const struct dual_role_phy_desc *desc) { return ERR_PTR(-ENOSYS); } -static void devm_dual_role_instance_unregister(struct device *dev, +static inline void devm_dual_role_instance_unregister(struct device *dev, struct dual_role_phy_instance *dual_role){} -static void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role) +static inline void *dual_role_get_drvdata(struct dual_role_phy_instance + *dual_role) { return ERR_PTR(-ENOSYS); } From f3d118221beeaf5804ed14332a8ade0355f05c67 Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Mon, 2 May 2016 17:29:28 +0200 Subject: [PATCH 015/420] dm: fix dm_substitute_devices() When candidate is the last parameter, candidate_end points to the '\0' character and not the DM_FIELD_SEP character. In such a situation, we should not move the candidate_end pointer one character backward. (Cherry-picked from https://android-review.googlesource.com/#/c/221680/) Signed-off-by: Jeremy Compostella Change-Id: Ifc042e07da99fcfdfd46811d4208b39b427a6d99 --- init/do_mounts_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index f521bc5ae24841..ecda58df9a19ed 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -176,7 +176,8 @@ static void __init dm_substitute_devices(char *str, size_t str_len) continue; /* Temporarily terminate with a nul */ - candidate_end--; + if (*candidate_end) + candidate_end--; old_char = *candidate_end; *candidate_end = '\0'; From 3d8d9238142db90c845860ec98aa77733cc47dd6 Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Fri, 15 Apr 2016 13:32:54 +0200 Subject: [PATCH 016/420] ANDROID: dm: use name_to_dev_t This patch makes android_verity_ctr() parse its block device string parameter with name_to_dev_t(). It allows the use of less hardware related block device reference like PARTUUID for instance. 
(Cherry-picked from https://android-review.googlesource.com/#/c/215901) Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf07 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index b7e059595f75d0..9c26cbb5f1797b 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -613,8 +613,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* One for specifying number of opt args and one for mode */ sector_t data_sectors; u32 data_block_size; - unsigned int major, minor, - no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; + unsigned int no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; struct fec_header uninitialized_var(fec); struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; @@ -630,13 +629,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) key_id = argv[0]; strreplace(argv[0], '#', ' '); - if (sscanf(argv[1], "%u:%u%c", &major, &minor, &dummy) == 2) { - dev = MKDEV(major, minor); - if (MAJOR(dev) != major || MINOR(dev) != minor) { - DMERR("Incorrect bdev major minor number"); - handle_error(); - return -EOVERFLOW; - } + dev = name_to_dev_t(argv[1]); + if (!dev) { + DMERR("no dev found for %s", argv[1]); + handle_error(); + return -EINVAL; } DMINFO("key:%s dev:%s", argv[0], argv[1]); From 91bd6b66f3060ae26b9c1f04319394690cc27038 Mon Sep 17 00:00:00 2001 From: Janis Danisevskis Date: Thu, 14 Apr 2016 13:57:03 +0100 Subject: [PATCH 017/420] UPSTREAM: procfs: fixes pthread cross-thread naming if !PR_DUMPABLE The PR_DUMPABLE flag causes the pid related paths of the proc file system to be owned by ROOT. The implementation of pthread_set/getname_np however needs access to /proc//task//comm. If PR_DUMPABLE is false this implementation is locked out. This patch installs a special permission function for the file "comm" that grants read and write access to all threads of the same group regardless of the ownership of the inode. For all other threads the function falls back to the generic inode permission check. Signed-off-by: Janis Danisevskis --- fs/proc/base.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 35eabfe74346c8..1c952a094ccc09 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2829,6 +2829,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; } +/* + * proc_tid_comm_permission is a special permission function exclusively + * used for the node /proc//task//comm. + * It bypasses generic permission checks in the case where a task of the same + * task group attempts to access the node. + * The rational behind this is that glibc and bionic access this node for + * cross thread naming (pthread_set/getname_np(!self)). However, if + * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, + * which locks out the cross thread naming implementation. + * This function makes sure that the node is always accessible for members of + * same thread group. 
+ */ +static int proc_tid_comm_permission(struct inode *inode, int mask) +{ + bool is_same_tgroup; + struct task_struct *task; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + is_same_tgroup = same_thread_group(current, task); + put_task_struct(task); + + if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { + /* This file (/proc//task//comm) can always be + * read or written by the members of the corresponding + * thread group. + */ + return 0; + } + + return generic_permission(inode, mask); +} + +static const struct inode_operations proc_tid_comm_inode_operations = { + .permission = proc_tid_comm_permission, +}; + /* * Tasks */ @@ -2847,7 +2885,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif - REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), + NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, + &proc_tid_comm_inode_operations, + &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif From 8c69da180422faad35003b3996d3adcef54db007 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 10 Jan 2016 22:40:55 -0800 Subject: [PATCH 018/420] UPSTREAM: tty: Fix unsafe ldisc reference via ioctl(TIOCGETD) (cherry pick from commit 5c17c861a357e9458001f021a7afa7aab9937439) ioctl(TIOCGETD) retrieves the line discipline id directly from the ldisc because the line discipline id (c_line) in termios is untrustworthy; userspace may have set termios via ioctl(TCSETS*) without actually changing the line discipline via ioctl(TIOCSETD). However, directly accessing the current ldisc via tty->ldisc is unsafe; the ldisc ptr dereferenced may be stale if the line discipline is changing via ioctl(TIOCSETD) or hangup. Wait for the line discipline reference (just like read() or write()) to retrieve the "current" line discipline id. Cc: Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman Bug: 28409131 Change-Id: If4be351f4d62f07a054b4822dfa743a68811c921 --- drivers/tty/tty_io.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 0508a1d8e4cd73..ca1ae2d90ea8af 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2593,6 +2593,28 @@ static int tiocsetd(struct tty_struct *tty, int __user *p) return ret; } +/** + * tiocgetd - get line discipline + * @tty: tty device + * @p: pointer to user data + * + * Retrieves the line discipline id directly from the ldisc. 
+ * + * Locking: waits for ldisc reference (in case the line discipline + * is changing or the tty is being hungup) + */ + +static int tiocgetd(struct tty_struct *tty, int __user *p) +{ + struct tty_ldisc *ld; + int ret; + + ld = tty_ldisc_ref_wait(tty); + ret = put_user(ld->ops->num, p); + tty_ldisc_deref(ld); + return ret; +} + /** * send_break - performed time break * @tty: device to break on @@ -2807,7 +2829,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCGSID: return tiocgsid(tty, real_tty, p); case TIOCGETD: - return put_user(tty->ldisc->ops->num, (int __user *)p); + return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: From 0bec9701b94804e772aa1f98347651368ede1867 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 16 Dec 2015 13:32:38 -0500 Subject: [PATCH 019/420] UPSTREAM: USB: fix invalid memory access in hub_activate() (cherry pick from commit e50293ef9775c5f1cf3fcc093037dd6a8c5684ea) Commit 8520f38099cc ("USB: change hub initialization sleeps to delayed_work") changed the hub_activate() routine to make part of it run in a workqueue. However, the commit failed to take a reference to the usb_hub structure or to lock the hub interface while doing so. As a result, if a hub is plugged in and quickly unplugged before the work routine can run, the routine will try to access memory that has been deallocated. Or, if the hub is unplugged while the routine is running, the memory may be deallocated while it is in active use. This patch fixes the problem by taking a reference to the usb_hub at the start of hub_activate() and releasing it at the end (when the work is finished), and by locking the hub interface while the work routine is running. It also adds a check at the start of the routine to see if the hub has already been disconnected, in which nothing should be done. Signed-off-by: Alan Stern Reported-by: Alexandru Cornea Tested-by: Alexandru Cornea Fixes: 8520f38099cc ("USB: change hub initialization sleeps to delayed_work") CC: Signed-off-by: Greg Kroah-Hartman Bug: 28712303 Change-Id: I33d3a93968122b8a58911a2edf4cea65d49134b5 --- drivers/usb/core/hub.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index b649fef2e35d4a..1c0367f765d56b 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1030,10 +1030,20 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) unsigned delay; /* Continue a partial initialization */ - if (type == HUB_INIT2) - goto init2; - if (type == HUB_INIT3) + if (type == HUB_INIT2 || type == HUB_INIT3) { + device_lock(hub->intfdev); + + /* Was the hub disconnected while we were waiting? 
*/ + if (hub->disconnected) { + device_unlock(hub->intfdev); + kref_put(&hub->kref, hub_release); + return; + } + if (type == HUB_INIT2) + goto init2; goto init3; + } + kref_get(&hub->kref); /* The superspeed hub except for root hub has to use Hub Depth * value as an offset into the route string to locate the bits @@ -1231,6 +1241,7 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) queue_delayed_work(system_power_efficient_wq, &hub->init_work, msecs_to_jiffies(delay)); + device_unlock(hub->intfdev); return; /* Continues at init3: below */ } else { msleep(delay); @@ -1252,6 +1263,11 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) /* Allow autosuspend if it was suppressed */ if (type <= HUB_INIT3) usb_autopm_put_interface_async(to_usb_interface(hub->intfdev)); + + if (type == HUB_INIT2 || type == HUB_INIT3) + device_unlock(hub->intfdev); + + kref_put(&hub->kref, hub_release); } /* Implement the continuations for the delays above */ From 108aec56a072e3eda977a51cedbe09e25f9cd10e Mon Sep 17 00:00:00 2001 From: Winter Wang Date: Fri, 20 May 2016 11:05:00 +0800 Subject: [PATCH 020/420] ANDROID: usb: gadget: f_midi: set fi->f to NULL when free f_midi function fi->f is set in f_midi's alloc_func, need to clean this to NULL in free_func, otherwise on ConfigFS's function switch, midi->usb_function it self is freed, fi->f will be a wild pointer and run into below kernel panic: --------------- [ 58.950628] Unable to handle kernel paging request at virtual address 63697664 [ 58.957869] pgd = c0004000 [ 58.960583] [63697664] *pgd=00000000 [ 58.964185] Internal error: Oops: 80000005 [#1] PREEMPT SMP ARM [ 58.970111] Modules linked in: [ 58.973191] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.1.15-03504-g34c857c-dirty #89 [ 58.981024] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 58.987557] task: c110bd70 ti: c1100000 task.ti: c1100000 [ 58.992962] PC is at 0x63697664 [ 58.996120] LR is at android_setup+0x78/0x138 <..snip..> [ 60.044980] 1fc0: ffffffff ffffffff c1000684 00000000 00000000 c108ecd0 c11f7294 c11039c0 [ 60.053181] 1fe0: c108eccc c110d148 1000406a 412fc09a 00000000 1000807c 00000000 00000000 [ 60.061420] [] (android_setup) from [] (udc_irq+0x758/0x1034) [ 60.068951] [] (udc_irq) from [] (handle_irq_event_percpu+0x50/0x254) [ 60.077165] [] (handle_irq_event_percpu) from [] (handle_irq_event+0x3c/0x5c) [ 60.086072] [] (handle_irq_event) from [] (handle_fasteoi_irq+0xe0/0x198) [ 60.094630] [] (handle_fasteoi_irq) from [] (generic_handle_irq+0x2c/0x3c) [ 60.103271] [] (generic_handle_irq) from [] (__handle_domain_irq+0x7c/0xec) [ 60.112000] [] (__handle_domain_irq) from [] (gic_handle_irq+0x24/0x5c) -------------- Signed-off-by: Winter Wang --- drivers/usb/gadget/function/f_midi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index ee1bfc905fb940..cc7d59a09abeaf 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -1148,6 +1148,7 @@ static void f_midi_free(struct usb_function *f) for (i = opts->in_ports - 1; i >= 0; --i) kfree(midi->in_port[i]); kfree(midi); + opts->func_inst.f = NULL; --opts->refcnt; mutex_unlock(&opts->lock); } From 5364b5ca0b1a12a58283b51408e43fc36d4e4fe7 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:44:19 -0700 Subject: [PATCH 021/420] ANDROID: dm: fix signature verification flag The bug was that the signature verification was only happening when 
verity was disabled. It should always happen when verity is enabled. Signed-off-by: Badhri Jagan Sridharan Change-Id: I2d9354e240d36ea06fc68c2d18d8e87b823a4c2f --- drivers/md/dm-android-verity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 9c26cbb5f1797b..00275a986d037d 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -667,7 +667,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto free_metadata; } - if (!verity_enabled) { + if (verity_enabled) { err = verify_verity_signature(key_id, metadata); if (err) { From c5c74d0327729f35b576564976885596c6d0e7fb Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:45:45 -0700 Subject: [PATCH 022/420] ANDROID: dm: use default verity public key If the dm-android-verity target does not provide a default key try using the default public key from the system keyring. The defualt verity keyid is passed as a kernel command line argument veritykeyid=. The order of the dm-android-verity params have been reversed to facilitate the change. Old format example: dm="system none ro,0 1 android-verity Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f /dev/mmcblk0p43" New formats supported: dm="system none ro,0 1 android-verity /dev/mmcblk0p43 Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f" (or) dm="system none ro,0 1 android-verity /dev/mmcblk0p43" when veritykeyid= is set in the kernel command line. BUG: 28384658 Signed-off-by: Badhri Jagan Sridharan Change-Id: I506c89b053d835ab579e703eef2bc1f8487250de --- drivers/md/dm-android-verity.c | 67 ++++++++++++++++++++++++---------- drivers/md/dm-android-verity.h | 16 ++++++++ 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 00275a986d037d..097fb2b1de8917 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -43,6 +43,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; static bool target_added; static bool verity_enabled = true; @@ -79,6 +80,19 @@ static int __init verity_mode_param(char *line) __setup("androidboot.veritymode=", verity_mode_param); +static int __init verity_keyid_param(char *line) +{ + strlcpy(veritykeyid, line, sizeof(veritykeyid)); + return 1; +} + +__setup("veritykeyid=", verity_keyid_param); + +static inline bool default_verity_key_id(void) +{ + return veritykeyid[0] != '\0'; +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -608,7 +622,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) dev_t uninitialized_var(dev); struct android_metadata *uninitialized_var(metadata); int err = 0, i, mode; - char *key_id, *table_ptr, dummy, + char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; /* One for specifying number of opt args and one for mode */ sector_t data_sectors; @@ -619,24 +633,34 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; - if (argc != 2) { + if (argc == 1) { + /* Use the default keyid */ + if (default_verity_key_id()) + key_id = veritykeyid; + else { + DMERR("veritykeyid= is not set"); + handle_error(); + return 
-EINVAL; + } + } else if (argc == 2) + key_id = argv[1]; + else { DMERR("Incorrect number of arguments"); handle_error(); return -EINVAL; } - /* should come as one of the arguments for the verity target */ - key_id = argv[0]; - strreplace(argv[0], '#', ' '); + strreplace(key_id, '#', ' '); + target_device = argv[0]; - dev = name_to_dev_t(argv[1]); + dev = name_to_dev_t(target_device); if (!dev) { - DMERR("no dev found for %s", argv[1]); + DMERR("no dev found for %s", target_device); handle_error(); return -EINVAL; } - DMINFO("key:%s dev:%s", argv[0], argv[1]); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { DMERR("Error while extracting fec header"); @@ -734,30 +758,33 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* Setup linear target and free */ if (!verity_enabled) { - err = add_as_linear_device(ti, argv[1]); + err = add_as_linear_device(ti, target_device); goto free_metadata; } /*substitute data_dev and hash_dev*/ - verity_table_args[1] = argv[1]; - verity_table_args[2] = argv[1]; + verity_table_args[1] = target_device; + verity_table_args[2] = target_device; mode = verity_mode(); if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) { if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, - "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, - 1 + VERITY_TABLE_OPT_FEC_ARGS, - mode == DM_VERITY_MODE_RESTART ? - VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING, - argv[1], ecc.start / FEC_BLOCK_SIZE, ecc.blocks, - ecc.roots); + "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, + 1 + VERITY_TABLE_OPT_FEC_ARGS, + mode == DM_VERITY_MODE_RESTART ? + VERITY_TABLE_OPT_RESTART : + VERITY_TABLE_OPT_LOGGING, + target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } else { err = snprintf(buf, FEC_ARG_LENGTH, - "%u " VERITY_TABLE_OPT_FEC_FORMAT, - VERITY_TABLE_OPT_FEC_ARGS, argv[1], - ecc.start / FEC_BLOCK_SIZE, ecc.blocks, ecc.roots); + "%u " VERITY_TABLE_OPT_FEC_FORMAT, + VERITY_TABLE_OPT_FEC_ARGS, target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } } else if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index efb7965248968c..43655ee0f813fb 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -27,6 +27,22 @@ #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +/* + * : is the format for the identifier. + * subject can either be the Common Name(CN) + Organization Name(O) or + * just the CN if the it is prefixed with O + * From https://tools.ietf.org/html/rfc5280#appendix-A + * ub-organization-name-length INTEGER ::= 64 + * ub-common-name-length INTEGER ::= 64 + * + * http://lxr.free-electrons.com/source/crypto/asymmetric_keys/x509_cert_parser.c?v=3.9#L278 + * ctx->o_size + 2 + ctx->cn_size + 1 + * + 41 characters for ":" and sha1 id + * 64 + 2 + 64 + 1 + 1 + 40 (172) + * setting VERITY_DEFAULT_KEY_ID_LENGTH to 200 characters. + */ +#define VERITY_DEFAULT_KEY_ID_LENGTH 200 + #define FEC_MAGIC 0xFECFECFE #define FEC_BLOCK_SIZE (4 * 1024) #define FEC_VERSION 0 From 925d82a466131093dee9a301372f4c29a28d948b Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 19 Jan 2016 21:35:15 +0000 Subject: [PATCH 023/420] BACKPORT: perf tools: Document the perf sysctls perf_event_paranoid was only documented in source code and a perf error message. Copy the documentation from the error message to Documentation/sysctl/kernel.txt. 
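(For context, and purely as an illustration rather than part of this change: the sysctl is exposed at /proc/sys/kernel/perf_event_paranoid, so its current value can be checked with

    cat /proc/sys/kernel/perf_event_paranoid

and interpreted against the level descriptions added below.)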
perf_cpu_time_max_percent was already documented but missing from the list at the top, so add it there. Signed-off-by: Ben Hutchings Cc: Peter Zijlstra Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20160119213515.GG2637@decadent.org.uk [ Remove reference to external Documentation file, provide info inline, as before ] Signed-off-by: Arnaldo Carvalho de Melo Bug: 29054680 Change-Id: I13e73cfb2ad761c94762d0c8196df7725abdf5c5 --- Documentation/sysctl/kernel.txt | 41 ++++++++++++++++++++++----------- tools/perf/util/evsel.c | 15 +++++++----- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 57baff5bdb806b..fb7082cda511dd 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -54,8 +54,10 @@ show up in /proc/sys/kernel: - overflowuid - panic - panic_on_oops -- panic_on_unrecovered_nmi - panic_on_stackoverflow +- panic_on_unrecovered_nmi +- perf_cpu_time_max_percent +- perf_event_paranoid - pid_max - powersave-nap [ PPC only ] - printk @@ -527,19 +529,6 @@ the recommended setting is 60. ============================================================== -panic_on_unrecovered_nmi: - -The default Linux behaviour on an NMI of either memory or unknown is -to continue operation. For many environments such as scientific -computing it is preferable that the box is taken out and the error -dealt with than an uncorrected parity/ECC error get propagated. - -A small number of systems do generate NMI's for bizarre random reasons -such as power management so the default is off. That sysctl works like -the existing panic controls already in that directory. - -============================================================== - panic_on_oops: Controls the kernel's behaviour when an oops or BUG is encountered. @@ -563,6 +552,19 @@ This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled. ============================================================== +panic_on_unrecovered_nmi: + +The default Linux behaviour on an NMI of either memory or unknown is +to continue operation. For many environments such as scientific +computing it is preferable that the box is taken out and the error +dealt with than an uncorrected parity/ECC error get propagated. + +A small number of systems do generate NMI's for bizarre random reasons +such as power management so the default is off. That sysctl works like +the existing panic controls already in that directory. + +============================================================== + perf_cpu_time_max_percent: Hints to the kernel how much CPU time it should be allowed to @@ -589,6 +591,17 @@ allowed to execute. ============================================================== +perf_event_paranoid: + +Controls use of the performance events system by unprivileged +users (without CAP_SYS_ADMIN). The default value is 1. 
+ + -1: Allow use of (almost) all events by all users +>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK +>=1: Disallow CPU event access by users without CAP_SYS_ADMIN +>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN + +============================================================== pid_max: diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 2f9e68025ede66..02333234a0960e 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2067,12 +2067,15 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, case EPERM: case EACCES: return scnprintf(msg, size, - "You may not have permission to collect %sstats.\n" - "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n" - " -1 - Not paranoid at all\n" - " 0 - Disallow raw tracepoint access for unpriv\n" - " 1 - Disallow cpu events for unpriv\n" - " 2 - Disallow kernel profiling for unpriv", + "You may not have permission to collect %sstats.\n\n" + "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" + "which controls use of the performance events system by\n" + "unprivileged users (without CAP_SYS_ADMIN).\n\n" + "The default value is 1:\n\n" + " -1: Allow use of (almost) all events by all users\n" + ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n" + ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" + ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN", target->system_wide ? "system-wide " : ""); case ENOENT: return scnprintf(msg, size, "The %s event is not supported.", From 012b0adcf7299f6509d4984cf46ee11e6eaed4e4 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 29 May 2016 14:22:32 -0700 Subject: [PATCH 024/420] FROMLIST: security,perf: Allow further restriction of perf_event_open When kernel.perf_event_open is set to 3 (or greater), disallow all access to performance events by users without CAP_SYS_ADMIN. Add a Kconfig symbol CONFIG_SECURITY_PERF_EVENTS_RESTRICT that makes this value the default. This is based on a similar feature in grsecurity (CONFIG_GRKERNSEC_PERF_HARDEN). This version doesn't include making the variable read-only. It also allows enabling further restriction at run-time regardless of whether the default is changed. https://lkml.org/lkml/2016/1/11/587 Signed-off-by: Ben Hutchings Bug: 29054680 Change-Id: Iff5bff4fc1042e85866df9faa01bce8d04335ab8 --- Documentation/sysctl/kernel.txt | 4 +++- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 8 ++++++++ security/Kconfig | 9 +++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index fb7082cda511dd..d60dbf7176d0f3 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -594,12 +594,14 @@ allowed to execute. perf_event_paranoid: Controls use of the performance events system by unprivileged -users (without CAP_SYS_ADMIN). The default value is 1. +users (without CAP_SYS_ADMIN). The default value is 3 if +CONFIG_SECURITY_PERF_EVENTS_RESTRICT is set, or 1 otherwise. 
-1: Allow use of (almost) all events by all users >=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK >=1: Disallow CPU event access by users without CAP_SYS_ADMIN >=2: Disallow kernel profiling by users without CAP_SYS_ADMIN +>=3: Disallow all event access by users without CAP_SYS_ADMIN ============================================================== diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 893a0d07986f52..948b07d0cd718d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -749,6 +749,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, loff_t *ppos); +static inline bool perf_paranoid_any(void) +{ + return sysctl_perf_event_paranoid > 2; +} + static inline bool perf_paranoid_tracepoint_raw(void) { return sysctl_perf_event_paranoid > -1; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cd5eef1fcddf3..da4ed38336b39f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -170,8 +170,13 @@ static struct srcu_struct pmus_srcu; * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv + * 3 - disallow all unpriv perf event use */ +#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT +int sysctl_perf_event_paranoid __read_mostly = 3; +#else int sysctl_perf_event_paranoid __read_mostly = 1; +#endif /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ @@ -7237,6 +7242,9 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; + if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; diff --git a/security/Kconfig b/security/Kconfig index beb86b500adffd..37e537db577cc8 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -18,6 +18,15 @@ config SECURITY_DMESG_RESTRICT If you are unsure how to answer this question, answer N. +config SECURITY_PERF_EVENTS_RESTRICT + bool "Restrict unprivileged use of performance events" + depends on PERF_EVENTS + help + If you say Y here, the kernel.perf_event_paranoid sysctl + will be set to 3 by default, and no unprivileged use of the + perf_event_open syscall will be permitted unless it is + changed. + config SECURITY bool "Enable different security models" depends on SYSFS From 15c897f31ba18f67559d6b7f1a6afa855baa756c Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Wed, 1 Jun 2016 13:44:47 -0700 Subject: [PATCH 025/420] ANDROID: restrict access to perf events Add: CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y to android-base.cfg The kernel.perf_event_paranoid sysctl is set to 3 by default. No unprivileged use of the perf_event_open syscall will be permitted unless it is changed. 
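To sketch the resulting behaviour (the command below is illustrative, not part of this change): with CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y the paranoid level defaults to 3, so perf_event_open() from an unprivileged process returns -EACCES until an administrator lowers the level at run time, e.g.

    echo 2 > /proc/sys/kernel/perf_event_paranoid

after which only the usual per-level restrictions (raw tracepoints, CPU events, kernel profiling) apply.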
Bug: 29054680 Change-Id: Ie7512259150e146d8e382dc64d40e8faaa438917 --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 85e4a932f5ef67..f820d56b39ccd2 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -143,6 +143,7 @@ CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y CONFIG_SECURITY_SELINUX=y CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y From 21c0fe9f24b7707d2b49401f8c740c3e35c580ea Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jun 2016 14:06:14 -0700 Subject: [PATCH 026/420] ANDROID: dm verity fec: limit error correction recursion If verity tree itself is sufficiently corrupted in addition to data blocks, it's possible for error correction to end up in a deep recursive error correction loop that eventually causes a kernel panic as follows: [ 14.728962] [] verity_fec_decode+0xa8/0x138 [ 14.734691] [] verity_verify_level+0x11c/0x180 [ 14.740681] [] verity_hash_for_block+0x88/0xe0 [ 14.746671] [] fec_decode_rsb+0x318/0x75c [ 14.752226] [] verity_fec_decode+0xa8/0x138 [ 14.757956] [] verity_verify_level+0x11c/0x180 [ 14.763944] [] verity_hash_for_block+0x88/0xe0 This change limits the recursion to a reasonable level during a single I/O operation. Bug: 28943429 Signed-off-by: Sami Tolvanen Change-Id: I0a7ebff331d259c59a5e03c81918cc1613c3a766 (cherry picked from commit f4b9e40597e73942d2286a73463c55f26f61bfa7) --- drivers/md/dm-verity-fec.c | 11 ++++++++++- drivers/md/dm-verity-fec.h | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index ad10d6d8ed28dd..b26809a47ca37d 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -442,6 +442,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; + if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { + DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); + return -EIO; + } + + fio->level++; + if (type == DM_VERITY_BLOCK_TYPE_METADATA) block += v->data_blocks; @@ -475,7 +482,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (r < 0) { r = fec_decode_rsb(v, io, fio, rsb, offset, true); if (r < 0) - return r; + goto done; } if (dest) @@ -485,6 +492,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, r = verity_for_bv_block(v, io, iter, fec_bv_copy); } +done: + fio->level--; return r; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 8c4bee052a7354..b8e21cef3ad193 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -28,6 +28,9 @@ #define DM_VERITY_FEC_BUF_MAX \ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) +/* maximum recursion level for verity_fec_decode */ +#define DM_VERITY_FEC_MAX_RECURSION 4 + #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" #define DM_VERITY_OPT_FEC_START "fec_start" @@ -61,6 +64,7 @@ struct dm_verity_fec_io { unsigned nbufs; /* number of buffers allocated */ u8 *output; /* buffer for corrected output */ size_t output_pos; + unsigned level; /* recursion level */ }; #ifdef CONFIG_DM_VERITY_FEC From 71a50776c381cb2a57b7df1b9820f92e6bb08219 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jun 2016 14:22:46 -0700 Subject: [PATCH 027/420] ANDROID: dm verity fec: add missing release 
from fec_ktype Add a release function to allow destroying the dm-verity device. Bug: 27928374 Signed-off-by: Sami Tolvanen Change-Id: Ic0f7c17e4889c5580d70b52d9a709a37165a5747 (cherry picked from commit 0039ccf47c8f99888f7b71b2a36a68a027fbe357) --- drivers/md/dm-verity-fec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index b26809a47ca37d..454535d23a7f4b 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -689,7 +689,8 @@ static struct attribute *fec_attrs[] = { static struct kobj_type fec_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = fec_attrs + .default_attrs = fec_attrs, + .release = dm_kobject_release }; /* From 2261cfcefb17e99f69dbc564c6aaa06d77738bad Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Tue, 10 May 2016 13:10:20 +0200 Subject: [PATCH 028/420] ANDROID: dm verity fec: pack the fec_header structure The fec_header structure is generated build time and stored on disk. The fec_header might be build on a 64 bits machine while it is read per a 32 bits device or the other way around. In such situations, the fec_header fields are not aligned as expected by the device and it fails to read the fec_header structure. This patch makes the fec_header packed. Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf06 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 43655ee0f813fb..52c48df94c0509 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -71,9 +71,6 @@ * if fec is not present * */ -/* TODO: rearrange structure to reduce memory holes - * depends on userspace change. - */ struct fec_header { __le32 magic; __le32 version; @@ -82,7 +79,7 @@ struct fec_header { __le32 fec_size; __le64 inp_size; u8 hash[SHA256_DIGEST_SIZE]; -}; +} __attribute__((packed)); struct android_metadata_header { __le32 magic_number; From bfc3cd72af24bba6b6705f8aa633df7b4d522bf7 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 17 Jun 2016 11:22:03 -0700 Subject: [PATCH 029/420] ANDROID: dm verity fec: fix RS block calculation A call to do_div was changed in Linux 4.5 to div64_u64 in verity_fec_decode, which broke RS block calculation due to incompatible semantics. This change fixes the computation. Bug: 21893453 Change-Id: Idb88b901e0209c2cccc9c0796689f780592d58f9 Signed-off-by: Sami Tolvanen (cherry picked from commit 879aac93eebcc2862d71afa9eca3a0c0f51b3b01) --- drivers/md/dm-verity-fec.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 454535d23a7f4b..a1e8571ce31450 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -463,9 +463,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, */ offset = block << v->data_dev_block_bits; - - res = offset; - div64_u64(res, v->fec->rounds << v->data_dev_block_bits); + res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits); /* * The base RS block we can feed to the interleaver to find out all From 9629765fd5b880c444cbfb870936870a5ea42b5f Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 17 Jun 2016 11:31:17 -0700 Subject: [PATCH 030/420] ANDROID: dm verity fec: initialize recursion level Explicitly initialize recursion level to zero at the beginning of each I/O operation. 
Bug: 28943429 Change-Id: I00c612be2b8c22dd5afb65a739551df91cb324fc Signed-off-by: Sami Tolvanen (cherry picked from commit 32ffb3a22d7fd269b2961323478ece92c06a8334) --- drivers/md/dm-verity-fec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index a1e8571ce31450..1dd667b975307e 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -532,6 +532,7 @@ void verity_fec_init_io(struct dm_verity_io *io) memset(fio->bufs, 0, sizeof(fio->bufs)); fio->nbufs = 0; fio->output = NULL; + fio->level = 0; } /* From 224c2acd13bf8ebe5bedffdeb1483eb3ff771d5e Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 7 Mar 2016 11:31:10 +0100 Subject: [PATCH 031/420] UPSTREAM: usbnet: cleanup after bind() in probe() (cherry pick from commit 1666984c8625b3db19a9abc298931d35ab7bc64b) In case bind() works, but a later error forces bailing in probe() in error cases work and a timer may be scheduled. They must be killed. This fixes an error case related to the double free reported in http://www.spinics.net/lists/netdev/msg367669.html and needs to go on top of Linus' fix to cdc-ncm. Signed-off-by: Oliver Neukum Signed-off-by: David S. Miller Bug: 28744625 --- drivers/net/usb/usbnet.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 3a6770a65d7836..cbedffc2ec50c2 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1743,6 +1743,13 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) if (info->unbind) info->unbind (dev, udev); out1: + /* subdrivers must undo all they did in bind() if they + * fail it, but we may fail later and a deferred kevent + * may trigger an error resubmitting itself and, worse, + * schedule a timer. So we kill it all just in case. + */ + cancel_work_sync(&dev->kevent); + del_timer_sync(&dev->delay); free_netdev(net); out: return status; From 18dbfe4b134275d2fb7ccb313875e37193d086e4 Mon Sep 17 00:00:00 2001 From: Thierry Strudel Date: Tue, 14 Jun 2016 17:46:44 -0700 Subject: [PATCH 032/420] cpu: send KOBJ_ONLINE event when enabling cpus In case some sysfs nodes needs to be labeled with a different label than sysfs then user needs to be notified when a core is brought back online. Bug: 29359497 Change-Id: I0395c86e01cd49c348fda8f93087d26f88557c91 Signed-off-by: Thierry Strudel --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index f85feb62c73a10..28cb74db9646d3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -591,6 +591,7 @@ void __weak arch_enable_nonboot_cpus_end(void) void __ref enable_nonboot_cpus(void) { int cpu, error; + struct device *cpu_device; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); @@ -608,6 +609,12 @@ void __ref enable_nonboot_cpus(void) trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); + cpu_device = get_cpu_device(cpu); + if (!cpu_device) + pr_err("%s: failed to get cpu%d device\n", + __func__, cpu); + else + kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE); continue; } pr_warn("Error taking CPU%d up: %d\n", cpu, error); From e47d455f06a29624932f9a5f9a0e2d2064fcb1cf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 12 Jun 2015 13:58:52 +0200 Subject: [PATCH 033/420] BACKPORT: netfilter: Kconfig: get rid of parens around depends on (cherry pick from commit f09becc79f899f92557ce6d5562a8b80d6addb34) According to the reporter, they are not needed. 
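For reference, the rewrite leans on two general Kconfig rules rather than anything netfilter-specific: parentheses around a complete "depends on" expression are redundant, and several "depends on" lines on one symbol are ANDed together, so

    depends on NETFILTER_ADVANCED
    depends on IP_NF_MANGLE || IP_NF_RAW

carries the same meaning as the single parenthesised dependency it replaces.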
Reported-by: Sergei Shtylyov Signed-off-by: Pablo Neira Ayuso Change-Id: Ic8c3d994ca89dfe9d031a0f111496ff031c2368e --- net/ipv4/netfilter/Kconfig | 3 ++- net/ipv6/netfilter/Kconfig | 3 ++- net/netfilter/Kconfig | 14 +++++++------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4c019d5c3f5759..a6847b99586acc 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -182,7 +182,8 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE || IP_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6af874fc187f64..5ea61508bc6a83 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -173,7 +173,8 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP6_NF_MANGLE || IP6_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 6b099d15585e97..fdf41c5e50ea29 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -198,7 +198,7 @@ config NF_CONNTRACK_FTP config NF_CONNTRACK_H323 tristate "H.323 protocol support" - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on NETFILTER_ADVANCED help H.323 is a VoIP signalling protocol from ITU-T. As one of the most @@ -705,7 +705,7 @@ config NETFILTER_XT_TARGET_HL config NETFILTER_XT_TARGET_HMARK tristate '"HMARK" target support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED ---help--- This option adds the "HMARK" target. @@ -846,7 +846,7 @@ config NETFILTER_XT_TARGET_REDIRECT config NETFILTER_XT_TARGET_TEE tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK ---help--- This option adds a "TEE" target with which a packet can be cloned and @@ -856,7 +856,7 @@ config NETFILTER_XT_TARGET_TPROXY tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES @@ -895,7 +895,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the @@ -1107,7 +1107,7 @@ config NETFILTER_XT_MATCH_ESP config NETFILTER_XT_MATCH_HASHLIMIT tristate '"hashlimit" match support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED help This option adds a `hashlimit' match. 
@@ -1391,7 +1391,7 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help From c754cf53ff1b9ee932f5f68d64d262425ee57463 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:46:24 -0400 Subject: [PATCH 034/420] BACKPORT: net: fix infoleak in rtnetlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 5f8e44741f9f216e33736ea4ec65ca9ac03036e6) The stack object “map” has a total size of 32 bytes. Its last 4 bytes are padding generated by compiler. These padding bytes are not initialized and sent out via “nla_put”. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller Bug: 28620102 Change-Id: If3f19c8ec398adc5ef6fc779328141e2de87772e --- net/core/rtnetlink.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 76321ea442c3e0..73d14f32f55ad5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1018,14 +1018,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; if (1) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; + struct rtnl_link_ifmap map; + + memset(&map, 0, sizeof(map)); + map.mem_start = dev->mem_start; + map.mem_end = dev->mem_end; + map.base_addr = dev->base_addr; + map.irq = dev->irq; + map.dma = dev->dma; + map.port = dev->if_port; + if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) goto nla_put_failure; } From aa498f1e3fa3050c6be057dbc70ac66a66014d64 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 19:37:57 -0800 Subject: [PATCH 035/420] BACKPORT: ipv6: add complete rcu protection around np->opt (cherry pick from commit 45f6fad84cc305103b28d73482b344d7f5b76f39) This patch addresses multiple problems : UDP/RAW sendmsg() need to get a stable struct ipv6_txoptions while socket is not locked : Other threads can change np->opt concurrently. Dmitry posted a syzkaller (http://github.com/google/syzkaller) program desmonstrating use-after-free. Starting with TCP/DCCP lockless listeners, tcp_v6_syn_recv_sock() and dccp_v6_request_recv_sock() also need to use RCU protection to dereference np->opt once (before calling ipv6_dup_options()) This patch adds full RCU protection to np->opt Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. 
Miller Bug: 28746669 Change-Id: I9654ba44a4a710a08f8b23bfd8205d205a95607c --- include/linux/ipv6.h | 2 +- include/net/ipv6.h | 21 ++++++++++++++++++- net/dccp/ipv6.c | 33 ++++++++++++++++++----------- net/ipv6/af_inet6.c | 13 ++++++++---- net/ipv6/datagram.c | 4 +++- net/ipv6/exthdrs.c | 3 ++- net/ipv6/inet6_connection_sock.c | 11 +++++++--- net/ipv6/ipv6_sockglue.c | 36 +++++++++++++++++++++----------- net/ipv6/raw.c | 8 +++++-- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 28 +++++++++++++++---------- net/ipv6/udp.c | 8 +++++-- net/l2tp/l2tp_ip6.c | 8 +++++-- 13 files changed, 124 insertions(+), 53 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 10e14afc7cf732..617022114d65d5 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -215,7 +215,7 @@ struct ipv6_pinfo { struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; - struct ipv6_txoptions *opt; + struct ipv6_txoptions __rcu *opt; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; struct { diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4292929392b012..f8047976caf461 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -207,6 +207,7 @@ extern rwlock_t ip6_ra_lock; */ struct ipv6_txoptions { + atomic_t refcnt; /* Length of this structure */ int tot_len; @@ -219,7 +220,7 @@ struct ipv6_txoptions { struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; - + struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */ }; @@ -252,6 +253,24 @@ struct ipv6_fl_socklist { struct rcu_head rcu; }; +static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) +{ + struct ipv6_txoptions *opt; + + rcu_read_lock(); + opt = rcu_dereference(np->opt); + if (opt && !atomic_inc_not_zero(&opt->refcnt)) + opt = NULL; + rcu_read_unlock(); + return opt; +} + +static inline void txopt_put(struct ipv6_txoptions *opt) +{ + if (opt && atomic_dec_and_test(&opt->refcnt)) + kfree_rcu(opt, rcu); +} + struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label); struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6bcaa33cd804d2..7bcb223178415e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -238,7 +238,9 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -255,7 +257,10 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; - err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + rcu_read_lock(); + err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); + rcu_read_unlock(); err = net_xmit_eval(err); } @@ -450,6 +455,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct inet_sock *newinet; struct dccp6_sock *newdp6; struct sock *newsk; @@ -573,13 +579,15 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, * Yes, keeping reference count would be much more clever, but we make * one more one thing 
there: reattach optmem to newsk. */ - if (np->opt != NULL) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt != NULL) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; dccp_sync_mss(newsk, dst_mtu(dst)); @@ -832,6 +840,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -933,7 +942,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -953,9 +963,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, __ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; - if (np->opt != NULL) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; inet->inet_dport = usin->sin6_port; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4c591c0aa3eec5..bf226f74d16273 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -441,9 +441,11 @@ void inet6_destroy_sock(struct sock *sk) /* Free tx options */ - opt = xchg(&np->opt, NULL); - if (opt != NULL) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } } EXPORT_SYMBOL_GPL(inet6_destroy_sock); @@ -690,7 +692,10 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.flowi6_uid = sock_i_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), + &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index be3df2d5c90c4b..7fe2c55cc7b405 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -168,8 +168,10 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - opt = flowlabel ? flowlabel->opt : np->opt; + rcu_read_lock(); + opt = flowlabel ? 
flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); err = 0; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index bfde361b613400..4f08a0f452eb2c 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -727,6 +727,7 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; + atomic_set(&opt2->refcnt, 1); } return opt2; } @@ -790,7 +791,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); - + atomic_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index c8f730266b705f..ca655c93bd7989 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -77,7 +77,9 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = IPPROTO_TCP; fl6->daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; fl6->flowi6_mark = ireq->ir_mark; @@ -210,7 +212,9 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->flowi6_uid = sock_i_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { @@ -243,7 +247,8 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused /* Restore final destination back after routing done */ fl6.daddr = sk->sk_v6_daddr; - res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); rcu_read_unlock(); return res; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e1a9583bb4191f..f81fcc09ea6c86 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -110,10 +110,12 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } - opt = xchg(&inet6_sk(sk)->opt, opt); + opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, + opt); } else { spin_lock(&sk->sk_dst_lock); - opt = xchg(&inet6_sk(sk)->opt, opt); + opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, + opt); spin_unlock(&sk->sk_dst_lock); } sk_dst_reset(sk); @@ -213,9 +215,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, + NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } pktopt = xchg(&np->pktoptions, NULL); kfree_skb(pktopt); @@ -385,7 +390,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) break; - opt = ipv6_renew_options(sk, np->opt, optname, + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = ipv6_renew_options(sk, opt, 
optname, (struct ipv6_opt_hdr __user *)optval, optlen); if (IS_ERR(opt)) { @@ -414,8 +420,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; opt = ipv6_update_options(sk, opt); sticky_done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } @@ -468,6 +476,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; memset(opt, 0, sizeof(*opt)); + atomic_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_user(opt+1, optval, optlen)) @@ -484,8 +493,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; opt = ipv6_update_options(sk, opt); done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } case IPV6_UNICAST_HOPS: @@ -1092,10 +1103,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_RTHDR: case IPV6_DSTOPTS: { + struct ipv6_txoptions *opt; lock_sock(sk); - len = ipv6_getsockopt_sticky(sk, np->opt, - optname, optval, len); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index e332cf1e8b415d..bc3459d49e6d8c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -735,6 +735,7 @@ static int rawv6_probe_proto_opt(struct flowi6 *fl6, struct msghdr *msg) static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; @@ -841,8 +842,10 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } - if (opt == NULL) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -903,6 +906,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? 
err : len; do_confirm: dst_confirm(dst); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 088a6b840dca4b..b52a246d73b3e1 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -241,7 +241,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(&fl6, np->opt, &final); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = ireq->ir_mark; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4b0090f2aaa9a3..82344190de0021 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -134,6 +134,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct rt6_info *rt; struct flowi6 fl6; struct dst_entry *dst; @@ -254,7 +255,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sock_i_uid(sk); - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); @@ -283,9 +285,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; - if (np->opt) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -502,7 +504,8 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); skb_set_queue_mapping(skb, queue_mapping); - err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), + np->tclass); err = net_xmit_eval(err); } @@ -1052,6 +1055,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; @@ -1192,13 +1196,15 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, but we make one more one thing there: reattach optmem to newsk. 
*/ - if (np->opt) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 71e7acf6dc9f25..ab9d273efb41be 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1082,6 +1082,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; + struct ipv6_txoptions *opt_to_free = NULL; struct ip6_flowlabel *flowlabel = NULL; struct flowi6 fl6; struct dst_entry *dst; @@ -1235,8 +1236,10 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, opt = NULL; connected = 0; } - if (opt == NULL) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -1330,6 +1333,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, out: dst_release(dst); fl6_sock_release(flowlabel); + txopt_put(opt_to_free); if (!err) return len; /* diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 0edb263cc002e8..38658826175ca6 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -487,6 +487,7 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk, DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; @@ -576,8 +577,10 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk, opt = NULL; } - if (opt == NULL) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -632,6 +635,7 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk, dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? err : len; From dde45ad805c19a442c6e994c3936ea88f5b0d001 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Mar 2015 17:41:43 +0000 Subject: [PATCH 036/420] UPSTREAM: net: validate the range we feed to iov_iter_init() in sys_sendto/sys_recvfrom (cherry pick from commit 4de930efc23b92ddf88ce91c405ee645fe6e27ea) Cc: stable@vger.kernel.org # v3.19 Signed-off-by: Al Viro Signed-off-by: David S. 
Miller Bug: 28759139 Change-Id: I556eab62bc545f4382f93d0c721df342bbe76787 --- net/socket.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/socket.c b/net/socket.c index fe20c319a0bb37..1de8b265e2bc64 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1793,6 +1793,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, if (len > INT_MAX) len = INT_MAX; + if (unlikely(!access_ok(VERIFY_READ, buff, len))) + return -EFAULT; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; @@ -1852,6 +1854,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, if (size > INT_MAX) size = INT_MAX; + if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size))) + return -EFAULT; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; From f7f4e84d034af78e863cc3d7b172ab5c6cd5bb81 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:07 -0400 Subject: [PATCH 037/420] UPSTREAM: ALSA: timer: Fix leak in SNDRV_TIMER_IOCTL_PARAMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit cec8f96e49d9be372fdb0c3836dcf31ec71e457e) The stack object “tread” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980557 Change-Id: I963a8f5f7ae828787c655c9b89121d3844474513 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index f484c643646551..38698c61b4bc30 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1684,6 +1684,7 @@ static int snd_timer_user_params(struct file *file, if (tu->timeri->flags & SNDRV_TIMER_IFLG_EARLY_EVENT) { if (tu->tread) { struct snd_timer_tread tread; + memset(&tread, 0, sizeof(tread)); tread.event = SNDRV_TIMER_EVENT_EARLY; tread.tstamp.tv_sec = 0; tread.tstamp.tv_nsec = 0; From fe09e35726b85ebc0affacd5a69956d22f04b543 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:32 -0400 Subject: [PATCH 038/420] UPSTREAM: ALSA: timer: Fix leak in events via snd_timer_user_tinterrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit e4ec8cc8039a7063e24204299b462bd1383184a5) The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. 
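The 8 leaked bytes come from alignment holes. Below is a standalone userspace sketch (not from the kernel tree) of where the padding sits in a structure shaped like snd_timer_tread; the exact offsets assume a typical LP64 build where struct timespec is 16 bytes with 8-byte alignment.

    #include <stdio.h>
    #include <stddef.h>
    #include <time.h>

    /* Same shape as ALSA's snd_timer_tread: int event, struct timespec
     * tstamp, unsigned int val.  With an 8-byte-aligned timespec there are
     * 4 padding bytes after 'event' and 4 more after 'val'. */
    struct tread_like {
        int event;
        struct timespec tstamp;
        unsigned int val;
    };

    int main(void)
    {
        printf("sizeof           = %zu\n", sizeof(struct tread_like));
        printf("offsetof(event)  = %zu\n", offsetof(struct tread_like, event));
        printf("offsetof(tstamp) = %zu\n", offsetof(struct tread_like, tstamp));
        printf("offsetof(val)    = %zu\n", offsetof(struct tread_like, val));
        /* The gap between event+4 and tstamp, and between val+4 and sizeof,
         * is padding the compiler never stores to; memset() before filling
         * the members is what keeps it from carrying old stack contents. */
        return 0;
    }

Patches 038 and 039 below plug the same two holes in other paths that copy this structure out to user space.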
Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980217 Change-Id: I756d05a328a133c1c67132301434c6817be0a2a6 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 38698c61b4bc30..de8c62d10ceefe 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1219,6 +1219,7 @@ static void snd_timer_user_tinterrupt(struct snd_timer_instance *timeri, } if ((tu->filter & (1 << SNDRV_TIMER_EVENT_RESOLUTION)) && tu->last_resolution != resolution) { + memset(&r1, 0, sizeof(r1)); r1.event = SNDRV_TIMER_EVENT_RESOLUTION; r1.tstamp = tstamp; r1.val = resolution; From 7dac6d1c17d68b0708ea03994ed4d3de914898ff Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:20 -0400 Subject: [PATCH 039/420] UPSTREAM: ALSA: timer: Fix leak in events via snd_timer_user_ccallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 9a47e9cff994f37f7f0dbd9ae23740d0f64f9fe6) The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980217 Change-Id: I0ba03af4d0620bcbc7a808d083295b7c97aba56d --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index de8c62d10ceefe..275b64bf5fd744 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1185,6 +1185,7 @@ static void snd_timer_user_ccallback(struct snd_timer_instance *timeri, tu->tstamp = *tstamp; if ((tu->filter & (1 << event)) == 0 || !tu->tread) return; + memset(&r1, 0, sizeof(r1)); r1.event = event; r1.tstamp = *tstamp; r1.val = resolution; From bf2958684cf7f8d6ca72e8abe4de62ccd4c2e329 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:32:16 -0400 Subject: [PATCH 040/420] UPSTREAM: USB: usbfs: fix potential infoleak in devio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 681fef8380eb818c0b845fca5d2ab1dcbab114ee) The stack object “ci” has a total size of 8 bytes. Its last 3 bytes are padding bytes which are not initialized and leaked to userland via “copy_to_user”. 
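A standalone userspace sketch (not kernel code) of the pattern shared by the four infoleak fixes above (patches 037-040): naming every member in an initializer or assigning them one by one still leaves struct padding unspecified, so the object has to be cleared before it is copied out. The struct below only mimics the layout of usbdevfs_connectinfo, and the stale stack contents are simulated with memcpy() so the output is deterministic.

    #include <stdio.h>
    #include <string.h>

    /* Shaped like usbdevfs_connectinfo: 4-byte devnum, 1-byte slow,
     * then 3 bytes of tail padding (8 bytes total on common ABIs). */
    struct connectinfo {
        unsigned int devnum;
        unsigned char slow;
    };

    static void dump(const char *label, const void *p, size_t len)
    {
        const unsigned char *b = p;
        printf("%-20s", label);
        for (size_t i = 0; i < len; i++)
            printf(" %02x", b[i]);
        putchar('\n');
    }

    int main(void)
    {
        unsigned char stale[sizeof(struct connectinfo)];
        struct connectinfo ci;

        memset(stale, 0xaa, sizeof(stale));      /* stand-in for old stack data */

        /* Like the original code: fill only the named members.  The compiler
         * is never required to clear the 3 trailing padding bytes. */
        memcpy(&ci, stale, sizeof(ci));          /* make the "leak" deterministic */
        ci.devnum = 5;
        ci.slow = 0;
        dump("members only:", &ci, sizeof(ci));  /* ends in aa aa aa */

        /* The fix: clear the whole object first, then fill the members. */
        memset(&ci, 0, sizeof(ci));
        ci.devnum = 5;
        ci.slow = 0;
        dump("memset + members:", &ci, sizeof(ci));
        return 0;
    }

Even the designated initializer that proc_connectinfo() used before this patch is allowed to leave the padding bytes untouched, which is why the fix switches to an explicit memset() followed by member assignments.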
Signed-off-by: Kangjie Lu Signed-off-by: Greg Kroah-Hartman Bug: 28619695 Change-Id: I170754d659d0891c075f85211b5e3970b114f097 --- drivers/usb/core/devio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 0b59731c302133..c6b0894659499c 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1201,10 +1201,11 @@ static int proc_getdriver(struct usb_dev_state *ps, void __user *arg) static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg) { - struct usbdevfs_connectinfo ci = { - .devnum = ps->dev->devnum, - .slow = ps->dev->speed == USB_SPEED_LOW - }; + struct usbdevfs_connectinfo ci; + + memset(&ci, 0, sizeof(ci)); + ci.devnum = ps->dev->devnum; + ci.slow = ps->dev->speed == USB_SPEED_LOW; if (copy_to_user(arg, &ci, sizeof(ci))) return -EFAULT; From d859cfb2086d5d5f769d1dd040bb7447bba0f7da Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 27 Jun 2016 11:15:19 +0530 Subject: [PATCH 041/420] ANDROID: usb: gadget: fix skip Host MAC address reassignment Change-Id: I43add9925e9d6d90c56cffbd3ed999104448f818 to skip reassignment of Host MAC address at the time of reconnection, is broken on newer (>3.10) AOSP/kernel/common trees. It updates deprecated gether_setup_name() which is no longer used by RNDIS. gether_setup_name_default() function need to be updated instead. Fixes: cc0be22f98d1 ("usb: gadget: prevent change of Host MAC address of 'usb0' interface") Signed-off-by: Amit Pundir --- drivers/usb/gadget/function/u_ether.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 715ae93921735f..26f1a5a819f08f 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -1025,8 +1025,10 @@ struct net_device *gether_setup_name_default(const char *netname) eth_random_addr(dev->dev_mac); pr_warn("using random %s ethernet address\n", "self"); - eth_random_addr(dev->host_mac); - pr_warn("using random %s ethernet address\n", "host"); + if (get_host_ether_addr(host_ethaddr, dev->host_mac)) + pr_warn("using random %s ethernet address\n", "host"); + else + pr_warn("using previous %s ethernet address\n", "host"); net->netdev_ops = ð_netdev_ops; From 66861a10c08913cb080497e4fc0169bbf512ca88 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 22 Jun 2016 16:49:48 +0800 Subject: [PATCH 042/420] netfilter: xt_quota2: make quota2_log work well In upstream commit 7200135bc1e61f1437dc326ae2ef2f310c50b4eb (netfilter: kill ulog targets) http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7200135bc1e6 ipt_ULOG target was removed, meanwhile, the IP_NF_TARGET_ULOG Kconfig and ipt_ULOG.h header file were removed too. This causes we cannot enable QUOTA2_LOG, and netd complains this error: "Unable to open quota socket". So when we reach the quota2 limit, userspace will not be notified with this event. Since IP_NF_TARGET_ULOG was removed, we need not depend on "IP_NF_TARGET_ULOG=n", and for compatibility, add ulog_packet_msg_t related definitions copied from "ipt_ULOG.h". 
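The definitions are copied verbatim rather than redeclared presumably because existing userspace readers of the quota socket (netd, per the message above) expect the same message layout. A standalone sketch, assuming the usual IFNAMSIZ of 16, that prints the layout the copied struct preserves; fields typed unsigned long and size_t make the sizes differ between 32- and 64-bit kernels, a property inherited from the original ipt_ULOG.h definition.

    #include <stdio.h>
    #include <stddef.h>

    #define IFNAMSIZ        16   /* assumed; the usual kernel value */
    #define ULOG_MAC_LEN    80
    #define ULOG_PREFIX_LEN 32

    typedef struct ulog_packet_msg {
        unsigned long mark;
        long timestamp_sec;
        long timestamp_usec;
        unsigned int hook;
        char indev_name[IFNAMSIZ];
        char outdev_name[IFNAMSIZ];
        size_t data_len;
        char prefix[ULOG_PREFIX_LEN];
        unsigned char mac_len;
        unsigned char mac[ULOG_MAC_LEN];
        unsigned char payload[];   /* declared as payload[0] in the kernel copy */
    } ulog_packet_msg_t;

    int main(void)
    {
        printf("sizeof(ulog_packet_msg_t) = %zu\n", sizeof(ulog_packet_msg_t));
        printf("payload starts at offset  = %zu\n",
               offsetof(ulog_packet_msg_t, payload));
        return 0;
    }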
Change-Id: I38132efaabf52bea75dfd736ce734a1b9690e87e Reported-by: Samboo Shen Signed-off-by: Liping Zhang --- net/netfilter/Kconfig | 1 - net/netfilter/xt_quota2.c | 21 ++++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index fdf41c5e50ea29..cd5511ada239ef 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1331,7 +1331,6 @@ config NETFILTER_XT_MATCH_QUOTA2 config NETFILTER_XT_MATCH_QUOTA2_LOG bool '"quota2" Netfilter LOG support' depends on NETFILTER_XT_MATCH_QUOTA2 - depends on IP_NF_TARGET_ULOG=n # not yes, not module, just no default n help This option allows `quota2' to log ONCE when a quota limit diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c index 99592ae56d9b0d..834594aa0085fd 100644 --- a/net/netfilter/xt_quota2.c +++ b/net/netfilter/xt_quota2.c @@ -21,8 +21,27 @@ #include #include + #ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG -#include +/* For compatibility, these definitions are copied from the + * deprecated header file */ +#define ULOG_MAC_LEN 80 +#define ULOG_PREFIX_LEN 32 + +/* Format of the ULOG packets passed through netlink */ +typedef struct ulog_packet_msg { + unsigned long mark; + long timestamp_sec; + long timestamp_usec; + unsigned int hook; + char indev_name[IFNAMSIZ]; + char outdev_name[IFNAMSIZ]; + size_t data_len; + char prefix[ULOG_PREFIX_LEN]; + unsigned char mac_len; + unsigned char mac[ULOG_MAC_LEN]; + unsigned char payload[0]; +} ulog_packet_msg_t; #endif /** From 1d647c69aea93847db71fa043c053278ebfb8b26 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 17 Jun 2016 18:54:35 -0700 Subject: [PATCH 043/420] ANDROID: dm: mount as linear target if eng build eng builds dont have verity enabled i.e it does even have verity metadata appended to the parition. Therefore add rootdev as linear device and map the entire partition if build variant is "eng". 
(Cherry-picked based on https://partner-android-review.git.corp.google.com/#/c/618690/) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I8f5c2289b842b820ca04f5773525e5449bb3f355 --- drivers/md/dm-android-verity.c | 62 +++++++++++++++++++++++++++++++--- drivers/md/dm-android-verity.h | 1 + 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 097fb2b1de8917..e1a8e284e7e47c 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -44,6 +44,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; +static char buildvariant[BUILD_VARIANT]; static bool target_added; static bool verity_enabled = true; @@ -88,11 +89,26 @@ static int __init verity_keyid_param(char *line) __setup("veritykeyid=", verity_keyid_param); +static int __init verity_buildvariant(char *line) +{ + strlcpy(buildvariant, line, sizeof(buildvariant)); + return 1; +} + +__setup("buildvariant=", verity_buildvariant); + static inline bool default_verity_key_id(void) { return veritykeyid[0] != '\0'; } +static inline bool is_eng(void) +{ + static const char typeeng[] = "eng"; + + return !strncmp(buildvariant, typeeng, sizeof(typeeng)); +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -262,7 +278,7 @@ static int extract_fec_header(dev_t dev, struct fec_header *fec, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("bdev get error"); return PTR_ERR(bdev); } @@ -323,6 +339,24 @@ static void find_metadata_offset(struct fec_header *fec, *metadata_offset = device_size - VERITY_METADATA_SIZE; } +static int find_size(dev_t dev, u64 *device_size) +{ + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR_OR_NULL(bdev)) { + DMERR("blkdev_get_by_dev failed"); + return PTR_ERR(bdev); + } + + *device_size = i_size_read(bdev->bd_inode); + *device_size >>= SECTOR_SHIFT; + + DMINFO("blkdev size in sectors: %llu", *device_size); + blkdev_put(bdev, FMODE_READ); + return 0; +} + static struct android_metadata *extract_metadata(dev_t dev, struct fec_header *fec) { @@ -337,7 +371,7 @@ static struct android_metadata *extract_metadata(dev_t dev, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); return ERR_CAST(bdev); } @@ -632,12 +666,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; + u64 device_size; if (argc == 1) { /* Use the default keyid */ if (default_verity_key_id()) key_id = veritykeyid; - else { + else if (!is_eng()) { DMERR("veritykeyid= is not set"); handle_error(); return -EINVAL; @@ -650,7 +685,6 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - strreplace(key_id, '#', ' '); target_device = argv[0]; dev = name_to_dev_t(target_device); @@ -660,6 +694,26 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } + if (is_eng()) { + err = find_size(dev, &device_size); + if (err) { + DMERR("error finding bdev size"); + handle_error(); + return err; + } + + ti->len = device_size; + err = add_as_linear_device(ti, target_device); + if 
(err) { + handle_error(); + return err; + } + verity_enabled = false; + return 0; + } + + strreplace(key_id, '#', ' '); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 52c48df94c0509..f43b02fbb4759c 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -26,6 +26,7 @@ #define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE) #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +#define BUILD_VARIANT 20 /* * : is the format for the identifier. From 1cf6013fbab50886d4cc1512f3668942ebc07b4c Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 27 Jun 2016 16:25:55 -0700 Subject: [PATCH 044/420] ANDROID: dm: allow adb disable-verity only in userdebug adb disable-verity was allowed when the phone is in the unlocked state. Since the driver is now aware of the build variant, honor "adb disable-verity" only in userdebug builds. (Cherry-picked from https://partner-android-review.git.corp.google.com/#/c/622117) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I7ce9f38d8c7a62361392c5a8ccebb288f8a3a2ea --- drivers/md/dm-android-verity.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index e1a8e284e7e47c..999e75bf2ba00f 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -109,6 +109,14 @@ static inline bool is_eng(void) return !strncmp(buildvariant, typeeng, sizeof(typeeng)); } +static inline bool is_userdebug(void) +{ + static const char typeuserdebug[] = "userdebug"; + + return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug)); +} + + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -499,19 +507,6 @@ const char *find_dt_value(const char *name) return value; } -static bool is_unlocked(void) -{ - static const char unlocked[] = "orange"; - static const char verified_boot_prop[] = "verifiedbootstate"; - const char *value; - - value = find_dt_value(verified_boot_prop); - if (!value) - value = verifiedbootstate; - - return !strncmp(value, unlocked, sizeof(unlocked) - 1); -} - static int verity_mode(void) { static const char enforcing[] = "enforcing"; @@ -531,7 +526,7 @@ static int verify_header(struct android_metadata_header *header) { int retval = -EINVAL; - if (is_unlocked() && le32_to_cpu(header->magic_number) == + if (is_userdebug() && le32_to_cpu(header->magic_number) == VERITY_METADATA_MAGIC_DISABLE) { retval = VERITY_STATE_DISABLE; return retval; From 5eb71fd923dc493d4a36f6f09f3066bfe05ee390 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Jun 2016 13:35:38 -0700 Subject: [PATCH 045/420] UPSTREAM: udp: fix behavior of wrong checksums (cherry pick from commit beb39db59d14990e401e235faf66a6b9b31240b0) We have two problems in UDP stack related to bogus checksums : 1) We return -EAGAIN to application even if receive queue is not empty. This breaks applications using edge trigger epoll() 2) Under UDP flood, we can loop forever without yielding to other processes, potentially hanging the host, especially on non SMP. This patch is an attempt to make things better. We might in the future add extra support for rt applications wanting to better control time spent doing a recv() in a hostile environment. For example we could validate checksums before queuing packets in socket receive queue. 
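Why problem (1) matters: an edge-triggered epoll receiver is only woken when a new datagram arrives, so it drains the queue until EAGAIN and trusts that EAGAIN really means empty. A standalone userspace sketch of such a drain loop follows (error handling trimmed, port number arbitrary). Before this fix, a corrupted-checksum datagram could surface here as EAGAIN even though valid datagrams were still queued behind it, and edge-triggered epoll does not re-announce data that is already queued, so those datagrams stay unread until something else arrives.

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/epoll.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        struct sockaddr_in addr;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        addr.sin_port = htons(9999);              /* arbitrary test port */
        bind(fd, (struct sockaddr *)&addr, sizeof(addr));
        fcntl(fd, F_SETFL, O_NONBLOCK);

        int ep = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN | EPOLLET, .data.fd = fd };
        epoll_ctl(ep, EPOLL_CTL_ADD, fd, &ev);

        char buf[2048];
        for (;;) {
            epoll_wait(ep, &ev, 1, -1);   /* one wakeup per arrival with EPOLLET */
            for (;;) {
                ssize_t n = recv(fd, buf, sizeof(buf), 0);
                if (n >= 0) {
                    printf("datagram: %zd bytes\n", n);
                    continue;
                }
                if (errno == EAGAIN || errno == EWOULDBLOCK)
                    break;   /* taken to mean "queue is empty"; a premature
                              * EAGAIN strands anything still queued, since
                              * no new edge is reported for old data */
                perror("recv");
                return 1;
            }
        }
    }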
Bug: 29507402 Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 ++---- net/ipv6/udp.c | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3d63425f0faa91..dc154ba286d9a7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1337,10 +1337,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ab9d273efb41be..b6240cc999e113 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -514,10 +514,8 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } From d02694ed0b05a174469a042839f048c99bc292cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 8 Jul 2016 11:17:31 -0700 Subject: [PATCH 046/420] UPSTREAM: cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 4d06dd537f95683aba3651098ae288b7cbff8274) usbnet_link_change will call schedule_work and should be avoided if bind is failing. Otherwise we will end up with scheduled work referring to a netdev which has gone away. Instead of making the call conditional, we can just defer it to usbnet_probe, using the driver_info flag made for this purpose. Fixes: 8a34b0ae8778 ("usbnet: cdc_ncm: apply usbnet_link_change") Reported-by: Andrey Konovalov Suggested-by: Linus Torvalds Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller Change-Id: Id9a6d02bdd98bf495d26595cf2cc90e480746186 Bug: 28744625 --- drivers/net/usb/cdc_ncm.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 80a844e0ae0383..91b91a60f9e4f0 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -948,23 +948,12 @@ EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) { - int ret; - /* MBIM backwards compatible function? */ if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM) return -ENODEV; /* The NCM data altsetting is fixed */ - ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM); - - /* - * We should get an event when network connection is "connected" or - * "disconnected". Set network connection in "disconnected" state - * (carrier is OFF) during attach, so the IP network stack does not - * start IPv6 negotiation and more. 
- */ - usbnet_link_change(dev, 0, 0); - return ret; + return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM); } static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max) @@ -1506,7 +1495,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", - .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET, + .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET + | FLAG_LINK_INTR, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1519,7 +1509,7 @@ static const struct driver_info cdc_ncm_info = { static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN, + | FLAG_LINK_INTR | FLAG_WWAN, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1532,7 +1522,7 @@ static const struct driver_info wwan_info = { static const struct driver_info wwan_noarp_info = { .description = "Mobile Broadband Network Device (NO ARP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN | FLAG_NOARP, + | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, From cd9cd935b7c6c353afcacd9d52479b9768611c5a Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 8 Jul 2016 14:15:14 -0700 Subject: [PATCH 047/420] sdcardfs: Truncate packages_gid.list on overflow packages_gid.list was improperly returning the wrong count. Use scnprintf instead, and inform the user that the list was truncated if it is. Bug: 30013843 Change-Id: Ida2b2ef7cd86dd87300bfb4c2cdb6bfe2ee1650d Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/packagelist.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index fead71eac95f29..10b98b1548eadb 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -349,13 +349,20 @@ static ssize_t packages_attr_show(struct config_item *item, struct hashtable_entry *hash_cur; struct hlist_node *h_t; int i; - int count = 0; + int count = 0, written = 0; + char errormsg[] = "\n"; + mutex_lock(&pkgl_data_all->hashtable_lock); - hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) - count += snprintf(page + count, PAGE_SIZE - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); + hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) { + written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); + if (count + written == PAGE_SIZE - sizeof(errormsg)) { + count += scnprintf(page + count, PAGE_SIZE - count, errormsg); + break; + } + count += written; + } mutex_unlock(&pkgl_data_all->hashtable_lock); - return count; } From a2edb9ba59210b30d07819c6b76120fc5a6e9079 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 12 Jun 2016 17:37:52 -0700 Subject: [PATCH 048/420] android-recommended.cfg: enable fstack-protector-strong If compiler has stack protector support, set CONFIG_CC_STACKPROTECTOR_STRONG. 
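CONFIG_CC_STACKPROTECTOR_STRONG builds the kernel with -fstack-protector-strong, which widens canary coverage beyond the char-buffer heuristic of plain -fstack-protector. A standalone userspace sketch of the kinds of functions that, per the GCC documentation, only the strong variant instruments; compiling it once with each flag and comparing the __stack_chk_fail references in the assembly shows the difference.

    #include <stdio.h>

    static void fill(int *p, int n)
    {
        for (int i = 0; i < n; i++)
            p[i] = i + 1;
    }

    /* Plain -fstack-protector only instruments functions with char buffers of
     * at least 8 bytes (by default) or alloca; -fstack-protector-strong also
     * instruments functions like the two below. */

    static int sums_local_array(void)
    {
        int v[4];            /* non-char local array */
        fill(v, 4);
        return v[0] + v[1] + v[2] + v[3];
    }

    static int takes_address_of_local(void)
    {
        int x;
        fill(&x, 1);         /* address of a local escapes to another function */
        return x;
    }

    int main(void)
    {
        printf("%d %d\n", sums_local_array(), takes_address_of_local());
        return 0;
    }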
Bug: 28967314 Change-Id: I588c2d544250e9e4b5082b43c237b8f85b7313ca Signed-off-by: Jeff Vander Stoep --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 8ff85694ed54cd..f4184955ad57ef 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -13,6 +13,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_DEBUG_RODATA=y CONFIG_DM_ANDROID_VERITY=y From 0c916e244ab3b13a29c4949c5c5c643d92f77428 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 6 Jul 2016 17:16:19 -0700 Subject: [PATCH 049/420] ANDROID: dm: android-verity: Verify header before fetching table Move header validation logic before reading the verity_table as an invalid header implies the table is invalid as well. (Cherry-picked from: https://partner-android-review.git.corp.google.com/#/c/625203) BUG: 29940612 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ib34d25c0854202f3e70df0a6d0ef1d96f0250c8e --- drivers/md/dm-android-verity.c | 140 +++++++++++++++++---------------- 1 file changed, 71 insertions(+), 69 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 999e75bf2ba00f..1f4eb099209db7 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -365,12 +365,38 @@ static int find_size(dev_t dev, u64 *device_size) return 0; } -static struct android_metadata *extract_metadata(dev_t dev, - struct fec_header *fec) +static int verify_header(struct android_metadata_header *header) +{ + int retval = -EINVAL; + + if (is_userdebug() && le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE) + return VERITY_STATE_DISABLE; + + if (!(le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_NUMBER) || + (le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE)) { + DMERR("Incorrect magic number"); + return retval; + } + + if (le32_to_cpu(header->protocol_version) != + VERITY_METADATA_VERSION) { + DMERR("Unsupported version %u", + le32_to_cpu(header->protocol_version)); + return retval; + } + + return 0; +} + +static int extract_metadata(dev_t dev, struct fec_header *fec, + struct android_metadata **metadata, + bool *verity_enabled) { struct block_device *bdev; struct android_metadata_header *header; - struct android_metadata *uninitialized_var(metadata); int i; u32 table_length, copy_length, offset; u64 metadata_offset; @@ -381,7 +407,7 @@ static struct android_metadata *extract_metadata(dev_t dev, if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); - return ERR_CAST(bdev); + return -ENODEV; } find_metadata_offset(fec, bdev, &metadata_offset); @@ -399,7 +425,6 @@ static struct android_metadata *extract_metadata(dev_t dev, (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE); if (err) { DMERR("Error while reading verity metadata"); - metadata = ERR_PTR(err); goto blkdev_release; } @@ -418,24 +443,42 @@ static struct android_metadata *extract_metadata(dev_t dev, le32_to_cpu(header->protocol_version), le32_to_cpu(header->table_length)); - metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); - if (!metadata) { + err = verify_header(header); + + if (err == VERITY_STATE_DISABLE) { + DMERR("Mounting root with verity disabled"); + *verity_enabled = false; + /* we would still have to read the metadata to figure out + * the data blocks size. 
Or may be could map the entire + * partition similar to mounting the device. + * + * Reset error as well as the verity_enabled flag is changed. + */ + err = 0; + } else if (err) + goto free_header; + + *metadata = kzalloc(sizeof(**metadata), GFP_KERNEL); + if (!*metadata) { DMERR("kzalloc for metadata failed"); err = -ENOMEM; goto free_header; } - metadata->header = header; + (*metadata)->header = header; table_length = le32_to_cpu(header->table_length); if (table_length == 0 || table_length > (VERITY_METADATA_SIZE - - sizeof(struct android_metadata_header))) + sizeof(struct android_metadata_header))) { + DMERR("table_length too long"); + err = -EINVAL; goto free_metadata; + } - metadata->verity_table = kzalloc(table_length + 1, GFP_KERNEL); + (*metadata)->verity_table = kzalloc(table_length + 1, GFP_KERNEL); - if (!metadata->verity_table) { + if (!(*metadata)->verity_table) { DMERR("kzalloc verity_table failed"); err = -ENOMEM; goto free_metadata; @@ -443,13 +486,15 @@ static struct android_metadata *extract_metadata(dev_t dev, if (sizeof(struct android_metadata_header) + table_length <= PAGE_SIZE) { - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), table_length); } else { copy_length = PAGE_SIZE - sizeof(struct android_metadata_header); - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), copy_length); table_length -= copy_length; @@ -457,13 +502,13 @@ static struct android_metadata *extract_metadata(dev_t dev, i = 1; while (table_length != 0) { if (table_length > PAGE_SIZE) { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), PAGE_SIZE); offset += PAGE_SIZE; table_length -= PAGE_SIZE; } else { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), table_length); table_length = 0; @@ -471,25 +516,23 @@ static struct android_metadata *extract_metadata(dev_t dev, i++; } } - metadata->verity_table[table_length] = '\0'; + (*metadata)->verity_table[table_length] = '\0'; + DMINFO("verity_table: %s", (*metadata)->verity_table); goto free_payload; free_metadata: - kfree(metadata); + kfree(*metadata); free_header: kfree(header); - metadata = ERR_PTR(err); free_payload: for (i = 0; i < payload.number_of_pages; i++) if (payload.page_io[i]) __free_page(payload.page_io[i]); kfree(payload.page_io); - - DMINFO("verity_table: %s", metadata->verity_table); blkdev_release: blkdev_put(bdev, FMODE_READ); - return metadata; + return err; } /* helper functions to extract properties from dts */ @@ -522,34 +565,6 @@ static int verity_mode(void) return DM_VERITY_MODE_EIO; } -static int verify_header(struct android_metadata_header *header) -{ - int retval = -EINVAL; - - if (is_userdebug() && le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE) { - retval = VERITY_STATE_DISABLE; - return retval; - } - - if (!(le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_NUMBER) || - (le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE)) { - DMERR("Incorrect magic number"); - return retval; - } - - if (le32_to_cpu(header->protocol_version) != - VERITY_METADATA_VERSION) { - DMERR("Unsupported version %u", - le32_to_cpu(header->protocol_version)); - return retval; - } - - return 0; -} - static int 
verify_verity_signature(char *key_id, struct android_metadata *metadata) { @@ -649,7 +664,7 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { dev_t uninitialized_var(dev); - struct android_metadata *uninitialized_var(metadata); + struct android_metadata *metadata = NULL; int err = 0, i, mode; char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; @@ -717,26 +732,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - metadata = extract_metadata(dev, &fec); + err = extract_metadata(dev, &fec, &metadata, &verity_enabled); - if (IS_ERR(metadata)) { + if (err) { DMERR("Error while extracting metadata"); handle_error(); - return -EINVAL; - } - - err = verify_header(metadata->header); - - if (err == VERITY_STATE_DISABLE) { - DMERR("Mounting root with verity disabled"); - verity_enabled = false; - /* we would still have to parse the args to figure out - * the data blocks size. Or may be could map the entire - * partition similar to mounting the device. - */ - } else if (err) { - DMERR("Verity header handle error"); - handle_error(); goto free_metadata; } @@ -869,8 +869,10 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } free_metadata: - kfree(metadata->header); - kfree(metadata->verity_table); + if (metadata) { + kfree(metadata->header); + kfree(metadata->verity_table); + } kfree(metadata); return err; } From 4271c9e5b722a7071f7c02297074c98064d79de2 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 8 Jul 2016 16:02:12 -0700 Subject: [PATCH 050/420] UPSTREAM: KEYS: close race between key lookup and freeing (cherry pick from commit a3a8784454692dd72e5d5d34dcdab17b4420e74c) When a key is being garbage collected, it's key->user would get put before the ->destroy() callback is called, where the key is removed from it's respective tracking structures. This leaves a key hanging in a semi-invalid state which leaves a window open for a different task to try an access key->user. An example is find_keyring_by_name() which would dereference key->user for a key that is in the process of being garbage collected (where key->user was freed but ->destroy() wasn't called yet - so it's still present in the linked list). This would cause either a panic, or corrupt memory. Fixes CVE-2014-9529. Signed-off-by: Sasha Levin Signed-off-by: David Howells Change-Id: I23f34be2a0b97de5ee38a66729888d63f3d60c88 Bug: 29510361 --- security/keys/gc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/keys/gc.c b/security/keys/gc.c index 9609a7f0faea2d..c7952375ac5325 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -148,12 +148,12 @@ static noinline void key_gc_unused_keys(struct list_head *keys) if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) atomic_dec(&key->user->nikeys); - key_user_put(key->user); - /* now throw away the key memory */ if (key->type->destroy) key->type->destroy(key); + key_user_put(key->user); + kfree(key->description); #ifdef KEY_DEBUGGING From 3cb3e301c3d36e1bea28f69cbc9629572b527dd7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 1 Jun 2016 10:28:49 -0700 Subject: [PATCH 051/420] ANDROID: sdcardfs: fix itnull.cocci warnings List_for_each_entry has the property that the first argument is always bound to a real list element, never NULL, so testing dentry is not needed. 
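The reasoning behind the coccinelle warning: the iteration cursor is produced by pointer arithmetic (container_of) on list links that always point at real nodes or back at the list head, so it can never be NULL and testing it is dead code. A standalone userspace sketch with simplified stand-ins for the kernel macros (the real list_for_each_entry infers the type with typeof instead of taking it as a parameter):

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    #define list_entry(ptr, type, member) container_of(ptr, type, member)

    #define list_for_each_entry(pos, head, type, member)                 \
        for (pos = list_entry((head)->next, type, member);               \
             &pos->member != (head);                                     \
             pos = list_entry(pos->member.next, type, member))

    struct item { int value; struct list_head node; };

    int main(void)
    {
        struct list_head head = { &head, &head };   /* empty list */
        struct item a = { 1, { NULL, NULL } }, b = { 2, { NULL, NULL } };
        struct item *pos;

        /* link the entries: head <-> a <-> b <-> head */
        head.next = &a.node;   a.node.prev = &head;
        a.node.next = &b.node; b.node.prev = &a.node;
        b.node.next = &head;   head.prev = &b.node;

        list_for_each_entry(pos, &head, struct item, node)
            printf("%d\n", pos->value);   /* pos always points into a real
                                           * entry here; "if (pos)" checks
                                           * are dead code */
        return 0;
    }

Even for an empty list the cursor is a computed (non-NULL) pointer; the loop body simply never runs, which is why the NULL test removed by this patch could never change behaviour.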
Generated by: scripts/coccinelle/iterators/itnull.cocci Change-Id: I51033a2649eb39451862b35b6358fe5cfe25c5f5 Cc: Daniel Rosenberg Signed-off-by: Julia Lawall Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- fs/sdcardfs/derived_perm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 9de45bc54f0eb1..903e89068170f3 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -112,7 +112,7 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) void get_derive_permissions_recursive(struct dentry *parent) { struct dentry *dentry; list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) { - if (dentry && dentry->d_inode) { + if (dentry->d_inode) { mutex_lock(&dentry->d_inode->i_mutex); get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); From 62882e757d95076bbd14371ebfaf1246f0191816 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 11 Jul 2016 14:18:11 -0700 Subject: [PATCH 052/420] UPSTREAM: ASN.1: Fix non-match detection failure on data overrun (cherry pick from commit 0d62e9dd6da45bbf0f33a8617afc5fe774c8f45f) If the ASN.1 decoder is asked to parse a sequence of objects, non-optional matches get skipped if there's no more data to be had rather than a data-overrun error being reported. This is due to the code segment that decides whether to skip optional matches (ie. matches that could get ignored because an element is marked OPTIONAL in the grammar) due to a lack of data also skips non-optional elements if the data pointer has reached the end of the buffer. This can be tested with the data decoder for the new RSA akcipher algorithm that takes three non-optional integers. Currently, it skips the last integer if there is insufficient data. Without the fix, #defining DEBUG in asn1_decoder.c will show something like: next_op: pc=0/13 dp=0/270 C=0 J=0 - match? 30 30 00 - TAG: 30 266 CONS next_op: pc=2/13 dp=4/270 C=1 J=0 - match? 02 02 00 - TAG: 02 257 - LEAF: 257 next_op: pc=5/13 dp=265/270 C=1 J=0 - match? 02 02 00 - TAG: 02 3 - LEAF: 3 next_op: pc=8/13 dp=270/270 C=1 J=0 next_op: pc=11/13 dp=270/270 C=1 J=0 - end cons t=4 dp=270 l=270/270 The next_op line for pc=8/13 should be followed by a match line. This is not exploitable for X.509 certificates by means of shortening the message and fixing up the ASN.1 CONS tags because: (1) The relevant records being built up are cleared before use. (2) If the message is shortened sufficiently to remove the public key, the ASN.1 parse of the RSA key will fail quickly due to a lack of data. (3) Extracted signature data is either turned into MPIs (which cope with a 0 length) or is simpler integers specifying algoritms and suchlike (which can validly be 0); and (4) The AKID and SKID extensions are optional and their removal is handled without risking passing a NULL to asymmetric_key_generate_id(). (5) If the certificate is truncated sufficiently to remove the subject, issuer or serialNumber then the ASN.1 decoder will fail with a 'Cons stack underflow' return. This is not exploitable for PKCS#7 messages by means of removal of elements from such a message from the tail end of a sequence: (1) Any shortened X.509 certs embedded in the PKCS#7 message are survivable as detailed above. (2) The message digest content isn't used if it shows a NULL pointer, similarly, the authattrs aren't used if that shows a NULL pointer. (3) A missing signature results in a NULL MPI - which the MPI routines deal with. 
(4) If data is NULL, it is expected that the message has detached content and that is handled appropriately. (5) If the serialNumber is excised, the unconditional action associated with it will pick up the containing SEQUENCE instead, so no NULL pointer will be seen here. If both the issuer and the serialNumber are excised, the ASN.1 decode will fail with an 'Unexpected tag' return. In either case, there's no way to get to asymmetric_key_generate_id() with a NULL pointer. (6) Other fields are decoded to simple integers. Shortening the message to omit an algorithm ID field will cause checks on this to fail early in the verification process. This can also be tested by snipping objects off of the end of the ASN.1 stream such that mandatory tags are removed - or even from the end of internal SEQUENCEs. If any mandatory tag is missing, the error EBADMSG *should* be produced. Without this patch ERANGE or ENOPKG might be produced or the parse may apparently succeed, perhaps with ENOKEY or EKEYREJECTED being produced later, depending on what gets snipped. Just snipping off the final BIT_STRING or OCTET_STRING from either sample should be a start since both are mandatory and neither will cause an EBADMSG without the patches Reported-by: Marcel Holtmann Signed-off-by: David Howells Tested-by: Marcel Holtmann Reviewed-by: David Woodhouse Change-Id: I4f6003fade25d8c77baafdff3af084c739efa69c Bug: 28751627 --- lib/asn1_decoder.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 1a000bb050f9f9..d60ce8a5365036 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -208,9 +208,8 @@ int asn1_ber_decoder(const struct asn1_decoder *decoder, unsigned char tmp; /* Skip conditional matches if possible */ - if ((op & ASN1_OP_MATCH__COND && - flags & FLAG_MATCHED) || - dp == datalen) { + if ((op & ASN1_OP_MATCH__COND && flags & FLAG_MATCHED) || + (op & ASN1_OP_MATCH__SKIP && dp == datalen)) { pc += asn1_op_lengths[op]; goto next_op; } From 1b0ca7a1be5dd57531a1f1068f85c8f4e6f93ae1 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 11 Jul 2016 15:20:43 -0700 Subject: [PATCH 053/420] UPSTREAM: ipv4: try to cache dst_entries which would cause a redirect (cherry pick from commit df4d92549f23e1c037e83323aff58a21b3de7fe0) Not caching dst_entries which cause redirects could be exploited by hosts on the same subnet, causing a severe DoS attack. This effect aggravated since commit f88649721268999 ("ipv4: fix dst race in sk_dst_get()"). Lookups causing redirects will be allocated with DST_NOCACHE set which will force dst_release to free them via RCU. Unfortunately waiting for RCU grace period just takes too long, we can end up with >1M dst_entries waiting to be released and the system will run OOM. rcuos threads cannot catch up under high softirq load. Attaching the flag to emit a redirect later on to the specific skb allows us to cache those dst_entries thus reducing the pressure on allocation and deallocation. This issue was discovered by Marcelo Leitner. Cc: Julian Anastasov Signed-off-by: Marcelo Leitner Signed-off-by: Florian Westphal Signed-off-by: Hannes Frederic Sowa Signed-off-by: Julian Anastasov Signed-off-by: David S. 
Miller Change-Id: I2928da29b79a94b60023a1c19e464eece0f1dc7a Bug: 29506807 --- include/net/ip.h | 11 ++++++----- net/ipv4/ip_forward.c | 3 ++- net/ipv4/route.c | 9 +++++---- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 2646bf948a150e..888be4aa7a4224 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -39,11 +39,12 @@ struct inet_skb_parm { struct ip_options opt; /* Compiled IP options */ unsigned char flags; -#define IPSKB_FORWARDED 1 -#define IPSKB_XFRM_TUNNEL_SIZE 2 -#define IPSKB_XFRM_TRANSFORMED 4 -#define IPSKB_FRAG_COMPLETE 8 -#define IPSKB_REROUTED 16 +#define IPSKB_FORWARDED BIT(0) +#define IPSKB_XFRM_TUNNEL_SIZE BIT(1) +#define IPSKB_XFRM_TRANSFORMED BIT(2) +#define IPSKB_FRAG_COMPLETE BIT(3) +#define IPSKB_REROUTED BIT(4) +#define IPSKB_DOREDIRECT BIT(5) u16 frag_max_size; }; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3a83ce5efa80e3..787b3c294ce672 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -129,7 +129,8 @@ int ip_forward(struct sk_buff *skb) * We now generate an ICMP HOST REDIRECT giving the route * we calculated. */ - if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb)) + if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr && + !skb_sec_path(skb)) ip_rt_send_redirect(skb); skb->priority = rt_tos2priority(iph->tos); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f3b28ccbcd4f32..6ac45cc66cbe19 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1556,11 +1556,10 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && + skb->protocol == htons(ETH_P_IP) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { - flags |= RTCF_DOREDIRECT; - do_cache = false; - } + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + IPCB(skb)->flags |= IPSKB_DOREDIRECT; if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -2305,6 +2304,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; + if (IPCB(skb)->flags & IPSKB_DOREDIRECT) + r->rtm_flags |= RTCF_DOREDIRECT; if (nla_put_be32(skb, RTA_DST, dst)) goto nla_put_failure; From a0a6fa706d7c3439646301a0d2eeb07646364e39 Mon Sep 17 00:00:00 2001 From: "D.S. Ljungmark" Date: Mon, 11 Jul 2016 16:52:44 -0700 Subject: [PATCH 054/420] UPSTREAM: ipv6: Don't reduce hop limit for an interface (cherry pick from commit 6fd99094de2b83d1d4c8457f2c83483b2828e75a) A local route may have a lower hop_limit set than global routes do. RFC 3756, Section 4.2.7, "Parameter Spoofing" > 1. The attacker includes a Current Hop Limit of one or another > small > number which the attacker knows will cause legitimate packets to > be dropped before they reach their destination. > As an example, one possible approach to mitigate this threat is to > ignore very small hop limits. The nodes could implement a > configurable minimum hop limit, and ignore attempts to set it below > said limit. Signed-off-by: D.S. Ljungmark Acked-by: Hannes Frederic Sowa Signed-off-by: David S. 
Miller Change-Id: I24ee5723e4bcb3fbdbf4308531ab58e9ff215e82 Bug: 29409847 --- net/ipv6/ndisc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 96744effa1680b..1c34d758b0c218 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1217,7 +1217,14 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt) rt6_set_expires(rt, jiffies + (HZ * lifetime)); if (ra_msg->icmph.icmp6_hop_limit) { - in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + /* Only set hop_limit on the interface if it is higher than + * the current hop_limit. + */ + if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) { + in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + } else { + ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n"); + } if (rt) dst_metric_set(&rt->dst, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); From 3faa33e5b07b973aaf133b005eb5d5dd5d7232f2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Jul 2016 17:12:42 -0700 Subject: [PATCH 055/420] UPSTREAM: netfilter: x_tables: fix unconditional helper (cherry pick from commit 54d83fc74aa9ec72794373cb47432c5f7fb1a309) Ben Hawkes says: In the mark_source_chains function (net/ipv4/netfilter/ip_tables.c) it is possible for a user-supplied ipt_entry structure to have a large next_offset field. This field is not bounds checked prior to writing a counter value at the supplied offset. Problem is that mark_source_chains should not have been called -- the rule doesn't have a next entry, so its supposed to return an absolute verdict of either ACCEPT or DROP. However, the function conditional() doesn't work as the name implies. It only checks that the rule is using wildcard address matching. However, an unconditional rule must also not be using any matches (no -m args). The underflow validator only checked the addresses, therefore passing the 'unconditional absolute verdict' test, while mark_source_chains also tested for presence of matches, and thus proceeeded to the next (not-existent) rule. Unify this so that all the callers have same idea of 'unconditional rule'. Reported-by: Ben Hawkes Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: I47ec0713ac563ac244200c7b2c54f09a91aceabc Bug: 28940694 --- net/ipv4/netfilter/arp_tables.c | 18 +++++++++--------- net/ipv4/netfilter/ip_tables.c | 23 +++++++++++------------ net/ipv6/netfilter/ip6_tables.c | 23 +++++++++++------------ 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f95b6f93814b95..a12e24856cc7d6 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -355,11 +355,12 @@ unsigned int arpt_do_table(struct sk_buff *skb, } /* All zeroes == unconditional rule. */ -static inline bool unconditional(const struct arpt_arp *arp) +static inline bool unconditional(const struct arpt_entry *e) { static const struct arpt_arp uncond; - return memcmp(arp, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct arpt_entry) && + memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if @@ -398,11 +399,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo, |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct arpt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->arp)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -547,7 +547,7 @@ static bool check_underflow(const struct arpt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->arp)) + if (!unconditional(e)) return false; t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -588,9 +588,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 99e810f84671bb..4636fd3ff49f04 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -168,11 +168,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ipt_ip *ip) +static inline bool unconditional(const struct ipt_entry *e) { static const struct ipt_ip uncond; - return memcmp(ip, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ipt_entry) && + memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; #undef FWINV } @@ -229,11 +230,10 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ipt_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ip)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] @@ -472,11 +472,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct ipt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->ip)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -709,7 +708,7 @@ static bool check_underflow(const struct ipt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ip)) + if (!unconditional(e)) return false; t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -751,9 +750,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e080fbbbc0e5ce..415f1f027374af 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -195,11 +195,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ip6t_ip6 *ipv6) +static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; - return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ip6t_entry) && + memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * @@ -255,11 +256,10 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ip6t_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ipv6)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP6_TRACE_COMMENT_POLICY] @@ -482,11 +482,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct ip6t_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && - unconditional(&e->ipv6)) || visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -720,7 +719,7 @@ static bool check_underflow(const struct ip6t_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ipv6)) + if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -762,9 +761,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; From 2627cc9ff5cbc6c6bae8cdd7f870132d9b029805 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jan 2016 09:35:51 -0800 Subject: [PATCH 056/420] BACKPORT: ipv6: tcp: add rcu locking in tcp_v6_send_synack() (cherry pick from commit 3e4006f0b86a5ae5eb0e8215f9a9e1db24506977) When first SYNACK is sent, we already hold rcu_read_lock(), but this is not true if a SYNACK is retransmitted, as a timer (soft) interrupt does not hold rcu_read_lock() Fixes: 45f6fad84cc30 ("ipv6: add complete rcu protection around np->opt") Reported-by: Dave Jones Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Fixes: Change-Id: I9654ba44a4a710a08f8b23bfd8205d205a95607c ("BACKPORT: ipv6: add complete rcu protection around np->opt") Signed-off-by: Amit Pundir --- net/ipv6/tcp_ipv6.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 82344190de0021..3e1466522055ca 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -504,8 +504,10 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); skb_set_queue_mapping(skb, queue_mapping); + rcu_read_lock(); err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass); + rcu_read_unlock(); err = net_xmit_eval(err); } From 0433cb108bc229a51dc63e411f3f9e34158b5ca6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 23 Mar 2016 16:38:55 +0100 Subject: [PATCH 057/420] UPSTREAM: ppp: take reference on channels netns (cherry pick from commit 1f461dcdd296eecedaffffc6bae2bfa90bd7eb89) Let channels hold a reference on their network namespace. Some channel types, like ppp_async and ppp_synctty, can have their userspace controller running in a different namespace. Therefore they can't rely on them to preclude their netns from being removed from under them. 
================================================================== BUG: KASAN: use-after-free in ppp_unregister_channel+0x372/0x3a0 at addr ffff880064e217e0 Read of size 8 by task syz-executor/11581 ============================================================================= BUG net_namespace (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in copy_net_ns+0x6b/0x1a0 age=92569 cpu=3 pid=6906 [< none >] ___slab_alloc+0x4c7/0x500 kernel/mm/slub.c:2440 [< none >] __slab_alloc+0x4c/0x90 kernel/mm/slub.c:2469 [< inline >] slab_alloc_node kernel/mm/slub.c:2532 [< inline >] slab_alloc kernel/mm/slub.c:2574 [< none >] kmem_cache_alloc+0x23a/0x2b0 kernel/mm/slub.c:2579 [< inline >] kmem_cache_zalloc kernel/include/linux/slab.h:597 [< inline >] net_alloc kernel/net/core/net_namespace.c:325 [< none >] copy_net_ns+0x6b/0x1a0 kernel/net/core/net_namespace.c:360 [< none >] create_new_namespaces+0x2f6/0x610 kernel/kernel/nsproxy.c:95 [< none >] copy_namespaces+0x297/0x320 kernel/kernel/nsproxy.c:150 [< none >] copy_process.part.35+0x1bf4/0x5760 kernel/kernel/fork.c:1451 [< inline >] copy_process kernel/kernel/fork.c:1274 [< none >] _do_fork+0x1bc/0xcb0 kernel/kernel/fork.c:1723 [< inline >] SYSC_clone kernel/kernel/fork.c:1832 [< none >] SyS_clone+0x37/0x50 kernel/kernel/fork.c:1826 [< none >] entry_SYSCALL_64_fastpath+0x16/0x7a kernel/arch/x86/entry/entry_64.S:185 INFO: Freed in net_drop_ns+0x67/0x80 age=575 cpu=2 pid=2631 [< none >] __slab_free+0x1fc/0x320 kernel/mm/slub.c:2650 [< inline >] slab_free kernel/mm/slub.c:2805 [< none >] kmem_cache_free+0x2a0/0x330 kernel/mm/slub.c:2814 [< inline >] net_free kernel/net/core/net_namespace.c:341 [< none >] net_drop_ns+0x67/0x80 kernel/net/core/net_namespace.c:348 [< none >] cleanup_net+0x4e5/0x600 kernel/net/core/net_namespace.c:448 [< none >] process_one_work+0x794/0x1440 kernel/kernel/workqueue.c:2036 [< none >] worker_thread+0xdb/0xfc0 kernel/kernel/workqueue.c:2170 [< none >] kthread+0x23f/0x2d0 kernel/drivers/block/aoe/aoecmd.c:1303 [< none >] ret_from_fork+0x3f/0x70 kernel/arch/x86/entry/entry_64.S:468 INFO: Slab 0xffffea0001938800 objects=3 used=0 fp=0xffff880064e20000 flags=0x5fffc0000004080 INFO: Object 0xffff880064e20000 @offset=0 fp=0xffff880064e24200 CPU: 1 PID: 11581 Comm: syz-executor Tainted: G B 4.4.0+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014 00000000ffffffff ffff8800662c7790 ffffffff8292049d ffff88003e36a300 ffff880064e20000 ffff880064e20000 ffff8800662c77c0 ffffffff816f2054 ffff88003e36a300 ffffea0001938800 ffff880064e20000 0000000000000000 Call Trace: [< inline >] __dump_stack kernel/lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 kernel/lib/dump_stack.c:50 [] print_trailer+0xf4/0x150 kernel/mm/slub.c:654 [] object_err+0x2f/0x40 kernel/mm/slub.c:661 [< inline >] print_address_description kernel/mm/kasan/report.c:138 [] kasan_report_error+0x215/0x530 kernel/mm/kasan/report.c:236 [< inline >] kasan_report kernel/mm/kasan/report.c:259 [] __asan_report_load8_noabort+0x3e/0x40 kernel/mm/kasan/report.c:280 [< inline >] ? ppp_pernet kernel/include/linux/compiler.h:218 [] ? ppp_unregister_channel+0x372/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [< inline >] ppp_pernet kernel/include/linux/compiler.h:218 [] ppp_unregister_channel+0x372/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [< inline >] ? 
ppp_pernet kernel/drivers/net/ppp/ppp_generic.c:293 [] ? ppp_unregister_channel+0xe6/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [] ppp_asynctty_close+0xa3/0x130 kernel/drivers/net/ppp/ppp_async.c:241 [] ? async_lcp_peek+0x5b0/0x5b0 kernel/drivers/net/ppp/ppp_async.c:1000 [] tty_ldisc_close.isra.1+0x99/0xe0 kernel/drivers/tty/tty_ldisc.c:478 [] tty_ldisc_kill+0x40/0x170 kernel/drivers/tty/tty_ldisc.c:744 [] tty_ldisc_release+0x1b3/0x260 kernel/drivers/tty/tty_ldisc.c:772 [] tty_release+0xac1/0x13e0 kernel/drivers/tty/tty_io.c:1901 [] ? release_tty+0x320/0x320 kernel/drivers/tty/tty_io.c:1688 [] __fput+0x236/0x780 kernel/fs/file_table.c:208 [] ____fput+0x15/0x20 kernel/fs/file_table.c:244 [] task_work_run+0x16b/0x200 kernel/kernel/task_work.c:115 [< inline >] exit_task_work kernel/include/linux/task_work.h:21 [] do_exit+0x8b5/0x2c60 kernel/kernel/exit.c:750 [] ? debug_check_no_locks_freed+0x290/0x290 kernel/kernel/locking/lockdep.c:4123 [] ? mm_update_next_owner+0x6f0/0x6f0 kernel/kernel/exit.c:357 [] ? __dequeue_signal+0x136/0x470 kernel/kernel/signal.c:550 [] ? recalc_sigpending_tsk+0x13b/0x180 kernel/kernel/signal.c:145 [] do_group_exit+0x108/0x330 kernel/kernel/exit.c:880 [] get_signal+0x5e4/0x14f0 kernel/kernel/signal.c:2307 [< inline >] ? kretprobe_table_lock kernel/kernel/kprobes.c:1113 [] ? kprobe_flush_task+0xb5/0x450 kernel/kernel/kprobes.c:1158 [] do_signal+0x83/0x1c90 kernel/arch/x86/kernel/signal.c:712 [] ? recycle_rp_inst+0x310/0x310 kernel/include/linux/list.h:655 [] ? setup_sigcontext+0x780/0x780 kernel/arch/x86/kernel/signal.c:165 [] ? finish_task_switch+0x424/0x5f0 kernel/kernel/sched/core.c:2692 [< inline >] ? finish_lock_switch kernel/kernel/sched/sched.h:1099 [] ? finish_task_switch+0x120/0x5f0 kernel/kernel/sched/core.c:2678 [< inline >] ? context_switch kernel/kernel/sched/core.c:2807 [] ? __schedule+0x919/0x1bd0 kernel/kernel/sched/core.c:3283 [] exit_to_usermode_loop+0xf1/0x1a0 kernel/arch/x86/entry/common.c:247 [< inline >] prepare_exit_to_usermode kernel/arch/x86/entry/common.c:282 [] syscall_return_slowpath+0x19f/0x210 kernel/arch/x86/entry/common.c:344 [] int_ret_from_sys_call+0x25/0x9f kernel/arch/x86/entry/entry_64.S:281 Memory state around the buggy address: ffff880064e21680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880064e21700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff880064e21780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff880064e21800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880064e21880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Fixes: 273ec51dd7ce ("net: ppp_generic - introduce net-namespace functionality v2") Reported-by: Baozeng Ding Signed-off-by: Guillaume Nault Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. 
Miller Change-Id: Iee0015eca5bd181954bb4896a3720f7549c5ed0b Bug: 28979703 --- drivers/net/ppp/ppp_generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 794a4732936883..bb9fb11031348c 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2243,7 +2243,7 @@ int ppp_register_net_channel(struct net *net, struct ppp_channel *chan) pch->ppp = NULL; pch->chan = chan; - pch->chan_net = net; + pch->chan_net = get_net(net); chan->ppp = pch; init_ppp_file(&pch->file, CHANNEL); pch->file.hdrlen = chan->hdrlen; @@ -2340,6 +2340,8 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); + put_net(pch->chan_net); + pch->chan_net = NULL; pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); From 987acf61ca65f7daed2cdbc8d0bfecd406d15666 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 Feb 2016 11:03:12 +0000 Subject: [PATCH 058/420] UPSTREAM: KEYS: Fix ASN.1 indefinite length object parsing (cherry pick from commit 23c8a812dc3c621009e4f0e5342aa4e2ede1ceaa) This fixes CVE-2016-0758. In the ASN.1 decoder, when the length field of an ASN.1 value is extracted, it isn't validated against the remaining amount of data before being added to the cursor. With a sufficiently large size indicated, the check: datalen - dp < 2 may then fail due to integer overflow. Fix this by checking the length indicated against the amount of remaining data in both places a definite length is determined. Whilst we're at it, make the following changes: (1) Check the maximum size of extended length does not exceed the capacity of the variable it's being stored in (len) rather than the type that variable is assumed to be (size_t). (2) Compare the EOC tag to the symbolic constant ASN1_EOC rather than the integer 0. (3) To reduce confusion, move the initialisation of len outside of: for (len = 0; n > 0; n--) { since it doesn't have anything to do with the loop counter n. Signed-off-by: David Howells Reviewed-by: Mimi Zohar Acked-by: David Woodhouse Acked-by: Peter Jones Change-Id: If760bc3b8ab0e59fefc24fa687514324348fb8e8 Bug: 29814470 --- lib/asn1_decoder.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index d60ce8a5365036..806c5b6b4b3a07 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -69,7 +69,7 @@ static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen /* Extract a tag from the data */ tag = data[dp++]; - if (tag == 0) { + if (tag == ASN1_EOC) { /* It appears to be an EOC. 
*/ if (data[dp++] != 0) goto invalid_eoc; @@ -91,10 +91,8 @@ static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen /* Extract the length */ len = data[dp++]; - if (len <= 0x7f) { - dp += len; - goto next_tag; - } + if (len <= 0x7f) + goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ @@ -105,14 +103,18 @@ static int asn1_find_indefinite_length(const unsigned char *data, size_t datalen } n = len - 0x80; - if (unlikely(n > sizeof(size_t) - 1)) + if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; - for (len = 0; n > 0; n--) { + len = 0; + for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } +check_length: + if (len > datalen - dp) + goto data_overrun_error; dp += len; goto next_tag; From fe182ffd23b2db9ab321acb691212e7eec0383c5 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Fri, 20 Nov 2015 22:07:23 +0000 Subject: [PATCH 059/420] UPSTREAM: unix: avoid use-after-free in ep_remove_wait_queue (cherry picked from commit 7d267278a9ece963d77eefec61630223fce08c6c) Rainer Weikusat writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron Signed-off-by: David S. 
Miller Change-Id: Ia374ee061195088f8c777940baa75cedbe897f4e Bug: 29119002 --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 183 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 165 insertions(+), 19 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index a175ba4a7adbc6..cb4a7284e65612 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -63,6 +63,7 @@ struct unix_sock { #define UNIX_GC_CANDIDATE 0 #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; + wait_queue_t peer_wake; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8232118b3f82d6..b11562ed376c7c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -316,6 +316,118 @@ static struct sock *unix_find_socket_byinode(struct inode *i) return s; } +/* Support code for asymmetrically connected dgram sockets + * + * If a datagram socket is connected to a socket not itself connected + * to the first socket (eg, /dev/log), clients may only enqueue more + * messages if the present receive queue of the server socket is not + * "too large". This means there's a second writeability condition + * poll and sendmsg need to test. The dgram recv code will do a wake + * up on the peer_wait wait queue of a socket upon reception of a + * datagram which needs to be propagated to sleeping would-be writers + * since these might not have sent anything so far. This can't be + * accomplished via poll_wait because the lifetime of the server + * socket might be less than that of its clients if these break their + * association with it or if the server socket is closed while clients + * are still connected to it and there's no way to inform "a polling + * implementation" that it should let go of a certain wait queue + * + * In order to propagate a wake up, a wait_queue_t of the client + * socket is enqueued on the peer_wait queue of the server socket + * whose wake function does a wake_up on the ordinary client socket + * wait queue. This connection is established whenever a write (or + * poll for write) hit the flow control condition and broken when the + * association to the server socket is dissolved or after a wake up + * was relayed. 
+ */ + +static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags, + void *key) +{ + struct unix_sock *u; + wait_queue_head_t *u_sleep; + + u = container_of(q, struct unix_sock, peer_wake); + + __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, + q); + u->peer_wake.private = NULL; + + /* relaying can only happen while the wq still exists */ + u_sleep = sk_sleep(&u->sk); + if (u_sleep) + wake_up_interruptible_poll(u_sleep, key); + + return 0; +} + +static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) +{ + struct unix_sock *u, *u_other; + int rc; + + u = unix_sk(sk); + u_other = unix_sk(other); + rc = 0; + spin_lock(&u_other->peer_wait.lock); + + if (!u->peer_wake.private) { + u->peer_wake.private = other; + __add_wait_queue(&u_other->peer_wait, &u->peer_wake); + + rc = 1; + } + + spin_unlock(&u_other->peer_wait.lock); + return rc; +} + +static void unix_dgram_peer_wake_disconnect(struct sock *sk, + struct sock *other) +{ + struct unix_sock *u, *u_other; + + u = unix_sk(sk); + u_other = unix_sk(other); + spin_lock(&u_other->peer_wait.lock); + + if (u->peer_wake.private == other) { + __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); + u->peer_wake.private = NULL; + } + + spin_unlock(&u_other->peer_wait.lock); +} + +static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, + struct sock *other) +{ + unix_dgram_peer_wake_disconnect(sk, other); + wake_up_interruptible_poll(sk_sleep(sk), + POLLOUT | + POLLWRNORM | + POLLWRBAND); +} + +/* preconditions: + * - unix_peer(sk) == other + * - association is stable + */ +static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) +{ + int connected; + + connected = unix_dgram_peer_wake_connect(sk, other); + + if (unix_recvq_full(other)) + return 1; + + if (connected) + unix_dgram_peer_wake_disconnect(sk, other); + + return 0; +} + static inline int unix_writable(struct sock *sk) { return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; @@ -420,6 +532,8 @@ static void unix_release_sock(struct sock *sk, int embrion) skpair->sk_state_change(skpair); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); } + + unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; } @@ -653,6 +767,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) INIT_LIST_HEAD(&u->link); mutex_init(&u->readlock); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); + init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) @@ -1020,6 +1135,8 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; + unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); + unix_state_double_unlock(sk, other); if (other != old_peer) @@ -1459,6 +1576,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, struct scm_cookie tmp_scm; int max_level; int data_len = 0; + int sk_locked; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1540,12 +1658,14 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out_free; } + sk_locked = 0; unix_state_lock(other); +restart_locked: err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; - if (sock_flag(other, SOCK_DEAD)) { + if (unlikely(sock_flag(other, SOCK_DEAD))) { /* * Check with 1003.1g - what should * datagram error @@ -1553,10 +1673,14 @@ static int 
unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, unix_state_unlock(other); sock_put(other); + if (!sk_locked) + unix_state_lock(sk); + err = 0; - unix_state_lock(sk); if (unix_peer(sk) == other) { unix_peer(sk) = NULL; + unix_dgram_peer_wake_disconnect_wakeup(sk, other); + unix_state_unlock(sk); unix_dgram_disconnected(sk, other); @@ -1582,21 +1706,38 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out_unlock; } - if (unix_peer(other) != sk && unix_recvq_full(other)) { - if (!timeo) { - err = -EAGAIN; - goto out_unlock; + if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + if (timeo) { + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out_free; + + goto restart; } - timeo = unix_wait_for_peer(other, timeo); + if (!sk_locked) { + unix_state_unlock(other); + unix_state_double_lock(sk, other); + } - err = sock_intr_errno(timeo); - if (signal_pending(current)) - goto out_free; + if (unix_peer(sk) != other || + unix_dgram_peer_wake_me(sk, other)) { + err = -EAGAIN; + sk_locked = 1; + goto out_unlock; + } - goto restart; + if (!sk_locked) { + sk_locked = 1; + goto restart_locked; + } } + if (unlikely(sk_locked)) + unix_state_unlock(sk); + if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); @@ -1610,6 +1751,8 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, return len; out_unlock: + if (sk_locked) + unix_state_unlock(sk); unix_state_unlock(other); out_free: kfree_skb(skb); @@ -2255,14 +2398,16 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, return mask; writable = unix_writable(sk); - other = unix_peer_get(sk); - if (other) { - if (unix_peer(other) != sk) { - sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); - if (unix_recvq_full(other)) - writable = 0; - } - sock_put(other); + if (writable) { + unix_state_lock(sk); + + other = unix_peer(sk); + if (other && unix_peer(other) != sk && + unix_recvq_full(other) && + unix_dgram_peer_wake_me(sk, other)) + writable = 0; + + unix_state_unlock(sk); } if (writable) From a388b1dbb0a1a45e4511ca3825937c4a8a53cb6b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Jun 2016 15:48:57 +0100 Subject: [PATCH 060/420] UPSTREAM: KEYS: potential uninitialized variable (cherry picked from commit 38327424b40bcebe2de92d07312c89360ac9229a) If __key_link_begin() failed then "edit" would be uninitialized. I've added a check to fix that. This allows a random user to crash the kernel, though it's quite difficult to achieve. There are three ways it can be done as the user would have to cause an error to occur in __key_link(): (1) Cause the kernel to run out of memory. In practice, this is difficult to achieve without ENOMEM cropping up elsewhere and aborting the attempt. (2) Revoke the destination keyring between the keyring ID being looked up and it being tested for revocation. In practice, this is difficult to time correctly because the KEYCTL_REJECT function can only be used from the request-key upcall process. Further, users can only make use of what's in /sbin/request-key.conf, though this does including a rejection debugging test - which means that the destination keyring has to be the caller's session keyring in practice. 
(3) Have just enough key quota available to create a key, a new session keyring for the upcall and a link in the session keyring, but not then sufficient quota to create a link in the nominated destination keyring so that it fails with EDQUOT. The bug can be triggered using option (3) above using something like the following: echo 80 >/proc/sys/kernel/keys/root_maxbytes keyctl request2 user debug:fred negate @t The above sets the quota to something much lower (80) to make the bug easier to trigger, but this is dependent on the system. Note also that the name of the keyring created contains a random number that may be between 1 and 10 characters in size, so may throw the test off by changing the amount of quota used. Assuming the failure occurs, something like the following will be seen: kfree_debugcheck: out of range ptr 6b6b6b6b6b6b6b68h ------------[ cut here ]------------ kernel BUG at ../mm/slab.c:2821! ... RIP: 0010:[] kfree_debugcheck+0x20/0x25 RSP: 0018:ffff8804014a7de8 EFLAGS: 00010092 RAX: 0000000000000034 RBX: 6b6b6b6b6b6b6b68 RCX: 0000000000000000 RDX: 0000000000040001 RSI: 00000000000000f6 RDI: 0000000000000300 RBP: ffff8804014a7df0 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8804014a7e68 R11: 0000000000000054 R12: 0000000000000202 R13: ffffffff81318a66 R14: 0000000000000000 R15: 0000000000000001 ... Call Trace: kfree+0xde/0x1bc assoc_array_cancel_edit+0x1f/0x36 __key_link_end+0x55/0x63 key_reject_and_link+0x124/0x155 keyctl_reject_key+0xb6/0xe0 keyctl_negate_key+0x10/0x12 SyS_keyctl+0x9f/0xe7 do_syscall_64+0x63/0x13a entry_SYSCALL64_slow_path+0x25/0x25 Fixes: f70e2e06196a ('KEYS: Do preallocation for __key_link()') Signed-off-by: Dan Carpenter Signed-off-by: David Howells cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: Ia9616cce142a616beea0ef20bde49129939d2d2d Bug: 29823941 --- security/keys/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/key.c b/security/keys/key.c index e17ba6aefdc082..f8bde20bed5da4 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -580,7 +580,7 @@ int key_reject_and_link(struct key *key, mutex_unlock(&key_construction_mutex); - if (keyring) + if (keyring && link_ret == 0) __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ From 7ed1e120e1cc31bea816709c25ebb80203ce9f1b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:49 +0100 Subject: [PATCH 061/420] UPSTREAM: netfilter: x_tables: validate e->target_offset early (cherry pick from commit bdf533de6968e9686df777dc178486f600c6e617) We should check that e->target_offset is sane before mark_source_chains gets called since it will fetch the target entry for loop detection. 
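To make the failure mode concrete: a rule's target is located purely by pointer arithmetic on e->target_offset, so if the offset is not validated before mark_source_chains() runs, the loop-detection code dereferences a struct xt_entry_target at an attacker-chosen offset. The sketch below illustrates that arithmetic and the bound check_entry() enforces (the patch moves the existing check earlier rather than adding a new one); it is illustrative only, with the example_* names as placeholders.

#include <linux/netfilter_ipv4/ip_tables.h>

/* How the target is resolved -- nothing here checks the offset. */
static struct xt_entry_target *example_get_target(struct ipt_entry *e)
{
        return (struct xt_entry_target *)((char *)e + e->target_offset);
}

/* The guard that must run first: the target header has to fit between
 * target_offset and next_offset, otherwise the offsets are bogus. */
static int example_validate_offsets(const struct ipt_entry *e)
{
        if (e->target_offset + sizeof(struct xt_entry_target) >
            e->next_offset)
                return -EINVAL;
        return 0;
}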
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: Ic2dbc31c9525d698e94d4d8875886acf3524abbd Bug: 29637687 --- net/ipv4/netfilter/arp_tables.c | 17 ++++++++--------- net/ipv4/netfilter/ip_tables.c | 17 ++++++++--------- net/ipv6/netfilter/ip6_tables.c | 17 ++++++++--------- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index a12e24856cc7d6..c529acf2a298c7 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -470,14 +470,12 @@ static int mark_source_chains(const struct xt_table_info *newinfo, return 1; } -static inline int check_entry(const struct arpt_entry *e, const char *name) +static inline int check_entry(const struct arpt_entry *e) { const struct xt_entry_target *t; - if (!arp_checkentry(&e->arp)) { - duprintf("arp_tables: arp check failed %p %s.\n", e, name); + if (!arp_checkentry(&e->arp)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -518,10 +516,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) struct xt_target *target; int ret; - ret = check_entry(e, name); - if (ret) - return ret; - t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); @@ -566,6 +560,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { @@ -580,6 +575,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1237,7 +1236,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct arpt_entry *)e, name); + ret = check_entry((struct arpt_entry *)e); if (ret) return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4636fd3ff49f04..29d87128f9d2ea 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -564,14 +564,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ipt_entry *e, const char *name) +check_entry(const struct ipt_entry *e) { const struct xt_entry_target *t; - if (!ip_checkentry(&e->ip)) { - duprintf("ip check failed %p %s.\n", e, name); + if (!ip_checkentry(&e->ip)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -661,10 +659,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - j = 0; mtpar.net = net; mtpar.table = name; @@ -728,6 +722,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { @@ -742,6 +737,10 @@ check_entry_size_and_hooks(struct ipt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1502,7 +1501,7 @@ 
check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ipt_entry *)e, name); + ret = check_entry((struct ipt_entry *)e); if (ret) return ret; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 415f1f027374af..de04f18bd90413 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -574,14 +574,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ip6t_entry *e, const char *name) +check_entry(const struct ip6t_entry *e) { const struct xt_entry_target *t; - if (!ip6_checkentry(&e->ipv6)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); + if (!ip6_checkentry(&e->ipv6)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -672,10 +670,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - j = 0; mtpar.net = net; mtpar.table = name; @@ -739,6 +733,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { @@ -753,6 +748,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1514,7 +1513,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ip6t_entry *)e, name); + ret = check_entry((struct ip6t_entry *)e); if (ret) return ret; From 2901499e25f2144cd15a5faeaab28e46d66f5bca Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:50 +0100 Subject: [PATCH 062/420] UPSTREAM: netfilter: x_tables: make sure e->next_offset covers remaining blob size (cherry pick from commit 6e94e0cfb0887e4013b3b930fa6ab1fe6bb6ba91) Otherwise this function may read data beyond the ruleset blob. 
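The bound matters because the ruleset blob is walked by repeatedly adding e->next_offset, so a single oversized next_offset pushes every subsequent access past the end of the user-supplied buffer. Below is a rough, illustrative walker showing where the new limit check slots in; the example_* names are placeholders and the real code uses the xt_entry_foreach() helpers rather than an open-coded loop.

#include <linux/netfilter_ipv4/ip_tables.h>

static void example_walk(unsigned char *base, unsigned int total_size)
{
        unsigned char *limit = base + total_size;
        unsigned char *p = base;
        struct ipt_entry *e;

        while (p < limit) {
                e = (struct ipt_entry *)p;

                /* The entry header itself must fit... */
                if (p + sizeof(struct ipt_entry) >= limit)
                        return;
                /* ...its claimed size must be at least a header... */
                if (e->next_offset < sizeof(struct ipt_entry))
                        return;
                /* ...and (the new check) it must not run past the blob. */
                if (p + e->next_offset > limit)
                        return;

                /* per-entry validation of matches and target goes here */

                p += e->next_offset;
        }
}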
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: I9d19ecf3e00a2d52817b35b9042623927895c005 Bug: 29637687 --- net/ipv4/netfilter/arp_tables.c | 6 ++++-- net/ipv4/netfilter/ip_tables.c | 6 ++++-- net/ipv6/netfilter/ip6_tables.c | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c529acf2a298c7..738e62d548ce7b 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -563,7 +563,8 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1223,7 +1224,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 29d87128f9d2ea..2c8fb724dde5da 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -725,7 +725,8 @@ check_entry_size_and_hooks(struct ipt_entry *e, int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1488,7 +1489,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index de04f18bd90413..bc0615b1cf63e3 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -736,7 +736,8 @@ check_entry_size_and_hooks(struct ip6t_entry *e, int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1500,7 +1501,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } From 9fcf139bb23d2fa526565e2c68517d0816609ea7 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Fri, 23 Oct 2015 15:13:42 -0700 Subject: [PATCH 063/420] 
CHROMIUM: android: binder: Fix potential scheduling-while-atomic (cherry picked from commit 166b45af97359159f9585a836c9849e725e31fd6) Commit f1e7f0a724f6 ("android: binder: Disable preemption while holding the global binder lock.") re-enabled preemption around most of the sites where calls to potentially sleeping functions were made, but missed __alloc_fd(), which can sleep if the fdtable needs to be resized. Re-enable preemption around __alloc_fd() as well as __fd_install() which can now sleep in upstream kernels as of commit 8a81252b774b ("fs/file.c: don't acquire files->file_lock in fd_install()"). BUG=chrome-os-partner:44012 TEST=Build and boot on Smaug. Change-Id: I9819c4b95876f697e75b1b84810b6c520d9c33ec Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/308582 Reviewed-by: Stephen Barber Reviewed-by: Riley Andrews Bug: 30141999 --- drivers/staging/android/binder.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 183748919ba6cc..679dbbd7091094 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -375,6 +375,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; + int ret; if (files == NULL) return -ESRCH; @@ -385,7 +386,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - return __alloc_fd(files, 0, rlim_cur, flags); + preempt_enable_no_resched(); + ret = __alloc_fd(files, 0, rlim_cur, flags); + preempt_disable(); + + return ret; } /* @@ -394,8 +399,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) + if (proc->files) { + preempt_enable_no_resched(); __fd_install(proc->files, fd, file); + preempt_disable(); + } } /* From 47a5d7f77233594efb1f18d14fbc22b98c41491b Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Mon, 18 Jul 2016 22:20:15 +0000 Subject: [PATCH 064/420] Revert "CHROMIUM: android: binder: Fix potential scheduling-while-atomic" This reverts commit 9fcf139bb23d2fa526565e2c68517d0816609ea7. 
Change-Id: Ie94f8330db4b78af0a5123408c22908e7fcf86ef --- drivers/staging/android/binder.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 679dbbd7091094..183748919ba6cc 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -375,7 +375,6 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; - int ret; if (files == NULL) return -ESRCH; @@ -386,11 +385,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - preempt_enable_no_resched(); - ret = __alloc_fd(files, 0, rlim_cur, flags); - preempt_disable(); - - return ret; + return __alloc_fd(files, 0, rlim_cur, flags); } /* @@ -399,11 +394,8 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) { - preempt_enable_no_resched(); + if (proc->files) __fd_install(proc->files, fd, file); - preempt_disable(); - } } /* From da758e20ec019454306aea1442341bbe42990127 Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Fri, 5 Jun 2015 18:59:29 -0700 Subject: [PATCH 065/420] cpuset: Add allow_attach hook for cpusets on android. This patch provides a allow_attach hook for cpusets, which resolves lots of the following logcat noise. W SchedPolicy: add_tid_to_cgroup failed to write '2816' (Permission denied); fd=29 W ActivityManager: Failed setting process group of 2816 to 0 W System.err: java.lang.IllegalArgumentException W System.err: at android.os.Process.setProcessGroup(Native Method) W System.err: at com.android.server.am.ActivityManagerService.applyOomAdjLocked(ActivityManagerService.java:18763) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19028) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19106) W System.err: at com.android.server.am.ActiveServices.serviceDoneExecutingLocked(ActiveServices.java:2015) W System.err: at com.android.server.am.ActiveServices.publishServiceLocked(ActiveServices.java:905) W System.err: at com.android.server.am.ActivityManagerService.publishService(ActivityManagerService.java:16065) W System.err: at android.app.ActivityManagerNative.onTransact(ActivityManagerNative.java:1007) W System.err: at com.android.server.am.ActivityManagerService.onTransact(ActivityManagerService.java:2493) W System.err: at android.os.Binder.execTransact(Binder.java:453) Change-Id: Ic1b61b2bbb7ce74c9e9422b5e22ee9078251de21 [Ported to 4.4, added commit message] Signed-off-by: John Stultz --- kernel/cpuset.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1f107c74087bc5..dadf32727da305 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2046,12 +2046,30 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +static int cpuset_allow_attach(struct cgroup_taskset *tset) +{ + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(task, css, tset) { + tcred = __task_cred(task); + + if ((current != task) && !capable(CAP_SYS_ADMIN) && + cred->euid.val != 
tcred->uid.val && cred->euid.val != tcred->suid.val) + return -EACCES; + } + + return 0; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .bind = cpuset_bind, From 03d7df802473b5b7c783ed6ad6d0144ca94f1c42 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Jul 2016 11:10:39 -0700 Subject: [PATCH 066/420] cpuset: Fix allow_attach hook for cpusets on android. Change-Id: Idc7e8c998d9b0ca76eba2aad064f69deac7aa4ee Signed-off-by: Dmitry Shmidt --- kernel/cpuset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dadf32727da305..cc319d0f04cce1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2046,13 +2046,13 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } -static int cpuset_allow_attach(struct cgroup_taskset *tset) +static int cpuset_allow_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { const struct cred *cred = current_cred(), *tcred; struct task_struct *task; - struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { tcred = __task_cred(task); if ((current != task) && !capable(CAP_SYS_ADMIN) && From 970a05e8c52ef8802a73077fba1bcc252924246b Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 5 Jul 2016 22:12:36 -0700 Subject: [PATCH 067/420] UPSTREAM: ppp: defer netns reference release for ppp channel (cherry pick from commit 205e1e255c479f3fd77446415706463b282f94e4) Matt reported that we have a NULL pointer dereference in ppp_pernet() from ppp_connect_channel(), i.e. pch->chan_net is NULL. This is due to that a parallel ppp_unregister_channel() could happen while we are in ppp_connect_channel(), during which pch->chan_net set to NULL. Since we need a reference to net per channel, it makes sense to sync the refcnt with the life time of the channel, therefore we should release this reference when we destroy it. Fixes: 1f461dcdd296 ("ppp: take reference on channels netns") Reported-by: Matt Bennett Cc: Paul Mackerras Cc: linux-ppp@vger.kernel.org Cc: Guillaume Nault Cc: Cyrill Gorcunov Signed-off-by: Cong Wang Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. 
Miller Fixes: Change-Id: Iee0015eca5bd181954bb4896a3720f7549c5ed0b ("UPSTREAM: ppp: take reference on channels netns") Signed-off-by: Amit Pundir Change-Id: I24d0bb6f349ab3829f63cfe935ed97b6913a3508 --- drivers/net/ppp/ppp_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index bb9fb11031348c..75e55d7f52acec 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2340,8 +2340,6 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); - put_net(pch->chan_net); - pch->chan_net = NULL; pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); @@ -2958,6 +2956,9 @@ ppp_disconnect_channel(struct channel *pch) */ static void ppp_destroy_channel(struct channel *pch) { + put_net(pch->chan_net); + pch->chan_net = NULL; + atomic_dec(&channel_count); if (!pch->file.dead) { From 39f7ca231968cc3e544a0ff83fec607038e3781b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:05 +0200 Subject: [PATCH 068/420] UPSTREAM: proc: prevent stacking filesystems on top (cherry picked from commit e54ad7f1ee263ffa5a2de9c609d58dfa27b21cd9) This prevents stacking filesystems (ecryptfs and overlayfs) from using procfs as lower filesystem. There is too much magic going on inside procfs, and there is no good reason to stack stuff on top of procfs. (For example, procfs does access checks in VFS open handlers, and ecryptfs by design calls open handlers from a kernel thread that doesn't drop privileges or so.) Signed-off-by: Jann Horn Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: Ib050ef9dc10e623589d22e3a9e6aee9ee4f0cd5d Bug: 29444228 --- fs/proc/root.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/proc/root.c b/fs/proc/root.c index 094e44d4a6be6a..ed47f4cda85df0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -124,6 +124,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); From 8245aa0d71536ecedeb04a4b599a18769a66ead9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:06 +0200 Subject: [PATCH 069/420] UPSTREAM: ecryptfs: forbid opening files without mmap handler (cherry picked from commit 2f36db71009304b3f0b95afacd8eba1f9f046b87) This prevents users from triggering a stack overflow through a recursive invocation of pagefault handling that involves mapping procfs files into virtual memory. Signed-off-by: Jann Horn Acked-by: Tyler Hicks Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: I0be77c7f8bd3046bc34cd87ef577529792d479bc Bug: 29444228 --- fs/ecryptfs/kthread.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index f1ea610362c6c1..9b661a4ccee739 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -147,7 +148,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(lower_dentry->d_inode) ? 
O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto out; + goto have_file; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -165,8 +166,16 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) + if (IS_ERR(*lower_file)) { rc = PTR_ERR(*lower_file); + goto out; + } +have_file: + if ((*lower_file)->f_op->mmap == NULL) { + fput(*lower_file); + *lower_file = NULL; + rc = -EMEDIUMTYPE; + } out: return rc; } From fd92a361cb4be3bff7c4bc269b216dbfc6d0d6ab Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:07 +0200 Subject: [PATCH 070/420] UPSTREAM: sched: panic on corrupted stack end (cherry pick from commit 29d6455178a09e1dc340380c582b13356227e8df) Until now, hitting this BUG_ON caused a recursive oops (because oops handling involves do_exit(), which calls into the scheduler, which in turn raises an oops), which caused stuff below the stack to be overwritten until a panic happened (e.g. via an oops in interrupt context, caused by the overwritten CPU index in the thread_info). Just panic directly. Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds Change-Id: Ia3acb3f747f7a58ec2d071644433b0591925969f Bug: 29444228 --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1a898e6d5d73b..be1bcd6b93251f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2670,7 +2670,8 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(unlikely(task_stack_end_corrupted(prev))); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif /* * Test if we are atomic. Since do_exit() needs to call into From f5d7d336e7fda05b8510c8a09b1caba830ebc77d Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Mon, 1 Aug 2016 19:16:52 -0400 Subject: [PATCH 071/420] usb: gadget: f_accessory: remove duplicate endpoint alloc usb_ep_autoconfig is called twice for allocating bulk out endpoint. Removed the unwanted call. Fixes Issue: 67180 Change-Id: Ibced4127bb171076da0eeca3aed07eb1d8739762 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 96e5576761e617..7ae624e72669bb 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -531,15 +531,6 @@ static int create_bulk_endpoints(struct acc_dev *dev, ep->driver_data = dev; /* claim the endpoint */ dev->ep_out = ep; - ep = usb_ep_autoconfig(cdev->gadget, out_desc); - if (!ep) { - DBG(cdev, "usb_ep_autoconfig for ep_out failed\n"); - return -ENODEV; - } - DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name); - ep->driver_data = dev; /* claim the endpoint */ - dev->ep_out = ep; - /* now allocate requests for our endpoints */ for (i = 0; i < TX_REQ_MAX; i++) { req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE); From f1a4911fce30ced8e34e630f4e40c00d637e5163 Mon Sep 17 00:00:00 2001 From: Ruchi Kandoi Date: Tue, 10 May 2016 10:08:17 -0700 Subject: [PATCH 072/420] power: Refactors the code which prints suspend time. 
Time for which device suspended is no longer tied to the persistent clocks. This is specially an issue for architecture which do not implement read_persistent_clocks() Bug: 22928771 Change-Id: Ibbaec51819ddad8d38a169237390077b4307022d Signed-off-by: Ruchi Kandoi --- kernel/power/Makefile | 1 - kernel/power/suspend_time.c | 111 ----------------------------------- kernel/power/wakeup_reason.c | 63 ++++++++++++++++++++ 3 files changed, 63 insertions(+), 112 deletions(-) delete mode 100644 kernel/power/suspend_time.c diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 74c713ba61b070..299f8a4d42f7d5 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -11,7 +11,6 @@ obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o -obj-$(CONFIG_SUSPEND_TIME) += suspend_time.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/suspend_time.c b/kernel/power/suspend_time.c deleted file mode 100644 index d2a65da9f22c15..00000000000000 --- a/kernel/power/suspend_time.c +++ /dev/null @@ -1,111 +0,0 @@ -/* - * debugfs file to track time spent in suspend - * - * Copyright (c) 2011, Google, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#include -#include -#include -#include -#include -#include -#include - -static struct timespec suspend_time_before; -static unsigned int time_in_suspend_bins[32]; - -#ifdef CONFIG_DEBUG_FS -static int suspend_time_debug_show(struct seq_file *s, void *data) -{ - int bin; - seq_printf(s, "time (secs) count\n"); - seq_printf(s, "------------------\n"); - for (bin = 0; bin < 32; bin++) { - if (time_in_suspend_bins[bin] == 0) - continue; - seq_printf(s, "%4d - %4d %4u\n", - bin ? 
1 << (bin - 1) : 0, 1 << bin, - time_in_suspend_bins[bin]); - } - return 0; -} - -static int suspend_time_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, suspend_time_debug_show, NULL); -} - -static const struct file_operations suspend_time_debug_fops = { - .open = suspend_time_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init suspend_time_debug_init(void) -{ - struct dentry *d; - - d = debugfs_create_file("suspend_time", 0755, NULL, NULL, - &suspend_time_debug_fops); - if (!d) { - pr_err("Failed to create suspend_time debug file\n"); - return -ENOMEM; - } - - return 0; -} - -late_initcall(suspend_time_debug_init); -#endif - -static int suspend_time_syscore_suspend(void) -{ - read_persistent_clock(&suspend_time_before); - - return 0; -} - -static void suspend_time_syscore_resume(void) -{ - struct timespec after; - - read_persistent_clock(&after); - - after = timespec_sub(after, suspend_time_before); - - time_in_suspend_bins[fls(after.tv_sec)]++; - - pr_info("Suspended for %lu.%03lu seconds\n", after.tv_sec, - after.tv_nsec / NSEC_PER_MSEC); -} - -static struct syscore_ops suspend_time_syscore_ops = { - .suspend = suspend_time_syscore_suspend, - .resume = suspend_time_syscore_resume, -}; - -static int suspend_time_syscore_init(void) -{ - register_syscore_ops(&suspend_time_syscore_ops); - - return 0; -} - -static void suspend_time_syscore_exit(void) -{ - unregister_syscore_ops(&suspend_time_syscore_ops); -} -module_init(suspend_time_syscore_init); -module_exit(suspend_time_syscore_exit); diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c index 252611fad2fee2..8f825b9adacba4 100644 --- a/kernel/power/wakeup_reason.c +++ b/kernel/power/wakeup_reason.c @@ -26,6 +26,7 @@ #include #include #include +#include #define MAX_WAKEUP_REASON_IRQS 32 @@ -40,6 +41,9 @@ static ktime_t last_monotime; /* monotonic time before last suspend */ static ktime_t curr_monotime; /* monotonic time after last suspend */ static ktime_t last_stime; /* monotonic boottime offset before last suspend */ static ktime_t curr_stime; /* monotonic boottime offset after last suspend */ +#if IS_ENABLED(CONFIG_SUSPEND_TIME) +static unsigned int time_in_suspend_bins[32]; +#endif static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -168,6 +172,11 @@ void log_suspend_abort_reason(const char *fmt, ...) 
static int wakeup_reason_pm_event(struct notifier_block *notifier, unsigned long pm_event, void *unused) { +#if IS_ENABLED(CONFIG_SUSPEND_TIME) + ktime_t temp; + struct timespec suspend_time; +#endif + switch (pm_event) { case PM_SUSPEND_PREPARE: spin_lock(&resume_reason_lock); @@ -184,6 +193,15 @@ static int wakeup_reason_pm_event(struct notifier_block *notifier, curr_monotime = ktime_get(); /* monotonic time since boot including the time spent in suspend */ curr_stime = ktime_get_boottime(); + +#if IS_ENABLED(CONFIG_SUSPEND_TIME) + temp = ktime_sub(ktime_sub(curr_stime, last_stime), + ktime_sub(curr_monotime, last_monotime)); + suspend_time = ktime_to_timespec(temp); + time_in_suspend_bins[fls(suspend_time.tv_sec)]++; + pr_info("Suspended for %lu.%03lu seconds\n", suspend_time.tv_sec, + suspend_time.tv_nsec / NSEC_PER_MSEC); +#endif break; default: break; @@ -195,6 +213,51 @@ static struct notifier_block wakeup_reason_pm_notifier_block = { .notifier_call = wakeup_reason_pm_event, }; +#if IS_ENABLED(CONFIG_DEBUG_FS) && IS_ENABLED(CONFIG_SUSPEND_TIME) +static int suspend_time_debug_show(struct seq_file *s, void *data) +{ + int bin; + seq_printf(s, "time (secs) count\n"); + seq_printf(s, "------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (time_in_suspend_bins[bin] == 0) + continue; + seq_printf(s, "%4d - %4d %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + time_in_suspend_bins[bin]); + } + return 0; +} + +static int suspend_time_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, suspend_time_debug_show, NULL); +} + +static const struct file_operations suspend_time_debug_fops = { + .open = suspend_time_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init suspend_time_debug_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("suspend_time", 0755, NULL, NULL, + &suspend_time_debug_fops); + if (!d) { + pr_err("Failed to create suspend_time debug file\n"); + return -ENOMEM; + } + + return 0; +} + +late_initcall(suspend_time_debug_init); +#endif + /* Initializes the sysfs parameter * registers the pm_event notifier */ From b52f5c7a85426c3055c50a7728a0d0617db94168 Mon Sep 17 00:00:00 2001 From: James Carr Date: Fri, 29 Jul 2016 19:02:16 -0700 Subject: [PATCH 073/420] Implement memory_state_time, used by qcom,cpubw New driver memory_state_time tracks time spent in different DDR frequency and bandwidth states. Memory drivers such as qcom,cpubw can post updated state to the driver after registering a callback. Processed by a workqueue Bandwidth buckets are read in from device tree in the relevant qualcomm section, can be defined in any quantity and spacing. The data is exposed at /sys/kernel/memory_state_time, able to be read by the Android framework. 
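As a usage sketch: judging from the registration helpers and the update_call hook that appear further down in this diff (and assuming include/linux/memory-state-time.h exports them in this form), a bandwidth provider such as a bus-scaling driver would hook in roughly as follows. The example_* names are placeholders, not part of the patch.

#include <linux/memory-state-time.h>

static struct memory_state_update_block *example_bw_block;

static int example_client_init(void)
{
        example_bw_block = memory_state_register_bandwidth_source();
        if (!example_bw_block)
                return -ENOMEM;
        return 0;
}

static void example_post_bandwidth(int mbps)
{
        /* update_call() only queues the new value; the accounting runs
         * later in the driver's own workqueue, so callers stay cheap. */
        if (example_bw_block)
                example_bw_block->update_call(example_bw_block, mbps);
}

A frequency provider would do the same through memory_state_register_frequency_source(), posting each new DDR frequency in Hz so it can be matched against the freq-tbl entries read from the device tree.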
Functionality is behind a config option CONFIG_MEMORY_STATE_TIME Change-Id: I4fee165571cb975fb9eacbc9aada5e6d7dd748f0 Signed-off-by: James Carr --- .../bindings/misc/memory-state-time.txt | 8 + android/configs/android-recommended.cfg | 1 + drivers/misc/Kconfig | 6 + drivers/misc/Makefile | 1 + drivers/misc/memory_state_time.c | 454 ++++++++++++++++++ include/linux/memory-state-time.h | 42 ++ 6 files changed, 512 insertions(+) create mode 100644 Documentation/devicetree/bindings/misc/memory-state-time.txt create mode 100644 drivers/misc/memory_state_time.c create mode 100644 include/linux/memory-state-time.h diff --git a/Documentation/devicetree/bindings/misc/memory-state-time.txt b/Documentation/devicetree/bindings/misc/memory-state-time.txt new file mode 100644 index 00000000000000..c99a506c030d9b --- /dev/null +++ b/Documentation/devicetree/bindings/misc/memory-state-time.txt @@ -0,0 +1,8 @@ +Memory bandwidth and frequency state tracking + +Required properties: +- compatible : should be: + "memory-state-time" +- freq-tbl: Should contain entries with each frequency in Hz. +- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps. + Must match the framework power_profile.xml for the device. diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index f4184955ad57ef..e35a140d2f3d54 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -126,6 +126,7 @@ CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y CONFIG_UID_STAT=y +CONFIG_MEMORY_STATE_TIME=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 114e0b860dc0ab..119fe4504dbc58 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -525,6 +525,12 @@ config UID_CPUTIME help Per UID based cpu time statistics exported to /proc/uid_cputime +config MEMORY_STATE_TIME + tristate "Memory freq/bandwidth time statistics" + depends on PROFILING + help + Memory time statistics exported to /sys/kernel/memory_state_time + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index d6911153de255d..42128e42725270 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -58,3 +58,4 @@ obj-$(CONFIG_ECHO) += echo/ obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o obj-$(CONFIG_CXL_BASE) += cxl/ obj-$(CONFIG_UID_CPUTIME) += uid_cputime.o +obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c new file mode 100644 index 00000000000000..34c797a06a31aa --- /dev/null +++ b/drivers/misc/memory_state_time.c @@ -0,0 +1,454 @@ +/* drivers/misc/memory_state_time.c + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KERNEL_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define KERNEL_ATTR_RW(_name) \ +static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +#define FREQ_HASH_BITS 4 +DECLARE_HASHTABLE(freq_hash_table, FREQ_HASH_BITS); + +static DEFINE_MUTEX(mem_lock); + +#define TAG "memory_state_time" +#define BW_NODE "/soc/memory-state-time" +#define FREQ_TBL "freq-tbl" +#define BW_TBL "bw-buckets" +#define NUM_SOURCES "num-sources" + +#define LOWEST_FREQ 2 + +static int curr_bw; +static int curr_freq; +static u32 *bw_buckets; +static u32 *freq_buckets; +static int num_freqs; +static int num_buckets; +static int registered_bw_sources; +static u64 last_update; +static bool init_success; +static struct workqueue_struct *memory_wq; +static u32 num_sources = 10; +static int *bandwidths; + +struct freq_entry { + int freq; + u64 *buckets; /* Bandwidth buckets. */ + struct hlist_node hash; +}; + +struct queue_container { + struct work_struct update_state; + int value; + u64 time_now; + int id; + struct mutex *lock; +}; + +static int find_bucket(int bw) +{ + int i; + + if (bw_buckets != NULL) { + for (i = 0; i < num_buckets; i++) { + if (bw_buckets[i] > bw) { + pr_debug("Found bucket %d for bandwidth %d\n", + i, bw); + return i; + } + } + return num_buckets - 1; + } + return 0; +} + +static u64 get_time_diff(u64 time_now) +{ + u64 ms; + + ms = time_now - last_update; + last_update = time_now; + return ms; +} + +static ssize_t show_stat_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, j; + int len = 0; + struct freq_entry *freq_entry; + + for (i = 0; i < num_freqs; i++) { + hash_for_each_possible(freq_hash_table, freq_entry, hash, + freq_buckets[i]) { + if (freq_entry->freq == freq_buckets[i]) { + len += scnprintf(buf + len, PAGE_SIZE - len, + "%d ", freq_buckets[i]); + if (len >= PAGE_SIZE) + break; + for (j = 0; j < num_buckets; j++) { + len += scnprintf(buf + len, + PAGE_SIZE - len, + "%llu ", + freq_entry->buckets[j]); + } + len += scnprintf(buf + len, PAGE_SIZE - len, + "\n"); + } + } + } + pr_debug("Current Time: %llu\n", ktime_get_boot_ns()); + return len; +} +KERNEL_ATTR_RO(show_stat); + +static void update_table(u64 time_now) +{ + struct freq_entry *freq_entry; + + pr_debug("Last known bw %d freq %d\n", curr_bw, curr_freq); + hash_for_each_possible(freq_hash_table, freq_entry, hash, curr_freq) { + if (curr_freq == freq_entry->freq) { + freq_entry->buckets[find_bucket(curr_bw)] + += get_time_diff(time_now); + break; + } + } +} + +static bool freq_exists(int freq) +{ + int i; + + for (i = 0; i < num_freqs; i++) { + if (freq == freq_buckets[i]) + return true; + } + return false; +} + +static int calculate_total_bw(int bw, int index) +{ + int i; + int total_bw = 0; + + pr_debug("memory_state_time New bw %d for id %d\n", bw, index); + bandwidths[index] = bw; + for (i = 0; i < registered_bw_sources; i++) + total_bw += bandwidths[i]; + return total_bw; +} + +static void freq_update_do_work(struct work_struct *work) +{ + struct queue_container *freq_state_update + = container_of(work, struct queue_container, + update_state); + if (freq_state_update) { + mutex_lock(&mem_lock); + update_table(freq_state_update->time_now); + curr_freq = freq_state_update->value; + mutex_unlock(&mem_lock); + kfree(freq_state_update); + } +} + 
+static void bw_update_do_work(struct work_struct *work) +{ + struct queue_container *bw_state_update + = container_of(work, struct queue_container, + update_state); + if (bw_state_update) { + mutex_lock(&mem_lock); + update_table(bw_state_update->time_now); + curr_bw = calculate_total_bw(bw_state_update->value, + bw_state_update->id); + mutex_unlock(&mem_lock); + kfree(bw_state_update); + } +} + +static void memory_state_freq_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (freq_exists(value) && init_success) { + struct queue_container *freq_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!freq_container) + return; + INIT_WORK(&freq_container->update_state, + freq_update_do_work); + freq_container->time_now = ktime_get_boot_ns(); + freq_container->value = value; + pr_debug("Scheduling freq update in work queue\n"); + queue_work(memory_wq, &freq_container->update_state); + } else { + pr_debug("Freq does not exist.\n"); + } + } +} + +static void memory_state_bw_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (init_success) { + struct queue_container *bw_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!bw_container) + return; + INIT_WORK(&bw_container->update_state, + bw_update_do_work); + bw_container->time_now = ktime_get_boot_ns(); + bw_container->value = value; + bw_container->id = ub->id; + pr_debug("Scheduling bandwidth update in work queue\n"); + queue_work(memory_wq, &bw_container->update_state); + } + } +} + +struct memory_state_update_block *memory_state_register_frequency_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating frequency source\n"); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_freq_update; + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_frequency_source); + +struct memory_state_update_block *memory_state_register_bandwidth_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating bandwidth source %d\n", + registered_bw_sources); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_bw_update; + if (registered_bw_sources < num_sources) { + block->id = registered_bw_sources++; + } else { + pr_err("Unable to allocate source; max number reached\n"); + kfree(block); + return NULL; + } + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_bandwidth_source); + +/* Buckets are designated by their maximum. + * Returns the buckets decided by the capability of the device. 
+ */ +static int get_bw_buckets(struct device *dev) +{ + int ret, lenb; + struct device_node *node = dev->of_node; + + of_property_read_u32(node, NUM_SOURCES, &num_sources); + if (of_find_property(node, BW_TBL, &lenb)) { + bandwidths = devm_kzalloc(dev, + sizeof(*bandwidths) * num_sources, GFP_KERNEL); + if (!bandwidths) + return -ENOMEM; + lenb /= sizeof(*bw_buckets); + bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets), + GFP_KERNEL); + if (!bw_buckets) { + devm_kfree(dev, bandwidths); + return -ENOMEM; + } + ret = of_property_read_u32_array(node, BW_TBL, bw_buckets, + lenb); + if (ret < 0) { + devm_kfree(dev, bandwidths); + devm_kfree(dev, bw_buckets); + pr_err("Unable to read bandwidth table from device tree.\n"); + return ret; + } + } + curr_bw = 0; + num_buckets = lenb; + return 0; +} + +/* Adds struct freq_entry nodes to the hashtable for each compatible frequency. + * Returns the supported number of frequencies. + */ +static int freq_buckets_init(struct device *dev) +{ + struct freq_entry *freq_entry; + int i; + int ret, lenf; + struct device_node *node = dev->of_node; + + if (of_find_property(node, FREQ_TBL, &lenf)) { + lenf /= sizeof(*freq_buckets); + freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets), + GFP_KERNEL); + if (!freq_buckets) + return -ENOMEM; + pr_debug("freqs found len %d\n", lenf); + ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets, + lenf); + if (ret < 0) { + devm_kfree(dev, freq_buckets); + pr_err("Unable to read frequency table from device tree.\n"); + return ret; + } + pr_debug("ret freq %d\n", ret); + } + num_freqs = lenf; + curr_freq = freq_buckets[LOWEST_FREQ]; + + for (i = 0; i < num_freqs; i++) { + freq_entry = devm_kzalloc(dev, sizeof(struct freq_entry), + GFP_KERNEL); + if (!freq_entry) + return -ENOMEM; + freq_entry->buckets = devm_kzalloc(dev, sizeof(u64)*num_buckets, + GFP_KERNEL); + if (!freq_entry->buckets) { + devm_kfree(dev, freq_entry); + return -ENOMEM; + } + pr_debug("memory_state_time Adding freq to ht %d\n", + freq_buckets[i]); + freq_entry->freq = freq_buckets[i]; + hash_add(freq_hash_table, &freq_entry->hash, freq_buckets[i]); + } + return 0; +} + +struct kobject *memory_kobj; +EXPORT_SYMBOL_GPL(memory_kobj); + +static struct attribute *memory_attrs[] = { + &show_stat_attr.attr, + NULL +}; + +static struct attribute_group memory_attr_group = { + .attrs = memory_attrs, +}; + +static int memory_state_time_probe(struct platform_device *pdev) +{ + int error; + + error = get_bw_buckets(&pdev->dev); + if (error) + return error; + error = freq_buckets_init(&pdev->dev); + if (error) + return error; + last_update = ktime_get_boot_ns(); + init_success = true; + + pr_debug("memory_state_time initialized with num_freqs %d\n", + num_freqs); + return 0; +} + +static const struct of_device_id match_table[] = { + { .compatible = "memory-state-time" }, + {} +}; + +static struct platform_driver memory_state_time_driver = { + .probe = memory_state_time_probe, + .driver = { + .name = "memory-state-time", + .of_match_table = match_table, + .owner = THIS_MODULE, + }, +}; + +static int __init memory_state_time_init(void) +{ + int error; + + hash_init(freq_hash_table); + memory_wq = create_singlethread_workqueue("memory_wq"); + if (!memory_wq) { + pr_err("Unable to create workqueue.\n"); + return -EINVAL; + } + /* + * Create sys/kernel directory for memory_state_time. 
+ */ + memory_kobj = kobject_create_and_add(TAG, kernel_kobj); + if (!memory_kobj) { + pr_err("Unable to allocate memory_kobj for sysfs directory.\n"); + error = -ENOMEM; + goto wq; + } + error = sysfs_create_group(memory_kobj, &memory_attr_group); + if (error) { + pr_err("Unable to create sysfs folder.\n"); + goto kobj; + } + + error = platform_driver_register(&memory_state_time_driver); + if (error) { + pr_err("Unable to register memory_state_time platform driver.\n"); + goto group; + } + return 0; + +group: sysfs_remove_group(memory_kobj, &memory_attr_group); +kobj: kobject_put(memory_kobj); +wq: destroy_workqueue(memory_wq); + return error; +} +module_init(memory_state_time_init); diff --git a/include/linux/memory-state-time.h b/include/linux/memory-state-time.h new file mode 100644 index 00000000000000..d2212b02786627 --- /dev/null +++ b/include/linux/memory-state-time.h @@ -0,0 +1,42 @@ +/* include/linux/memory-state-time.h + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include + +#define UPDATE_MEMORY_STATE(BLOCK, VALUE) BLOCK->update_call(BLOCK, VALUE) + +struct memory_state_update_block; + +typedef void (*memory_state_update_fn_t)(struct memory_state_update_block *ub, + int value); + +/* This struct is populated when you pass it to a memory_state_register* + * function. The update_call function is used for an update and defined in the + * typedef memory_state_update_fn_t + */ +struct memory_state_update_block { + memory_state_update_fn_t update_call; + int id; +}; + +/* Register a frequency struct memory_state_update_block to provide updates to + * memory_state_time about frequency changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_frequency_source(void); + +/* Register a bandwidth struct memory_state_update_block to provide updates to + * memory_state_time about bandwidth changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_bandwidth_source(void); From dbfc059e25003a8952ac2350ce5c4553660e9ec3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 14 Mar 2016 09:56:35 -0300 Subject: [PATCH 074/420] UPSTREAM: net: Fix use after free in the recvmmsg exit path (cherry picked from commit 34b88a68f26a75e4fded796f1a49c40f82234b7d) The syzkaller fuzzer hit the following use-after-free: Call Trace: [] __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:295 [] __sys_recvmmsg+0x6fa/0x7f0 net/socket.c:2261 [< inline >] SYSC_recvmmsg net/socket.c:2281 [] SyS_recvmmsg+0x16f/0x180 net/socket.c:2270 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 And, as Dmitry rightly assessed, that is because we can drop the reference and then touch it when the underlying recvmsg calls return some packets and then hit an error, which will make recvmmsg to set sock->sk->sk_err, oops, fix it. 
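For reference, the semantics this fix preserves are visible from user space:
recvmmsg() returns the number of datagrams received, and an error that occurs
after some datagrams have already arrived is reported on the next receive call
or via getsockopt(SO_ERROR). A minimal user-space sketch (illustrative names
and sizes only, assuming the caller already has a bound datagram socket):

	/* Illustrative user-space sketch; not part of the patch. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <sys/uio.h>

	#define VLEN 8

	static void drain(int fd)
	{
		struct mmsghdr msgs[VLEN];
		struct iovec iovs[VLEN];
		char bufs[VLEN][1500];
		int i, n;

		memset(msgs, 0, sizeof(msgs));
		for (i = 0; i < VLEN; i++) {
			iovs[i].iov_base = bufs[i];
			iovs[i].iov_len = sizeof(bufs[i]);
			msgs[i].msg_hdr.msg_iov = &iovs[i];
			msgs[i].msg_hdr.msg_iovlen = 1;
		}

		/*
		 * n is the number of datagrams actually received; if recvmsg
		 * failed after the first one, that error is recorded in
		 * sk->sk_err and returned by the next call (or SO_ERROR).
		 */
		n = recvmmsg(fd, msgs, VLEN, MSG_DONTWAIT, NULL);
		if (n < 0)
			perror("recvmmsg");
		else
			printf("received %d datagrams\n", n);
	}
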
Reported-and-Tested-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Eric Dumazet Cc: Kostya Serebryany Cc: Sasha Levin Fixes: a2e2725541fa ("net: Introduce recvmmsg socket syscall") http://lkml.kernel.org/r/20160122211644.GC2470@redhat.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Change-Id: I2adb0faf595b7b634d9b739dfdd1a47109e20ecb Bug: 30515201 --- net/socket.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/net/socket.c b/net/socket.c index 1de8b265e2bc64..13e3a3668cca9d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2421,31 +2421,31 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, break; } -out_put: - fput_light(sock->file, fput_needed); - if (err == 0) - return datagrams; + goto out_put; - if (datagrams != 0) { + if (datagrams == 0) { + datagrams = err; + goto out_put; + } + + /* + * We may return less entries than requested (vlen) if the + * sock is non block and there aren't enough datagrams... + */ + if (err != -EAGAIN) { /* - * We may return less entries than requested (vlen) if the - * sock is non block and there aren't enough datagrams... + * ... or if recvmsg returns an error after we + * received some datagrams, where we record the + * error to return on the next call or if the + * app asks about it using getsockopt(SO_ERROR). */ - if (err != -EAGAIN) { - /* - * ... or if recvmsg returns an error after we - * received some datagrams, where we record the - * error to return on the next call or if the - * app asks about it using getsockopt(SO_ERROR). - */ - sock->sk->sk_err = -err; - } - - return datagrams; + sock->sk->sk_err = -err; } +out_put: + fput_light(sock->file, fput_needed); - return err; + return datagrams; } SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, From e5f187c4174e5999faf66013e97695fd7589a7d2 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Nov 2014 10:01:16 +0100 Subject: [PATCH 075/420] kernel: Provide READ_ONCE and ASSIGN_ONCE [ Upstream commit 230fa253df6352af12ad0a16128760b5cb3f92df ] ACCESS_ONCE does not work reliably on non-scalar types. For example gcc 4.6 and 4.7 might remove the volatile tag for such accesses during the SRA (scalar replacement of aggregates) step https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145) Let's provide READ_ONCE/ASSIGN_ONCE that will do all accesses via scalar types as suggested by Linus Torvalds. Accesses larger than the machines word size cannot be guaranteed to be atomic. These macros will use memcpy and emit a build warning. 
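To make the intended use concrete, a hypothetical driver snippet (names
invented for illustration, not part of this patch) that shares a small
aggregate with an interrupt handler; because the struct fits in a machine
word, READ_ONCE()/ASSIGN_ONCE() compile down to single volatile scalar
accesses:

	/* Hypothetical example, not part of this patch. */
	struct hw_flags {
		u8 state;
		u8 error;
	};

	static struct hw_flags shared;		/* written by an IRQ handler */

	static void poll_flags(void)
	{
		/*
		 * ACCESS_ONCE(shared) may lose its volatile semantics when
		 * gcc 4.6/4.7 scalarizes the aggregate; READ_ONCE() copies
		 * through a volatile access of sizeof(shared) == 2.
		 */
		struct hw_flags f = READ_ONCE(shared);

		if (f.error)
			pr_warn("device error, state %u\n", f.state);
	}

	static void clear_flags(void)
	{
		struct hw_flags zero = { };

		ASSIGN_ONCE(zero, shared);	/* volatile 2-byte store */
	}
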
Signed-off-by: Christian Borntraeger Signed-off-by: Sasha Levin --- include/linux/compiler.h | 74 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d5ad7b1118fc10..a1c81f80978ee4 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -186,6 +186,80 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__) #endif +#include + +static __always_inline void data_access_exceeds_word_size(void) +#ifdef __compiletime_warning +__compiletime_warning("data access exceeds word size and won't be atomic") +#endif +; + +static __always_inline void data_access_exceeds_word_size(void) +{ +} + +static __always_inline void __read_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8 *)res = *(volatile __u8 *)p; break; + case 2: *(__u16 *)res = *(volatile __u16 *)p; break; + case 4: *(__u32 *)res = *(volatile __u32 *)p; break; +#ifdef CONFIG_64BIT + case 8: *(__u64 *)res = *(volatile __u64 *)p; break; +#endif + default: + barrier(); + __builtin_memcpy((void *)res, (const void *)p, size); + data_access_exceeds_word_size(); + barrier(); + } +} + +static __always_inline void __assign_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8 *)p = *(__u8 *)res; break; + case 2: *(volatile __u16 *)p = *(__u16 *)res; break; + case 4: *(volatile __u32 *)p = *(__u32 *)res; break; +#ifdef CONFIG_64BIT + case 8: *(volatile __u64 *)p = *(__u64 *)res; break; +#endif + default: + barrier(); + __builtin_memcpy((void *)p, (const void *)res, size); + data_access_exceeds_word_size(); + barrier(); + } +} + +/* + * Prevent the compiler from merging or refetching reads or writes. The + * compiler is also forbidden from reordering successive instances of + * READ_ONCE, ASSIGN_ONCE and ACCESS_ONCE (see below), but only when the + * compiler is aware of some particular ordering. One way to make the + * compiler aware of ordering is to put the two invocations of READ_ONCE, + * ASSIGN_ONCE or ACCESS_ONCE() in different C statements. + * + * In contrast to ACCESS_ONCE these two macros will also work on aggregate + * data types like structs or unions. If the size of the accessed data + * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) + * READ_ONCE() and ASSIGN_ONCE() will fall back to memcpy and print a + * compile-time warning. + * + * Their two major use cases are: (1) Mediating communication between + * process-level code and irq/NMI handlers, all running on the same CPU, + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * mutilate accesses that either do not require ordering or that interact + * with an explicit memory barrier or atomic instruction that provides the + * required ordering. 
+ */ + +#define READ_ONCE(x) \ + ({ typeof(x) __val; __read_once_size(&x, &__val, sizeof(__val)); __val; }) + +#define ASSIGN_ONCE(val, x) \ + ({ typeof(x) __val; __val = val; __assign_once_size(&x, &__val, sizeof(__val)); __val; }) + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ From 7389fbaec751b85bef53dc1a14dff05db993f7be Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 13 Jan 2015 10:46:42 +0100 Subject: [PATCH 076/420] kernel: Change ASSIGN_ONCE(val, x) to WRITE_ONCE(x, val) [ Upstream commit 43239cbe79fc369f5d2160bd7f69e28b5c50a58c ] Feedback has shown that WRITE_ONCE(x, val) is easier to use than ASSIGN_ONCE(val,x). There are no in-tree users yet, so lets change it for 3.19. Signed-off-by: Christian Borntraeger Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Acked-by: Paul E. McKenney Signed-off-by: Sasha Levin --- include/linux/compiler.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index a1c81f80978ee4..33063f872ee3cd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -215,7 +215,7 @@ static __always_inline void __read_once_size(volatile void *p, void *res, int si } } -static __always_inline void __assign_once_size(volatile void *p, void *res, int size) +static __always_inline void __write_once_size(volatile void *p, void *res, int size) { switch (size) { case 1: *(volatile __u8 *)p = *(__u8 *)res; break; @@ -235,15 +235,15 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of - * READ_ONCE, ASSIGN_ONCE and ACCESS_ONCE (see below), but only when the + * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the * compiler is aware of some particular ordering. One way to make the * compiler aware of ordering is to put the two invocations of READ_ONCE, - * ASSIGN_ONCE or ACCESS_ONCE() in different C statements. + * WRITE_ONCE or ACCESS_ONCE() in different C statements. * * In contrast to ACCESS_ONCE these two macros will also work on aggregate * data types like structs or unions. If the size of the accessed data * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) - * READ_ONCE() and ASSIGN_ONCE() will fall back to memcpy and print a + * READ_ONCE() and WRITE_ONCE() will fall back to memcpy and print a * compile-time warning. * * Their two major use cases are: (1) Mediating communication between @@ -257,8 +257,8 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int #define READ_ONCE(x) \ ({ typeof(x) __val; __read_once_size(&x, &__val, sizeof(__val)); __val; }) -#define ASSIGN_ONCE(val, x) \ - ({ typeof(x) __val; __val = val; __assign_once_size(&x, &__val, sizeof(__val)); __val; }) +#define WRITE_ONCE(x, val) \ + ({ typeof(x) __val; __val = val; __write_once_size(&x, &__val, sizeof(__val)); __val; }) #endif /* __KERNEL__ */ From 60dfb2c156afd7022f4a1b8ad6b87075d45d2504 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Feb 2015 15:46:31 -0800 Subject: [PATCH 077/420] kernel: make READ_ONCE() valid on const arguments [ Upstream commit dd36929720f40f17685e841ae0d4c581c165ea60 ] The use of READ_ONCE() causes lots of warnings witht he pending paravirt spinlock fixes, because those ends up having passing a member to a 'const' structure to READ_ONCE(). 
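A call site of the kind that produces these warnings looks roughly like this
(hypothetical names, for illustration only):

	/* Hypothetical illustration of the warning described below. */
	struct lock_stats {
		unsigned long contended;
	};

	static unsigned long read_contended(const struct lock_stats *st)
	{
		/*
		 * st->contended is const, so with the typeof()-based macro
		 * the temporary is const as well, and &(st->contended) is a
		 * pointer-to-const handed to a non-const parameter -- both
		 * make the compiler complain.
		 */
		return READ_ONCE(st->contended);
	}
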
There should certainly be nothing wrong with using READ_ONCE() with a const source, but the helper function __read_once_size() would cause warnings because it would drop the 'const' qualifier, but also because the destination would be marked 'const' too due to the use of 'typeof'. Use a union of types in READ_ONCE() to avoid this issue. Also make sure to use parenthesis around the macro arguments to avoid possible operator precedence issues. Tested-by: Ingo Molnar Cc: Christian Borntraeger Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/compiler.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 33063f872ee3cd..000c5f90f08c82 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -198,7 +198,7 @@ static __always_inline void data_access_exceeds_word_size(void) { } -static __always_inline void __read_once_size(volatile void *p, void *res, int size) +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) { switch (size) { case 1: *(__u8 *)res = *(volatile __u8 *)p; break; @@ -255,10 +255,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s */ #define READ_ONCE(x) \ - ({ typeof(x) __val; __read_once_size(&x, &__val, sizeof(__val)); __val; }) + ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; }) #define WRITE_ONCE(x, val) \ - ({ typeof(x) __val; __val = val; __write_once_size(&x, &__val, sizeof(__val)); __val; }) + ({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; }) #endif /* __KERNEL__ */ From b4f0509960604b9f789596f349049a4675f62baa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Feb 2015 14:36:57 -0800 Subject: [PATCH 078/420] cpumask, nodemask: implement cpumask/nodemask_pr_args() printf family of functions can now format bitmaps using '%*pb[l]' and all cpumask and nodemask formatting will be converted to use it. To ease printing these masks with '%*pb[l]' which require two params - the number of bits and the actual bitmap, this patch implement cpumask_pr_args() and nodemask_pr_args() which can be used to provide arguments for '%*pb[l]' Signed-off-by: Tejun Heo Cc: Rusty Russell Cc: "David S. Miller" Cc: "James E.J. Bottomley" Cc: "John W. Linville" Cc: "Paul E. McKenney" Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Chris Zankel Cc: Christoph Lameter Cc: Dmitry Torokhov Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Li Zefan Cc: Max Filippov Cc: Mike Travis Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Russell King Cc: Steffen Klassert Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 8 ++++++++ include/linux/nodemask.h | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 0a9a6da21e74fd..39c6b6ec25b517 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -22,6 +22,14 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; */ #define cpumask_bits(maskp) ((maskp)->bits) +/** + * cpumask_pr_args - printf args to output a cpumask + * @maskp: cpumask to be printed + * + * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. 
+ */ +#define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) + #if NR_CPUS == 1 #define nr_cpu_ids 1 #else diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 83a6aeda899d56..63295d2ce1540e 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -98,6 +98,14 @@ typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; +/** + * nodemask_pr_args - printf args to output a nodemask + * @maskp: nodemask to be printed + * + * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. + */ +#define nodemask_pr_args(maskp) MAX_NUMNODES, (maskp)->bits + /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions From 213468c892eb9e6460561d59bfe456c6d039914e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 27 Oct 2014 17:40:52 +0300 Subject: [PATCH 079/420] sched/deadline: Implement cancel_dl_timer() to use in switched_from_dl() Currently used hrtimer_try_to_cancel() is racy: raw_spin_lock(&rq->lock) ... dl_task_timer raw_spin_lock(&rq->lock) ... raw_spin_lock(&rq->lock) ... switched_from_dl() ... ... hrtimer_try_to_cancel() ... ... switched_to_fair() ... ... ... ... ... ... ... ... raw_spin_unlock(&rq->lock) ... (asquired) ... ... ... ... ... ... do_exit() ... ... schedule() ... ... raw_spin_lock(&rq->lock) ... raw_spin_unlock(&rq->lock) ... ... ... raw_spin_unlock(&rq->lock) ... raw_spin_lock(&rq->lock) ... ... (asquired) put_task_struct() ... ... free_task_struct() ... ... ... ... raw_spin_unlock(&rq->lock) ... (asquired) ... ... ... ... ... (use after free) ... So, let's implement 100% guaranteed way to cancel the timer and let's be sure we are safe even in very unlikely situations. rq unlocking does not limit the area of switched_from_dl() use, because this has already been possible in pull_dl_task() below. Let's consider the safety of of this unlocking. New code in the patch is working when hrtimer_try_to_cancel() fails. This means the callback is running. In this case hrtimer_cancel() is just waiting till the callback is finished. Two 1) Since we are in switched_from_dl(), new class is not dl_sched_class and new prio is not less MAX_DL_PRIO. So, the callback returns early; it's right after !dl_task() check. After that hrtimer_cancel() returns back too. The above is: raw_spin_lock(rq->lock); ... ... dl_task_timer() ... raw_spin_lock(rq->lock); switched_from_dl() ... hrtimer_try_to_cancel() ... raw_spin_unlock(rq->lock); ... hrtimer_cancel() ... ... raw_spin_unlock(rq->lock); ... return HRTIMER_NORESTART; ... ... raw_spin_lock(rq->lock); ... 2) But the below is also possible: dl_task_timer() raw_spin_lock(rq->lock); ... raw_spin_unlock(rq->lock); raw_spin_lock(rq->lock); ... switched_from_dl() ... hrtimer_try_to_cancel() ... ... return HRTIMER_NORESTART; raw_spin_unlock(rq->lock); ... hrtimer_cancel(); ... raw_spin_lock(rq->lock); ... In this case hrtimer_cancel() returns immediately. Very unlikely case, just to mention. Nobody can manipulate the task, because check_class_changed() is always called with pi_lock locked. Nobody can force the task to participate in (concurrent) priority inheritance schemes (the same reason). All concurrent task operations require pi_lock, which is held by us. No deadlocks with dl_task_timer() are possible, because it returns right after !dl_task() check (it does nothing). 
If we receive a new dl_task during the time of unlocked rq, we just don't have to do pull_dl_task() in switched_from_dl() further. Signed-off-by: Kirill Tkhai [ Added comments] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414420852.19914.186.camel@tkhai Signed-off-by: Ingo Molnar (cherry picked from commit 67dfa1b756f250972bde31d65e3f8fde6aeddc5b) Signed-off-by: Punit Agrawal --- kernel/sched/core.c | 4 ++++ kernel/sched/deadline.c | 34 +++++++++++++++++++++++++++------- kernel/sched/sched.h | 5 +++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be1bcd6b93251f..315050be118d49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * Can drop rq->lock because from sched_class::switched_from() methods drop it. + */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio) @@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); + /* Possble rq->lock 'hole'. */ p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 28fa9d9e92012a..df31e778d57a1e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -563,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - if (hrtimer_active(timer)) { - hrtimer_try_to_cancel(timer); - return; - } - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); timer->function = dl_task_timer; } @@ -1586,10 +1581,35 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ +/* + * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. + */ +static void cancel_dl_timer(struct rq *rq, struct task_struct *p) +{ + struct hrtimer *dl_timer = &p->dl.dl_timer; + + /* Nobody will change task's class if pi_lock is held */ + lockdep_assert_held(&p->pi_lock); + + if (hrtimer_active(dl_timer)) { + int ret = hrtimer_try_to_cancel(dl_timer); + + if (unlikely(ret == -1)) { + /* + * Note, p may migrate OR new deadline tasks + * may appear in rq when we are unlocking it. + * A caller of us must be fine with that. + */ + raw_spin_unlock(&rq->lock); + hrtimer_cancel(dl_timer); + raw_spin_lock(&rq->lock); + } + } +} + static void switched_from_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) - hrtimer_try_to_cancel(&p->dl.dl_timer); + cancel_dl_timer(rq, p); __dl_clear_params(p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2df8ef067cc54d..11dfe5161f561e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1127,6 +1127,11 @@ struct sched_class { void (*task_fork) (struct task_struct *p); void (*task_dead) (struct task_struct *p); + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. 
+ */ void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, From eb624b3e03cfd77d5eca68210ee79ec862862122 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 31 Oct 2014 06:39:32 +0800 Subject: [PATCH 080/420] sched/deadline: Fix artificial overrun introduced by yield_task_dl() The yield semantic of deadline class is to reduce remaining runtime to zero, and then update_curr_dl() will stop it. However, comsumed bandwidth is reduced from the budget of yield task again even if it has already been set to zero which leads to artificial overrun. This patch fix it by make sure we don't steal some more time from the task that yielded in update_curr_dl(). Suggested-by: Juri Lelli Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414708776-124078-2-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 804968809c321066cca028d4cbd533a420f964bc) Signed-off-by: Punit Agrawal --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index df31e778d57a1e..81e9fb952ba3ed 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -628,7 +628,7 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= delta_exec; + dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) From aec2d1545335b2d4c47910008c8f6b1621e8d0f7 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 31 Oct 2014 06:39:35 +0800 Subject: [PATCH 081/420] sched/deadline: Reschedule from switched_from_dl() after a successful pull In switched_from_dl() we have to issue a resched if we successfully pulled some task from other cpus. This patch also aligns the behavior with -rt. Suggested-by: Juri Lelli Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414708776-124078-5-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar (cherry picked from commit cd66091162d34f589631a23bbe0ed214798245b4) Signed-off-by: Punit Agrawal --- kernel/sched/deadline.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 81e9fb952ba3ed..3782f906b5fa42 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1619,8 +1619,11 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * this is the right place to try to pull some other one * from an overloaded cpu, if any. 
*/ - if (!rq->dl.dl_nr_running) - pull_dl_task(rq); + if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) + return; + + if (pull_dl_task(rq)) + resched_curr(rq); #endif } From 359593ff643b7839cde5a852d1e8c3c70bfa4ff2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 31 Oct 2014 06:39:36 +0800 Subject: [PATCH 082/420] sched/deadline: Don't check CONFIG_SMP in switched_from_dl() There are both UP and SMP version of pull_dl_task(), so don't need to check CONFIG_SMP in switched_from_dl(); Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414708776-124078-6-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar (cherry picked from commit cad3bb32e181c286c46ec12b2deb1f26a6f9835d) Signed-off-by: Punit Agrawal --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3782f906b5fa42..81a267d5f287f6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1613,7 +1613,6 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) __dl_clear_params(p); -#ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one @@ -1624,7 +1623,6 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (pull_dl_task(rq)) resched_curr(rq); -#endif } /* From 75de5855c46367c11e5328b3ff6e7885e56542ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Jan 2015 12:02:46 -0500 Subject: [PATCH 083/420] cgroup: reorder SUBSYS(blkio) in cgroup_subsys.h The scheduled cgroup writeback support requires blkio to be initialized before memcg as memcg needs to provide certain blkcg related functionalities. Relocate blkio so that it's right above memory. Signed-off-by: Tejun Heo (cherry picked from commit 24dab7a7b3534ef40ecec20cfd7fb3ad99d9ff33) Signed-off-by: Punit Agrawal --- include/linux/cgroup_subsys.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 98c4f9b12b0374..e4a96fb1440356 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -15,6 +15,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_BLK_CGROUP) +SUBSYS(blkio) +#endif + #if IS_ENABLED(CONFIG_MEMCG) SUBSYS(memory) #endif @@ -31,10 +35,6 @@ SUBSYS(freezer) SUBSYS(net_cls) #endif -#if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) -#endif - #if IS_ENABLED(CONFIG_CGROUP_PERF) SUBSYS(perf_event) #endif From be355c1d8756e6d0f46c4a7c85d62afc8ab41347 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 15 Dec 2014 14:56:58 +0300 Subject: [PATCH 084/420] sched/fair: Fix sched_entity::avg::decay_count initialization Child has the same decay_count as parent. If it's not zero, we add it to parent's cfs_rq->removed_load: wake_up_new_task()->set_task_cpu()->migrate_task_rq_fair(). Child's load is a just garbade after copying of parent, it hasn't been on cfs_rq yet, and it must not be added to cfs_rq::removed_load in migrate_task_rq_fair(). The patch moves sched_entity::avg::decay_count intialization in sched_fork(). So, migrate_task_rq_fair() does not change removed_load. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1418644618.6074.13.camel@tkhai Signed-off-by: Ingo Molnar (cherry picked from commit bb04159df99fa353d0fb524574aca03ce2c6515b) Signed-off-by: Ricky Liang --- kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 315050be118d49..0f4b3c4026a4c1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1826,6 +1826,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SMP + p->se.avg.decay_count = 0; +#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f15be8f42a8067..4d35c2889d4abb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p) { u32 slice; - p->se.avg.decay_count = 0; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; p->se.avg.runnable_avg_sum = slice; p->se.avg.runnable_avg_period = slice; From 78fe8035ff76756e1f22cb30895e193d201908b3 Mon Sep 17 00:00:00 2001 From: Yao Dongdong Date: Mon, 29 Dec 2014 14:41:43 +0800 Subject: [PATCH 085/420] sched/core: Remove check of p->sched_class Search all usage of p->sched_class in sched/core.c, no one check it before use, so it seems that every task must belong to one sched_class. Signed-off-by: Yao Dongdong [ Moved the early class assignment to make it boot. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1419835303-28958-1-git-send-email-yaodongdong@huawei.com Signed-off-by: Ingo Molnar (cherry picked from commit 1b537c7d1e58c761212a193085f9049b58f672e6) Signed-off-by: Ricky Liang --- kernel/sched/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0f4b3c4026a4c1..cea9fc1cb67fa3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4691,7 +4691,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); @@ -7148,6 +7148,11 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); + /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -7158,11 +7163,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ From b424057508bc5afc99b204915e36aea63e689565 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Jan 2015 11:18:10 +0100 Subject: [PATCH 086/420] sched/core: Validate rq_clock*() serialization rq->clock{,_task} are serialized by rq->lock, verify this. 
One immediate fail is the usage in scale_rt_capability, so 'annotate' that for now, there's more 'funny' there. Maybe change rq->lock into a raw_seqlock_t? (Only 32-bit is affected) Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20150105103554.361872747@infradead.org Cc: Linus Torvalds Cc: umgwanakikbuti@gmail.com Signed-off-by: Ingo Molnar (cherry picked from commit cebde6d681aa45f96111cfcffc1544cf2a0454ff) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4d35c2889d4abb..bb8ce942c2082c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5765,8 +5765,8 @@ static unsigned long scale_rt_capacity(int cpu) */ age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); + delta = __rq_clock_broken(rq) - age_stamp; - delta = rq_clock(rq) - age_stamp; if (unlikely(delta < 0)) delta = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 11dfe5161f561e..5982352c6a8769 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -668,13 +668,20 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return ACCESS_ONCE(rq->clock); +} + static inline u64 rq_clock(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock_task; } From 8ceaa55bb3503a83122320f994cebb951a2c4564 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Jan 2015 15:08:03 +0100 Subject: [PATCH 087/420] sched/deadline: Fix deadline parameter modification handling Commit 67dfa1b756f2 ("sched/deadline: Implement cancel_dl_timer() to use in switched_from_dl()") removed the hrtimer_try_cancel() function call out from init_dl_task_timer(), which gets called from __setparam_dl(). The result is that we can now re-init the timer while its active -- this is bad and corrupts timer state. Furthermore; changing the parameters of an active deadline task is tricky in that you want to maintain guarantees, while immediately effective change would allow one to circumvent the CBS guarantees -- this too is bad, as one (bad) task should not be able to affect the others. Rework things to avoid both problems. We only need to initialize the timer once, so move that to __sched_fork() for new tasks. Then make sure __setparam_dl() doesn't affect the current running state but only updates the parameters used to calculate the next scheduling period -- this guarantees the CBS functions as expected (albeit slightly pessimistic). This however means we need to make sure __dl_clear_params() needs to reset the active state otherwise new (and tasks flipping between classes) will not properly (re)compute their first instance. Todo: close class flipping CBS hole. Todo: implement delayed BW release. 
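For context, the parameters being changed here are normally set from user
space through sched_setattr(); a minimal raw-syscall sketch (glibc has no
wrapper; structure layout per the uapi definition, values illustrative) of
the operation whose semantics this patch pins down:

	/* User-space sketch only; not part of the patch. */
	#define _GNU_SOURCE
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/syscall.h>

	#ifndef SCHED_DEADLINE
	#define SCHED_DEADLINE	6
	#endif

	struct sched_attr {
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;
		uint32_t sched_priority;
		uint64_t sched_runtime;		/* all three in nanoseconds */
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	static int set_dl_params(pid_t pid, uint64_t runtime_ns,
				 uint64_t deadline_ns, uint64_t period_ns)
	{
		struct sched_attr attr = {
			.size		= sizeof(attr),
			.sched_policy	= SCHED_DEADLINE,
			.sched_runtime	= runtime_ns,
			.sched_deadline	= deadline_ns,
			.sched_period	= period_ns,
		};

		/*
		 * With this patch, if the task is already running as
		 * SCHED_DEADLINE the new runtime/deadline only take effect
		 * once the current reservation period lapses.
		 */
		return syscall(SYS_sched_setattr, pid, &attr, 0);
	}
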
Reported-by: Luca Abeni Acked-by: Juri Lelli Tested-by: Luca Abeni Fixes: 67dfa1b756f2 ("sched/deadline: Implement cancel_dl_timer() to use in switched_from_dl()") Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150128140803.GF23038@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar (cherry picked from commit 40767b0dc768060266d261b4a330164b4be53f7c) Signed-off-by: Punit Agrawal --- kernel/sched/core.c | 33 ++++++++++++++++++++++++++++----- kernel/sched/deadline.c | 3 ++- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cea9fc1cb67fa3..5b21b3025a6063 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1808,6 +1808,10 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } /* @@ -1836,7 +1840,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif RB_CLEAR_NODE(&p->dl.rb_node); - hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + init_dl_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2067,6 +2071,9 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. + * + * XXX we should delay bw change until the task's 0-lag point, see + * __setparam_dl(). */ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -3279,15 +3286,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; - init_dl_task_timer(dl_se); dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - dl_se->dl_throttled = 0; - dl_se->dl_new = 1; - dl_se->dl_yielded = 0; + + /* + * Changing the parameters of a task is 'tricky' and we're not doing + * the correct thing -- also see task_dead_dl() and switched_from_dl(). + * + * What we SHOULD do is delay the bandwidth release until the 0-lag + * point. This would include retaining the task_struct until that time + * and change dl_overflow() to not immediately decrement the current + * amount. + * + * Instead we retain the current runtime/deadline and let the new + * parameters take effect after the current reservation period lapses. + * This is safe (albeit pessimistic) because the 0-lag point is always + * before the current scheduling deadline. + * + * We can still have temporary overloads because we do not delay the + * change in bandwidth until that time; so admission control is + * not on the safe side. It does however guarantee tasks will never + * consume more than promised. + */ } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 81a267d5f287f6..a99ce981820474 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1111,6 +1111,7 @@ static void task_dead_dl(struct task_struct *p) * Since we are TASK_DEAD we won't slip out of the domain! 
*/ raw_spin_lock_irq(&dl_b->lock); + /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); @@ -1609,8 +1610,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p) static void switched_from_dl(struct rq *rq, struct task_struct *p) { + /* XXX we should retain the bw until 0-lag */ cancel_dl_timer(rq, p); - __dl_clear_params(p); /* From c4240a30ff640945b87dfebb0e930ea44c8ef78a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Nov 2014 08:44:03 +0800 Subject: [PATCH 088/420] sched/deadline: Fix stale yield state When we fail to start the deadline timer in update_curr_dl(), we forget to clear ->dl_yielded, resulting in wrecked time keeping. Since the natural place to clear both ->dl_yielded and ->dl_throttled is in replenish_dl_entity(); both are after all waiting for that event; make it so. Luckily since 67dfa1b756f2 ("sched/deadline: Implement cancel_dl_timer() to use in switched_from_dl()") the task_on_rq_queued() condition in dl_task_timer() must be true, and can therefore call enqueue_task_dl() unconditionally. Reported-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1416962647-76792-4-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 1019a359d3dc4b64d0e1e5a5efcb725d5e83994d) Signed-off-by: Punit Agrawal --- kernel/sched/deadline.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a99ce981820474..22b53ffa78ada8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } + + if (dl_se->dl_yielded) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; } /* @@ -536,23 +541,19 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - if (task_on_rq_queued(p)) { - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); #ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, - * check if we need to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) - push_dl_task(rq); + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); #endif - } unlock: raw_spin_unlock(&rq->lock); @@ -630,10 +631,9 @@ static void update_curr_dl(struct rq *rq) dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { + dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); - if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) - dl_se->dl_throttled = 1; - else + if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -870,7 +870,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. 
*/ - if (p->dl.dl_throttled) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) return; enqueue_dl_entity(&p->dl, pi_se, flags); From 192244a4b51ce9d17a1fa20168938b9271721a46 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 16 Feb 2015 15:38:34 +0300 Subject: [PATCH 089/420] sched/dl: Prevent enqueue of a sleeping task in dl_task_timer() A deadline task may be throttled and dequeued at the same time. This happens, when it becomes throttled in schedule(), which is called to go to sleep: current->state = TASK_INTERRUPTIBLE; schedule() deactivate_task() dequeue_task_dl() update_curr_dl() start_dl_timer() __dequeue_task_dl() prev->on_rq = 0; Later the timer fires, but the task is still dequeued: dl_task_timer() enqueue_task_dl() /* queues on dl_rq; on_rq remains 0 */ Someone wakes it up: try_to_wake_up() enqueue_dl_entity() BUG_ON(on_dl_rq()) Patch fixes this problem, it prevents queueing !on_rq tasks on dl_rq. Reported-by: Fengguang Wu Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) [ Wrote comment. ] Cc: Juri Lelli Fixes: 1019a359d3dc ("sched/deadline: Fix stale yield state") Link: http://lkml.kernel.org/r/1374601424090314@web4j.yandex.ru Signed-off-by: Ingo Molnar (cherry picked from commit a79ec89fd8459f0de850898f432a2a57d60e64de) Signed-off-by: Punit Agrawal --- kernel/sched/deadline.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 22b53ffa78ada8..90ffb8a7167f2a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -541,6 +541,26 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); + + /* + * If the throttle happened during sched-out; like: + * + * schedule() + * deactivate_task() + * dequeue_task_dl() + * update_curr_dl() + * start_dl_timer() + * __dequeue_task_dl() + * prev->on_rq = 0; + * + * We can be both throttled and !queued. Replenish the counter + * but do not enqueue -- wait for our wakeup to do that. + */ + if (!task_on_rq_queued(p)) { + replenish_dl_entity(dl_se, dl_se); + goto unlock; + } + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); From fa08e980c517811af2f75f9272be86334c34cc3b Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 27 Mar 2015 07:08:35 +0800 Subject: [PATCH 090/420] sched/deadline: Support DL task migration during CPU hotplug I observed that DL tasks can't be migrated to other CPUs during CPU hotplug, in addition, task may/may not be running again if CPU is added back. The root cause which I found is that DL tasks will be throtted and removed from the DL rq after comsuming all their budget, which leads to the situation that stop task can't pick them up from the DL rq and migrate them to other CPUs during hotplug. The method to reproduce: schedtool -E -t 50000:100000 -e ./test Actually './test' is just a simple for loop. Then observe which CPU the test task is on and offline it: echo 0 > /sys/devices/system/cpu/cpuN/online This patch adds the DL task migration during CPU hotplug by finding a most suitable later deadline rq after DL timer fires if current rq is offline. If it fails to find a suitable later deadline rq then it falls back to any eligible online CPU in so that the deadline task will come back to us, and the push/pull mechanism should then move it around properly. 
Suggested-and-Acked-by: Juri Lelli Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1427411315-4298-1-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar (cherry picked from commit fa9c9d10e97e38d9903fad1829535175ad261f45) Signed-off-by: Punit Agrawal --- kernel/sched/deadline.c | 57 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 90ffb8a7167f2a..4e0383ad787e3f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) rq->post_schedule = has_pushable_dl_tasks(rq); } +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); + +static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) +{ + struct rq *later_rq = NULL; + bool fallback = false; + + later_rq = find_lock_later_rq(p, rq); + + if (!later_rq) { + int cpu; + + /* + * If we cannot preempt any rq, fall back to pick any + * online cpu. + */ + fallback = true; + cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); + if (cpu >= nr_cpu_ids) { + /* + * Fail to find any suitable cpu. + * The task will never come back! + */ + BUG_ON(dl_bandwidth_enabled()); + + /* + * If admission control is disabled we + * try a little harder to let the task + * run. + */ + cpu = cpumask_any(cpu_active_mask); + } + later_rq = cpu_rq(cpu); + double_lock_balance(rq, later_rq); + } + + deactivate_task(rq, p, 0); + set_task_cpu(p, later_rq->cpu); + activate_task(later_rq, p, ENQUEUE_REPLENISH); + + if (!fallback) + resched_curr(later_rq); + + double_unlock_balance(rq, later_rq); +} + #else static inline @@ -542,6 +588,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); +#ifdef CONFIG_SMP + /* + * If we find that the rq the task was on is no longer + * available, we need to select a new rq. + */ + if (unlikely(!rq->online)) { + dl_task_offline_migration(rq, p); + goto unlock; + } +#endif + /* * If the throttle happened during sched-out; like: * From 6379aafb5fb5c74ec02fa6c2ccaf5440146a624b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:06 +0000 Subject: [PATCH 091/420] sched: deadline: Use hrtimer_start() hrtimer_start() does not longer defer already expired timers to the softirq. Get rid of the __hrtimer_start_range_ns() invocation. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.627353666@linutronix.de Signed-off-by: Thomas Gleixner (cherry picked from commit cc9684d3c1188ac5f1cf0ee9f8be7ba456099d7b) Signed-off-by: Punit Agrawal --- kernel/sched/deadline.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 4e0383ad787e3f..edaec487210524 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -503,8 +503,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); ktime_t now, act; - ktime_t soft, hard; - unsigned long range; s64 delta; if (boosted) @@ -527,15 +525,9 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) if (ktime_us_delta(act, now) < 0) return 0; - hrtimer_set_expires(&dl_se->dl_timer, act); + hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS); - soft = hrtimer_get_softexpires(&dl_se->dl_timer); - hard = hrtimer_get_expires(&dl_se->dl_timer); - range = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(&dl_se->dl_timer, soft, - range, HRTIMER_MODE_ABS, 0); - - return hrtimer_active(&dl_se->dl_timer); + return 1; } /* From 012ff5df7ecdc409c6502f92548a793a9bcdda24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Apr 2015 13:19:42 +0200 Subject: [PATCH 092/420] sched: Move the loadavg code to a more obvious location I could not find the loadavg code.. turns out it was hidden in a file called proc.c. It further got mingled up with the cruft per rq load indexes (which we really want to get rid of). Move the per rq load indexes into the fair.c load-balance code (that's the only thing that uses them) and rename proc.c to loadavg.c so we can find it again. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Paul Gortmaker Cc: Thomas Gleixner [ Did minor cleanups to the code. 
] Signed-off-by: Ingo Molnar (cherry picked from commit 3289bdb429884c0279bf9ab72dff7b934f19dfc6) Signed-off-by: Ricky Liang --- include/linux/sched.h | 5 + kernel/sched/Makefile | 2 +- kernel/sched/core.c | 7 +- kernel/sched/fair.c | 183 ++++++++++++++++++++++ kernel/sched/{proc.c => loadavg.c} | 236 +++-------------------------- kernel/sched/sched.h | 8 +- 6 files changed, 222 insertions(+), 219 deletions(-) rename kernel/sched/{proc.c => loadavg.c} (62%) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4169de53eae3a7..2048a4b51d5ea5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -174,7 +174,12 @@ extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void update_cpu_load_nohz(void); +#else +static inline void update_cpu_load_nohz(void) { } +#endif extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index ab32b7b0db5c6b..4b6ceef49530c6 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o proc.o clock.o cputime.o +obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b21b3025a6063..f0aab59dabd38f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2447,9 +2447,9 @@ unsigned long nr_iowait_cpu(int cpu) void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { - struct rq *this = this_rq(); - *nr_waiters = atomic_read(&this->nr_iowait); - *load = this->cpu_load[0]; + struct rq *rq = this_rq(); + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq->load.weight; } #ifdef CONFIG_SMP @@ -2547,6 +2547,7 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb8ce942c2082c..1e6d8d63eed78d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4068,6 +4068,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +/* + * per rq 'load' arrray crap; XXX kill this. + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. 
+ * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +static void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. 
+ */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + unsigned long load = this_rq->cfs.runnable_load_avg; + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, load, 1); +} + /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c similarity index 62% rename from kernel/sched/proc.c rename to kernel/sched/loadavg.c index 8ecd552fe4f222..ef7159012cf366 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/loadavg.c @@ -1,7 +1,9 @@ /* - * kernel/sched/proc.c + * kernel/sched/loadavg.c * - * Kernel load calculations, forked from sched/core.c + * This file contains the magic bits required to compute the global loadavg + * figure. Its a silly number but people think its important. We go through + * great pains to make it work on big machines and tickless kernels. */ #include @@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq) long nr_active, delta = 0; nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; + nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; @@ -186,6 +188,7 @@ void calc_load_enter_idle(void) delta = calc_load_fold_active(this_rq); if (delta) { int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); } } @@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) { unsigned long result = 1UL << frac_bits; - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; } return result; @@ -285,7 +290,6 @@ static unsigned long calc_load_n(unsigned long load, unsigned long exp, unsigned long active, unsigned int n) { - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); } @@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { } /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. + * + * Called from the global timer code. 
*/ void calc_global_load(unsigned long ticks) { @@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks) } /* - * Called from update_cpu_load() to periodically update this CPU's + * Called from scheduler_tick() to periodically update this CPU's * active count. */ -static void calc_load_account_active(struct rq *this_rq) +void calc_global_load_tick(struct rq *this_rq) { long delta; @@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq) this_rq->calc_load_update += LOAD_FREQ; } - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. 
- */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_SMP -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->cfs.runnable_load_avg; -} -#else -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->load.weight; -} -#endif - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = get_rq_runnable_load(this_rq); - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ */ - -/* - * Called from scheduler_tick() - */ -void update_cpu_load_active(struct rq *this_rq) -{ - unsigned long load = get_rq_runnable_load(this_rq); - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 
- */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); - - calc_load_account_active(this_rq); -} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5982352c6a8769..2930b78e9232a3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -25,8 +25,14 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); + +#ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); +#else +static inline void update_cpu_load_active(struct rq *this_rq) { } +#endif /* * Helpers for converting nanosecond timing to jiffy resolution @@ -1231,8 +1237,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void update_idle_cpu_load(struct rq *this_rq); - extern void init_task_runnable_average(struct task_struct *p); static inline void add_nr_running(struct rq *rq, unsigned count) From f9363a050dc138972fb81486c41b6f302cc990e9 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:20 -0700 Subject: [PATCH 093/420] sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to READ_ONCE()/WRITE_ONCE() ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes the rest of the existing usages of ACCESS_ONCE() in the scheduler, and use the new READ_ONCE() and WRITE_ONCE() APIs as appropriate. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Acked-by: Waiman Long Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1430251224-5764-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar (cherry picked from commit 316c1608d15c736439d4065ed12f306db554b3da) Signed-off-by: Ricky Liang --- include/linux/sched.h | 4 ++-- kernel/fork.c | 2 +- kernel/sched/auto_group.c | 2 +- kernel/sched/auto_group.h | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/cputime.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 18 +++++++++--------- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/wait.c | 4 ++-- kernel/time/posix-cpu-timers.c | 8 ++++---- 12 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2048a4b51d5ea5..4505143a78fe93 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3021,13 +3021,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm) static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); + return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); + return READ_ONCE(tsk->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) diff --git a/kernel/fork.c b/kernel/fork.c index d3a81415925cb3..bf4cfe95f3c50f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1054,7 +1054,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) /* Thread group counters. 
*/ thread_group_cputime_init(sig); - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); sig->cputimer.running = 1; diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 8a2e230fb86ad4..077976d30e8843 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -145,7 +145,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + if (!READ_ONCE(sysctl_sched_autogroup_enabled)) goto out; for_each_thread(p, t) diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 8bd047142816de..890c95f2587a4d 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0aab59dabd38f..6388e939a9c99f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -561,7 +561,7 @@ static bool set_nr_and_not_polling(struct task_struct *p) static bool set_nr_if_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); for (;;) { if (!(val & _TIF_POLLING_NRFLAG)) @@ -2576,7 +2576,7 @@ void scheduler_tick(void) u64 scheduler_tick_max_deferment(void) { struct rq *rq = this_rq(); - unsigned long next, now = ACCESS_ONCE(jiffies); + unsigned long next, now = READ_ONCE(jiffies); next = rq->last_sched_tick + HZ; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b293203ff2564d..bc285836e44c72 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -578,7 +578,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new) { cputime_t old; - while (new > (old = ACCESS_ONCE(*counter))) + while (new > (old = READ_ONCE(*counter))) cmpxchg_cputime(counter, old, new); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index edaec487210524..59833894b57d74 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1003,7 +1003,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If we are dealing with a -deadline task, we must diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e6d8d63eed78d..311ce9ac83615f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -832,7 +832,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { - unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); + unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; @@ -1588,7 +1588,7 @@ static void task_numa_placement(struct task_struct *p) u64 runtime, period; spinlock_t *group_lock = NULL; - seq = ACCESS_ONCE(p->mm->numa_scan_seq); + seq = 
READ_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; @@ -1723,7 +1723,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, } rcu_read_lock(); - tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + tsk = READ_ONCE(cpu_rq(cpu)->curr); if (!cpupid_match_pid(tsk, cpupid)) goto no_join; @@ -1905,7 +1905,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) static void reset_ptenuma_scan(struct task_struct *p) { - ACCESS_ONCE(p->mm->numa_scan_seq)++; + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); p->mm->numa_scan_offset = 0; } @@ -4196,7 +4196,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, */ static void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long curr_jiffies = READ_ONCE(jiffies); unsigned long load = this_rq->cfs.runnable_load_avg; unsigned long pending_updates; @@ -4218,7 +4218,7 @@ static void update_idle_cpu_load(struct rq *this_rq) void update_cpu_load_nohz(void) { struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long curr_jiffies = READ_ONCE(jiffies); unsigned long pending_updates; if (curr_jiffies == this_rq->last_load_update_tick) @@ -4298,7 +4298,7 @@ static unsigned long capacity_of(int cpu) static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); + unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -5946,8 +5946,8 @@ static unsigned long scale_rt_capacity(int cpu) * Since we're reading these variables without serialization make sure * we read them once before doing sanity checks on them. 
*/ - age_stamp = ACCESS_ONCE(rq->age_stamp); - avg = ACCESS_ONCE(rq->rt_avg); + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); delta = __rq_clock_broken(rq) - age_stamp; if (unlikely(delta < 0)) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 20bca398084ae7..6fba352232f92d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1311,7 +1311,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If the current task on @p's runqueue is an RT task, then diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2930b78e9232a3..a4d198da1c77b6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -676,7 +676,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static inline u64 __rq_clock_broken(struct rq *rq) { - return ACCESS_ONCE(rq->clock); + return READ_ONCE(rq->clock); } static inline u64 rq_clock(struct rq *rq) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5a62915f47a885..3f6ef481a1a596 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -535,7 +535,7 @@ EXPORT_SYMBOL(bit_wait_io); __sched int bit_wait_timeout(struct wait_bit_key *word) { - unsigned long now = ACCESS_ONCE(jiffies); + unsigned long now = READ_ONCE(jiffies); if (signal_pending_state(current->state, current)) return 1; if (time_after_eq(now, word->timeout)) @@ -547,7 +547,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); __sched int bit_wait_io_timeout(struct wait_bit_key *word) { - unsigned long now = ACCESS_ONCE(jiffies); + unsigned long now = READ_ONCE(jiffies); if (signal_pending_state(current->state, current)) return 1; if (time_after_eq(now, word->timeout)) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a16b67859e2a79..9e5b41dc56e805 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -852,10 +852,10 @@ static void check_thread_timers(struct task_struct *tsk, /* * Check for the special case thread timers. */ - soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); if (soft != RLIM_INFINITY) { unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { @@ -958,11 +958,11 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (soft != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); + READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); cputime_t x; if (psecs >= hard) { /* From a1f86eef2f2a65758ba02e86978cb0f17c5cf5a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:49 +0200 Subject: [PATCH 094/420] sched,dl: Fix sched class hopping CBS hole We still have a few pending issues with the deadline code, one of which is that switching between scheduling classes can 'leak' CBS state. Close the hole by retaining the current CBS state when leaving SCHED_DEADLINE and unconditionally programming the deadline timer. The timer will then reset the CBS state if the task is still !SCHED_DEADLINE by the time it hits. 
If the task left SCHED_DEADLINE it will not call task_dead_dl() and we'll not cancel the hrtimer, leaving us a pending timer in free space. Avoid this by giving the timer a task reference, this avoids littering the task exit path for this rather uncommon case. In order to do this, I had to move dl_task_offline_migration() below the replenishment, such that the task_rq()->lock fully covers that. While doing this, I noticed that it (was) buggy in assuming a task is enqueued and or we need to enqueue the task now. Fixing this means select_task_rq_dl() might encounter an offline rq -- look into that. As a result this kills cancel_dl_timer() which included a rq->lock break. Fixes: 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling") Cc: Wanpeng Li Cc: Luca Abeni Cc: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: oleg@redhat.com Cc: wanpeng.li@linux.intel.com Cc: Luca Abeni Cc: Juri Lelli Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.574192138@infradead.org Signed-off-by: Thomas Gleixner (cherry picked from commit a649f237db18450de767d70f40a41d5dbd0291de) Signed-off-by: Punit Agrawal --- kernel/sched/deadline.c | 152 +++++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 66 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 59833894b57d74..19a26186e074c7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -220,7 +220,7 @@ static inline void set_post_schedule(struct rq *rq) static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); -static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) +static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; bool fallback = false; @@ -254,14 +254,19 @@ static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) double_lock_balance(rq, later_rq); } + /* + * By now the task is replenished and enqueued; migrate it. + */ deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); - activate_task(later_rq, p, ENQUEUE_REPLENISH); + activate_task(later_rq, p, 0); if (!fallback) resched_curr(later_rq); - double_unlock_balance(rq, later_rq); + double_unlock_balance(later_rq, rq); + + return later_rq; } #else @@ -498,22 +503,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) +static int start_dl_timer(struct task_struct *p) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct sched_dl_entity *dl_se = &p->dl; + struct hrtimer *timer = &dl_se->dl_timer; + struct rq *rq = task_rq(p); ktime_t now, act; s64 delta; - if (boosted) - return 0; + lockdep_assert_held(&rq->lock); + /* * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. 
*/ act = ns_to_ktime(dl_se->deadline); - now = hrtimer_cb_get_time(&dl_se->dl_timer); + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -525,7 +531,19 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) if (ktime_us_delta(act, now) < 0) return 0; - hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS); + /* + * !enqueued will guarantee another callback; even if one is already in + * progress. This ensures a balanced {get,put}_task_struct(). + * + * The race against __run_timer() clearing the enqueued state is + * harmless because we're holding task_rq()->lock, therefore the timer + * expiring after we've done the check will wait on its task_rq_lock() + * and observe our state. + */ + if (!hrtimer_is_queued(timer)) { + get_task_struct(p); + hrtimer_start(timer, act, HRTIMER_MODE_ABS); + } return 1; } @@ -561,35 +579,40 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) } /* - * We need to take care of several possible races here: - * - * - the task might have changed its scheduling policy - * to something different than SCHED_DEADLINE - * - the task might have changed its reservation parameters - * (through sched_setattr()) - * - the task might have been boosted by someone else and - * might be in the boosting/deboosting path + * The task might have changed its scheduling policy to something + * different than SCHED_DEADLINE (through switched_fromd_dl()). + */ + if (!dl_task(p)) { + __dl_clear_params(p); + goto unlock; + } + + /* + * This is possible if switched_from_dl() raced against a running + * callback that took the above !dl_task() path and we've since then + * switched back into SCHED_DEADLINE. * - * In all this cases we bail out, as the task is already - * in the runqueue or is going to be enqueued back anyway. + * There's nothing to do except drop our task reference. */ - if (!dl_task(p) || dl_se->dl_new || - dl_se->dl_boosted || !dl_se->dl_throttled) + if (dl_se->dl_new) goto unlock; - sched_clock_tick(); - update_rq_clock(rq); + /* + * The task might have been boosted by someone else and might be in the + * boosting/deboosting path, its not throttled. + */ + if (dl_se->dl_boosted) + goto unlock; -#ifdef CONFIG_SMP /* - * If we find that the rq the task was on is no longer - * available, we need to select a new rq. + * Spurious timer due to start_dl_timer() race; or we already received + * a replenishment from rt_mutex_setprio(). */ - if (unlikely(!rq->online)) { - dl_task_offline_migration(rq, p); + if (!dl_se->dl_throttled) goto unlock; - } -#endif + + sched_clock_tick(); + update_rq_clock(rq); /* * If the throttle happened during sched-out; like: @@ -615,17 +638,38 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); + #ifdef CONFIG_SMP /* - * Queueing this task back might have overloaded rq, - * check if we need to kick someone away. + * Perform balancing operations here; after the replenishments. We + * cannot drop rq->lock before this, otherwise the assertion in + * start_dl_timer() about not missing updates is not true. + * + * If we find that the rq the task was on is no longer available, we + * need to select a new rq. + * + * XXX figure out if select_task_rq_dl() deals with offline cpus. + */ + if (unlikely(!rq->online)) + rq = dl_task_offline_migration(rq, p); + + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. 
*/ if (has_pushable_dl_tasks(rq)) push_dl_task(rq); #endif + unlock: raw_spin_unlock(&rq->lock); + /* + * This can free the task_struct, including this hrtimer, do not touch + * anything related to that after this. + */ + put_task_struct(p); + return HRTIMER_NORESTART; } @@ -702,7 +746,7 @@ static void update_curr_dl(struct rq *rq) if (dl_runtime_exceeded(rq, dl_se)) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) + if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -1173,7 +1217,6 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { - struct hrtimer *timer = &p->dl.dl_timer; struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); /* @@ -1183,8 +1226,6 @@ static void task_dead_dl(struct task_struct *p) /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); - - hrtimer_cancel(timer); } static void set_curr_task_dl(struct rq *rq) @@ -1651,37 +1692,16 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ -/* - * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. - */ -static void cancel_dl_timer(struct rq *rq, struct task_struct *p) -{ - struct hrtimer *dl_timer = &p->dl.dl_timer; - - /* Nobody will change task's class if pi_lock is held */ - lockdep_assert_held(&p->pi_lock); - - if (hrtimer_active(dl_timer)) { - int ret = hrtimer_try_to_cancel(dl_timer); - - if (unlikely(ret == -1)) { - /* - * Note, p may migrate OR new deadline tasks - * may appear in rq when we are unlocking it. - * A caller of us must be fine with that. - */ - raw_spin_unlock(&rq->lock); - hrtimer_cancel(dl_timer); - raw_spin_lock(&rq->lock); - } - } -} - static void switched_from_dl(struct rq *rq, struct task_struct *p) { - /* XXX we should retain the bw until 0-lag */ - cancel_dl_timer(rq, p); - __dl_clear_params(p); + /* + * Start the deadline timer; if we switch back to dl before this we'll + * continue consuming our current CBS slice. If we stay outside of + * SCHED_DEADLINE until the deadline passes, the timer will reset the + * task. + */ + if (!start_dl_timer(p)) + __dl_clear_params(p); /* * Since this might be the only -deadline task on the rq, From 71a2f52d7164586f22b21a331291ab6e34d915b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:50 +0200 Subject: [PATCH 095/420] sched: Move code around In preparation to reworking set_cpus_allowed_ptr() move some code around. This also removes some superfluous #ifdefs and adds comments to some #endifs. 
text data bss dec hex filename 12211532 1738144 1081344 15031020 e55aec defconfig-build/vmlinux.pre 12211532 1738144 1081344 15031020 e55aec defconfig-build/vmlinux.post Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: oleg@redhat.com Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.662086684@infradead.org Signed-off-by: Thomas Gleixner (cherry picked from commit 5cc389bcee088b72c8c34a01d596412cab4f3f78) Signed-off-by: Ricky Liang --- kernel/sched/core.c | 364 ++++++++++++++++++++++---------------------- 1 file changed, 178 insertions(+), 186 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6388e939a9c99f..1529a3583d04f1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1050,6 +1050,180 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. + */ + +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. + */ +static struct rq *move_queued_task(struct task_struct *p, int new_cpu) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + +struct migration_arg { + struct task_struct *task; + int dest_cpu; +}; + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + * + * Returns non-zero if task was successfully migrated. + */ +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ + struct rq *rq; + int ret = 0; + + if (unlikely(!cpu_active(dest_cpu))) + return ret; + + rq = cpu_rq(src_cpu); + + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto done; + + /* Affinity changed (again). */ + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + goto fail; + + /* + * If we're not on a rq, the next wake-up will ensure we're + * placed properly. + */ + if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); +done: + ret = 1; +fail: + raw_spin_unlock(&rq->lock); + raw_spin_unlock(&p->pi_lock); + return ret; +} + +/* + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. 
+ */ +static int migration_cpu_stop(void *data) +{ + struct migration_arg *arg = data; + + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ + local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. + */ + sched_ttwu_pending(); + __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); + local_irq_enable(); + return 0; +} + +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); +} + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + unsigned long flags; + struct rq *rq; + unsigned int dest_cpu; + int ret = 0; + + rq = task_rq_lock(p, &flags); + + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; + + if (!cpumask_intersects(new_mask, cpu_active_mask)) { + ret = -EINVAL; + goto out; + } + + do_set_cpus_allowed(p, new_mask); + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask)) + goto out; + + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (task_running(rq, p) || p->state == TASK_WAKING) { + struct migration_arg arg = { p, dest_cpu }; + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, p, &flags); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + return 0; + } else if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); +out: + task_rq_unlock(rq, p, &flags); + + return ret; +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -1190,13 +1364,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) return ret; } -struct migration_arg { - struct task_struct *task; - int dest_cpu; -}; - -static int migration_cpu_stop(void *data); - /* * wait_task_inactive - wait for a thread to unschedule. * @@ -1329,9 +1496,7 @@ void kick_process(struct task_struct *p) preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); -#endif /* CONFIG_SMP */ -#ifdef CONFIG_SMP /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ @@ -1435,7 +1600,7 @@ static void update_avg(u64 *avg, u64 sample) s64 diff = sample - *avg; *avg += diff >> 3; } -#endif +#endif /* CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -4686,149 +4851,6 @@ void init_idle(struct task_struct *idle, int cpu) } #ifdef CONFIG_SMP -/* - * move_queued_task - move a queued task to new rq. - * - * Returns (locked) new rq. Old rq's lock is released. 
- */ -static struct rq *move_queued_task(struct task_struct *p, int new_cpu) -{ - struct rq *rq = task_rq(p); - - lockdep_assert_held(&rq->lock); - - dequeue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_MIGRATING; - set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); - - rq = cpu_rq(new_cpu); - - raw_spin_lock(&rq->lock); - BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; - enqueue_task(rq, p, 0); - check_preempt_curr(rq, p, 0); - - return rq; -} - -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - - cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); -} - -/* - * This is how migration works: - * - * 1) we invoke migration_cpu_stop() on the target CPU using - * stop_one_cpu(). - * 2) stopper starts to run (implicitly forcing the migrated thread - * off the CPU) - * 3) it checks whether the migrated task is still in the wrong runqueue. - * 4) if it's in the wrong runqueue then the migration thread removes - * it and puts it into the right queue. - * 5) stopper completes and stop_one_cpu() returns and the migration - * is done. - */ - -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. - */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - unsigned long flags; - struct rq *rq; - unsigned int dest_cpu; - int ret = 0; - - rq = task_rq_lock(p, &flags); - - if (cpumask_equal(&p->cpus_allowed, new_mask)) - goto out; - - if (!cpumask_intersects(new_mask, cpu_active_mask)) { - ret = -EINVAL; - goto out; - } - - do_set_cpus_allowed(p, new_mask); - - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; - - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); - return 0; - } else if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -out: - task_rq_unlock(rq, p, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -/* - * Move (not current) task off this cpu, onto dest cpu. We're doing - * this because either it can't run here any more (set_cpus_allowed() - * away from this CPU, or CPU going down), or because we're - * attempting to rebalance this task on exec (sched_exec). - * - * So we race with normal scheduler movements, but that's OK, as long - * as the task is no longer on this CPU. - * - * Returns non-zero if task was successfully migrated. - */ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) -{ - struct rq *rq; - int ret = 0; - - if (unlikely(!cpu_active(dest_cpu))) - return ret; - - rq = cpu_rq(src_cpu); - - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - /* Already moved. */ - if (task_cpu(p) != src_cpu) - goto done; - - /* Affinity changed (again). 
*/ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - goto fail; - - /* - * If we're not on a rq, the next wake-up will ensure we're - * placed properly. - */ - if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -done: - ret = 1; -fail: - raw_spin_unlock(&rq->lock); - raw_spin_unlock(&p->pi_lock); - return ret; -} #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ @@ -4876,35 +4898,9 @@ void sched_setnuma(struct task_struct *p, int nid) enqueue_task(rq, p, 0); task_rq_unlock(rq, p, &flags); } -#endif - -/* - * migration_cpu_stop - this will be executed by a highprio stopper thread - * and performs thread migration by bumping thread off CPU then - * 'pushing' onto another runqueue. - */ -static int migration_cpu_stop(void *data) -{ - struct migration_arg *arg = data; - - /* - * The original target cpu might have gone down and we might - * be on another cpu but it doesn't matter. - */ - local_irq_disable(); - /* - * We need to explicitly wake pending tasks before running - * __migrate_task() such that we will not miss enforcing cpus_allowed - * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. - */ - sched_ttwu_pending(); - __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); - local_irq_enable(); - return 0; -} +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_HOTPLUG_CPU - /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -5007,7 +5003,6 @@ static void migrate_tasks(unsigned int dead_cpu) rq->stop = stop; } - #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5186,7 +5181,7 @@ static void register_sched_domain_sysctl(void) static void unregister_sched_domain_sysctl(void) { } -#endif +#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ static void set_rq_online(struct rq *rq) { @@ -5356,9 +5351,6 @@ static int __init migration_init(void) return 0; } early_initcall(migration_init); -#endif - -#ifdef CONFIG_SMP static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ @@ -6545,7 +6537,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) struct sched_group *sg; struct sched_group_capacity *sgc; - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sd) return -ENOMEM; From 4ab5172a321d61236d7bb5f456f6745e1e9dc404 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 2 Jul 2015 22:25:52 +0800 Subject: [PATCH 096/420] sched/fair: Clean up the __sched_period() code Since commit: 4bf0b77158 ("sched: remove do_div() from __sched_slice()") ... the logic of __sched_period() can be implemented as a single if-else without any local variables, so this patch cleans it up with an if-else statement, which expresses the function's logic straightforwardly. 
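As a quick illustrative check (assuming the stock, unscaled tunables of sysctl_sched_latency = 6 ms, sysctl_sched_min_granularity = 0.75 ms and sched_nr_latency = 8; the real values are scaled by CPU count at boot, so they will differ on most machines):

  __sched_period(4)  = sysctl_sched_latency              = 6 ms
  __sched_period(16) = 16 * sysctl_sched_min_granularity = 12 ms

which matches what the old local-variable version computed.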
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435847152-29543-1-git-send-email-boqun.feng@gmail.com Signed-off-by: Ingo Molnar (cherry picked from commit 8e2b0bf397279878babcb39b021edcafe7c945eb) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 311ce9ac83615f..3a3d5bad58c9aa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -616,15 +616,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) */ static u64 __sched_period(unsigned long nr_running) { - u64 period = sysctl_sched_latency; - unsigned long nr_latency = sched_nr_latency; - - if (unlikely(nr_running > nr_latency)) { - period = sysctl_sched_min_granularity; - period *= nr_running; - } - - return period; + if (unlikely(nr_running > sched_nr_latency)) + return nr_running * sysctl_sched_min_granularity; + else + return sysctl_sched_latency; } /* From a78662a5a69df703509e2dccbcd39d5a286824f3 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 6 Jul 2015 21:51:02 +0900 Subject: [PATCH 097/420] sched/fair: Fix a comment reflecting function name change update_cfs_rq_load_contribution() was changed to __update_cfs_rq_tg_load_contrib() - sync up the commit in calc_tg_weight() too. Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1436187062-19658-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry picked from commit 399595f248cb25dccb6044b53c47c44c174dc23d) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a3d5bad58c9aa..186f582dac763d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2116,7 +2116,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) /* * Use this CPU's actual weight instead of the last load_contribution * to gain a more accurate current total weight. See - * update_cfs_rq_load_contribution(). + * __update_cfs_rq_tg_load_contrib(). */ tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_contrib; From bbe04eba39fcab9f6ebcd80bbd067584bd68abdb Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 14 Jul 2015 17:39:50 +0200 Subject: [PATCH 098/420] sched/fair: Beef up wake_wide() Josef Bacik reported that Facebook sees better performance with their 1:N load (1 dispatch/node, N workers/node) when carrying an old patch to try very hard to wake to an idle CPU. While looking at wake_wide(), I noticed that it doesn't pay attention to the wakeup of a many partner waker, returning 1 only when waking one of its many partners. Correct that, letting explicit domain flags override the heuristic. While at it, adjust task_struct bits, we don't need a 64-bit counter. Tested-by: Josef Bacik Signed-off-by: Mike Galbraith [ Tidy things up. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team Cc: morten.rasmussen@arm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1436888390.7983.49.camel@gmail.com Signed-off-by: Ingo Molnar (cherry picked from commit 63b0e9edceec10fa41ec33393a1515a5ff444277) Signed-off-by: Punit Agrawal --- include/linux/sched.h | 4 +-- kernel/sched/fair.c | 67 +++++++++++++++++++++---------------------- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4505143a78fe93..2669f5109ea6f0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1247,9 +1247,9 @@ struct task_struct { #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; - struct task_struct *last_wakee; - unsigned long wakee_flips; + unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; int wake_cpu; #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 186f582dac763d..fb5cfb6a3241bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4457,26 +4457,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * A waker of many should wake a different task than the one last awakened + * at a frequency roughly N times higher than one of its wakees. In order + * to determine whether we should let the load spread vs consolodating to + * shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of lls_size higher frequency in the other. With + * both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. Waker/wakee + * being client/server, worker/dispatcher, interrupt source or whatever is + * irrelevant, spread criteria is apparent partner count exceeds socket size. + */ static int wake_wide(struct task_struct *p) { + unsigned int master = current->wakee_flips; + unsigned int slave = p->wakee_flips; int factor = this_cpu_read(sd_llc_size); - /* - * Yeah, it's the switching-frequency, could means many wakee or - * rapidly switch, use factor here will just help to automatically - * adjust the loose-degree, so bigger node will lead to more pull. - */ - if (p->wakee_flips > factor) { - /* - * wakee is somewhat hot, it needs certain amount of cpu - * resource, so if waker is far more hot, prefer to leave - * it alone. - */ - if (current->wakee_flips > (factor * p->wakee_flips)) - return 1; - } - - return 0; + if (master < slave) + swap(master, slave); + if (slave < factor || master < slave * factor) + return 0; + return 1; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) @@ -4488,13 +4491,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; - /* - * If we wake multiple tasks be careful to not bounce - * ourselves around too much. 
- */ - if (wake_wide(p)) - return 0; - idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); @@ -4721,7 +4717,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int new_cpu = cpu; + int new_cpu = prev_cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; @@ -4729,12 +4725,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) - want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) - continue; + break; /* * If both cpu and prev_cpu are part of this domain, @@ -4748,17 +4744,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (tmp->flags & sd_flag) sd = tmp; + else if (!want_affine) + break; } - if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - new_cpu = select_idle_sibling(p, prev_cpu); - goto unlock; + if (affine_sd) { + sd = NULL; /* Prefer wake_affine over balance flags */ + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + new_cpu = cpu; } - while (sd) { + if (!sd) { + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + new_cpu = select_idle_sibling(p, new_cpu); + + } else while (sd) { struct sched_group *group; int weight; @@ -4792,7 +4792,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ } -unlock: rcu_read_unlock(); return new_cpu; From f4a3b945cf2665a0f6772e3c542f50d2611b1e78 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Tue, 16 Dec 2014 23:58:29 +0800 Subject: [PATCH 099/420] sched/fair: Fix the dealing with decay_count in __synchronize_entity_decay() In __synchronize_entity_decay(), if "decays" happens to be zero, se->avg.decay_count will not be zeroed, holding the positive value assigned when dequeued last time. This is problematic in the following case: If this runnable task is CFS-balanced to other CPUs soon afterwards, migrate_task_rq_fair() will treat it as a blocked task due to its non-zero decay_count, thereby adding its load to cfs_rq->removed_load wrongly. Thus, we must zero se->avg.decay_count in this case as well. 
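A sketch of the problematic sequence, using hypothetical counter values purely for illustration:

  dequeue:   se->avg.decay_count = cfs_rq->decay_counter     (say, 100)
  enqueue    (within the same decay window):
             decays = 100 - 100 = 0
             old code returns 0 before clearing decay_count, so it stays 100
  migration: migrate_task_rq_fair() sees decay_count != 0, treats the
             runnable task as blocked and adds its load_avg_contrib to
             cfs_rq->removed_load

Clearing decay_count before the early return, as done below, closes that window.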
Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1418745509-2609-1-git-send-email-pang.xunlei@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fb5cfb6a3241bf..0ddd5ed28cbd9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2385,11 +2385,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; + se->avg.decay_count = 0; if (!decays) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.decay_count = 0; return decays; } From 02189e12e7936e4038a7112d25a94b8036711081 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:04 +0100 Subject: [PATCH 100/420] sched: Add sched_avg::utilization_avg_contrib Add new statistics which reflect the average time a task is running on the CPU and the sum of these running time of the tasks on a runqueue. The latter is named utilization_load_avg. This patch is based on the usage metric that was proposed in the 1st versions of the per-entity load tracking patchset by Paul Turner but that has be removed afterwards. This version differs from the original one in the sense that it's not linked to task_group. The rq's utilization_load_avg will be used to check if a rq is overloaded or not instead of trying to compute how many tasks a group of CPUs can handle. Rename runnable_avg_period into avg_period as it is now used with both runnable_avg_sum and running_avg_sum. Add some descriptions of the variables to explain their differences. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Paul Turner Cc: Ben Segall Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 21 +++++++++--- kernel/sched/debug.c | 10 ++++-- kernel/sched/fair.c | 74 +++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 8 ++++- 4 files changed, 89 insertions(+), 24 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2669f5109ea6f0..d7c1a7ac4756fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1077,15 +1077,28 @@ struct load_weight { }; struct sched_avg { + u64 last_runnable_update; + s64 decay_count; + /* + * utilization_avg_contrib describes the amount of time that a + * sched_entity is running on a CPU. It is based on running_avg_sum + * and is scaled in the range [0..SCHED_LOAD_SCALE]. + * load_avg_contrib described the amount of time that a sched_entity + * is runnable on a rq. It is based on both runnable_avg_sum and the + * weight of the task. + */ + unsigned long load_avg_contrib, utilization_avg_contrib; /* * These sums represent an infinite geometric series and so are bound * above by 1024/(1-y). Thus we only need a u32 to store them for all * choices of y < 1-2^(-32)*1024. + * running_avg_sum reflects the time that the sched_entity is + * effectively running on the CPU. 
+ * runnable_avg_sum represents the amount of time a sched_entity is on + * a runqueue which includes the running time that is monitored by + * running_avg_sum. */ - u32 runnable_avg_sum, runnable_avg_period; - u64 last_runnable_update; - s64 decay_count; - unsigned long load_avg_contrib; + u32 runnable_avg_sum, avg_period, running_avg_sum; }; #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ce33780d8f2012..f384452ff589c2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group if (!se) { struct sched_avg *avg = &cpu_rq(cpu)->avg; P(avg->runnable_avg_sum); - P(avg->runnable_avg_period); + P(avg->avg_period); return; } @@ -94,7 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); - P(se->avg.runnable_avg_period); + P(se->avg.avg_period); P(se->avg.load_avg_contrib); P(se->avg.decay_count); #endif @@ -214,6 +214,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", + cfs_rq->utilization_load_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); @@ -628,8 +630,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP P(se.avg.runnable_avg_sum); - P(se.avg.runnable_avg_period); + P(se.avg.running_avg_sum); + P(se.avg.avg_period); P(se.avg.load_avg_contrib); + P(se.avg.utilization_avg_contrib); P(se.avg.decay_count); #endif P(policy); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0ddd5ed28cbd9b..97d680264ab9e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -665,6 +665,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); +static inline void __update_task_entity_utilization(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ void init_task_runnable_average(struct task_struct *p) @@ -672,9 +673,10 @@ void init_task_runnable_average(struct task_struct *p) u32 slice; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = slice; - p->se.avg.runnable_avg_period = slice; + p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; + p->se.avg.avg_period = slice; __update_task_entity_contrib(&p->se); + __update_task_entity_utilization(&p->se); } #else void init_task_runnable_average(struct task_struct *p) @@ -1565,7 +1567,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) *period = now - p->last_task_numa_placement; } else { delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.runnable_avg_period; + *period = p->se.avg.avg_period; } p->last_sum_exec_runtime = runtime; @@ -2311,7 +2313,8 @@ static u32 __compute_runnable_contrib(u64 n) */ static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, - int runnable) + int runnable, + int running) { u64 delta, periods; u32 runnable_contrib; @@ -2337,7 +2340,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->last_runnable_update = now; /* delta_w is the amount already accumulated against our next 
period */ - delta_w = sa->runnable_avg_period % 1024; + delta_w = sa->avg_period % 1024; if (delta + delta_w >= 1024) { /* period roll-over */ decayed = 1; @@ -2350,7 +2353,9 @@ static __always_inline int __update_entity_runnable_avg(u64 now, delta_w = 1024 - delta_w; if (runnable) sa->runnable_avg_sum += delta_w; - sa->runnable_avg_period += delta_w; + if (running) + sa->running_avg_sum += delta_w; + sa->avg_period += delta_w; delta -= delta_w; @@ -2360,20 +2365,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); - sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + sa->running_avg_sum = decay_load(sa->running_avg_sum, + periods + 1); + sa->avg_period = decay_load(sa->avg_period, periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); if (runnable) sa->runnable_avg_sum += runnable_contrib; - sa->runnable_avg_period += runnable_contrib; + if (running) + sa->running_avg_sum += runnable_contrib; + sa->avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ if (runnable) sa->runnable_avg_sum += delta; - sa->runnable_avg_period += delta; + if (running) + sa->running_avg_sum += delta; + sa->avg_period += delta; return decayed; } @@ -2390,6 +2401,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.utilization_avg_contrib = + decay_load(se->avg.utilization_avg_contrib, decays); return decays; } @@ -2425,7 +2438,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->runnable_avg_period + 1); + sa->avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { @@ -2478,7 +2491,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable, + runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2496,7 +2510,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.runnable_avg_period + 1); + contrib /= (se->avg.avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); } @@ -2515,6 +2529,27 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } + +static inline void __update_task_entity_utilization(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); + contrib /= (se->avg.avg_period + 1); + se->avg.utilization_avg_contrib = scale_load(contrib); +} + +static long __update_entity_utilization_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.utilization_avg_contrib; + + if (entity_is_task(se)) + __update_task_entity_utilization(se); + + return se->avg.utilization_avg_contrib - old_contrib; +} + static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, long 
load_contrib) { @@ -2531,7 +2566,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta; + long contrib_delta, utilization_delta; u64 now; /* @@ -2543,18 +2578,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + cfs_rq->curr == se)) return; contrib_delta = __update_entity_load_avg_contrib(se); + utilization_delta = __update_entity_utilization_avg_contrib(se); if (!update_cfs_rq) return; - if (se->on_rq) + if (se->on_rq) { cfs_rq->runnable_load_avg += contrib_delta; - else + cfs_rq->utilization_load_avg += utilization_delta; + } else { subtract_blocked_load_contrib(cfs_rq, -contrib_delta); + } } /* @@ -2629,6 +2668,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; /* we force update consideration on load-balancer moves */ update_cfs_rq_blocked_load(cfs_rq, !wakeup); } @@ -2647,6 +2687,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; if (sleep) { cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); @@ -2985,6 +3026,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a4d198da1c77b6..e69afaf49e8eba 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -349,8 +349,14 @@ struct cfs_rq { * Under CFS, load is tracked on a per-entity basis and aggregated up. * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). + * runnable_load_avg is the sum of the load_avg_contrib of the + * sched_entities on the rq. + * blocked_load_avg is similar to runnable_load_avg except that its + * the blocked sched_entities on the rq. + * utilization_load_avg is the sum of the average running time of the + * sched_entities on the rq. */ - unsigned long runnable_load_avg, blocked_load_avg; + unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; atomic64_t decay_counter; u64 last_decay; atomic_long_t removed_load; From c410e246b6bed4dd1ef3f7f9e97fce523ba5394a Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Mar 2015 08:46:26 +0100 Subject: [PATCH 101/420] sched: Make sched entity usage tracking scale-invariant Apply frequency scale-invariance correction factor to usage tracking. Each segment of the running_avg_sum geometric series is now scaled by the current frequency so the utilization_avg_contrib of each entity will be invariant with frequency scaling. As a result, utilization_load_avg which is the sum of utilization_avg_contrib, becomes invariant too. So the usage level that is returned by get_cpu_usage(), stays relative to the max frequency as the cpu_capacity which is is compared against. 
Then, we want the keep the load tracking values in a 32-bit type, which implies that the max value of {runnable|running}_avg_sum must be lower than 2^32/88761=48388 (88761 is the max weigth of a task). As LOAD_AVG_MAX = 47742, arch_scale_freq_capacity() must return a value less than (48388/47742) << SCHED_CAPACITY_SHIFT = 1037 (SCHED_SCALE_CAPACITY = 1024). So we define the range to [0..SCHED_SCALE_CAPACITY] in order to avoid overflow. Signed-off-by: Morten Rasmussen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Turner Cc: Ben Segall Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425455186-13451-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 97d680264ab9e9..f8365b538453b1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2283,6 +2283,8 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2311,7 +2313,7 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, +static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, struct sched_avg *sa, int runnable, int running) @@ -2319,6 +2321,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; + unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_runnable_update; /* @@ -2354,7 +2357,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, if (runnable) sa->runnable_avg_sum += delta_w; if (running) - sa->running_avg_sum += delta_w; + sa->running_avg_sum += delta_w * scale_freq + >> SCHED_CAPACITY_SHIFT; sa->avg_period += delta_w; delta -= delta_w; @@ -2375,7 +2379,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, if (runnable) sa->runnable_avg_sum += runnable_contrib; if (running) - sa->running_avg_sum += runnable_contrib; + sa->running_avg_sum += runnable_contrib * scale_freq + >> SCHED_CAPACITY_SHIFT; sa->avg_period += runnable_contrib; } @@ -2383,7 +2388,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, if (runnable) sa->runnable_avg_sum += delta; if (running) - sa->running_avg_sum += delta; + sa->running_avg_sum += delta * scale_freq + >> SCHED_CAPACITY_SHIFT; sa->avg_period += delta; return decayed; @@ -2491,8 +2497,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable, - runnable); + __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, + runnable, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2567,6 +2573,7 @@ static inline 
void update_entity_load_avg(struct sched_entity *se, { struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta, utilization_delta; + int cpu = cpu_of(rq_of(cfs_rq)); u64 now; /* @@ -2578,7 +2585,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, cfs_rq->curr == se)) return; From f3ea47f91e683b0d565c82bc2edc66465f19336d Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:36 +0800 Subject: [PATCH 102/420] sched/fair: Remove rq's runnable avg The current rq->avg is not used at all since its merge into the kernel, and the code is in the scheduler's hot path, so remove it. Tested-by: Dietmar Eggemann Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-2-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit cd126afe838d7ea9b971cdea087fd498a7293c7f) Signed-off-by: Ricky Liang --- kernel/sched/debug.c | 7 +------ kernel/sched/fair.c | 25 ++++--------------------- kernel/sched/sched.h | 2 -- 3 files changed, 5 insertions(+), 29 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f384452ff589c2..eea37e7d08c26d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) - if (!se) { - struct sched_avg *avg = &cpu_rq(cpu)->avg; - P(avg->runnable_avg_sum); - P(avg->avg_period); + if (!se) return; - } - PN(se->exec_start); PN(se->vruntime); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f8365b538453b1..a4098fbcc83588 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2495,19 +2495,12 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) } } -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, - runnable, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) @@ -2708,7 +2701,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, */ void idle_enter_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 1); } /* @@ -2718,7 +2710,6 @@ void idle_enter_fair(struct rq *this_rq) */ void idle_exit_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 0); } static int idle_balance(struct rq *this_rq); @@ -2727,7 +2718,6 @@ static int idle_balance(struct rq *this_rq); static inline void update_entity_load_avg(struct sched_entity *se, int 
update_cfs_rq) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) {} @@ -4043,10 +4033,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_entity_load_avg(se, 1); } - if (!se) { - update_rq_runnable_avg(rq, rq->nr_running); + if (!se) add_nr_running(rq, 1); - } + hrtick_update(rq); } @@ -4104,10 +4093,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_entity_load_avg(se, 1); } - if (!se) { + if (!se) sub_nr_running(rq, 1); - update_rq_runnable_avg(rq, 1); - } + hrtick_update(rq); } @@ -5776,9 +5764,6 @@ static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) */ if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) list_del_leaf_cfs_rq(cfs_rq); - } else { - struct rq *rq = rq_of(cfs_rq); - update_rq_runnable_avg(rq, rq->nr_running); } } @@ -7756,8 +7741,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (numabalancing_enabled) task_tick_numa(rq, curr); - - update_rq_runnable_avg(rq, 1); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e69afaf49e8eba..9a28f6118fcaff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -565,8 +565,6 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; - - struct sched_avg avg; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* From 2838fe27a6922e134e8b3823b319fb86b47b1c02 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 27 Feb 2015 16:54:05 +0100 Subject: [PATCH 103/420] sched: Track group sched_entity usage contributions Add usage contribution tracking for group entities. Unlike se->avg.load_avg_contrib, se->avg.utilization_avg_contrib for group entities is the sum of se->avg.utilization_avg_contrib for all entities on the group runqueue. It is _not_ influenced in any way by the task group h_load. Hence it is representing the actual cpu usage of the group, not its intended load contribution which may differ significantly from the utilization on lightly utilized systems. 
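[Editor's note] A rough standalone illustration of that distinction, using toy numbers that are assumptions for this sketch rather than values from the patch: a group with two tasks each running about a quarter of the time reports half a CPU of usage regardless of its shares, while its load contribution (which follows tg_load_contrib * shares / tg->load_avg) can be far smaller on a lightly loaded system.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

/* Toy numbers for illustration only; in the kernel these come from the
 * per-entity load tracking machinery, not from constants like these. */
int main(void)
{
	/* two tasks in the group, each running ~25% of the time */
	unsigned long group_usage = SCHED_LOAD_SCALE / 4 + SCHED_LOAD_SCALE / 4;

	/* group-entity load contribution ~ tg_load_contrib * shares / tg->load_avg */
	unsigned long tg_load_avg = 8192, tg_load_contrib = 512, shares = 1024;
	unsigned long load_contrib = tg_load_contrib * shares / (tg_load_avg + 1);

	printf("group usage:             %lu / %lu\n", group_usage, SCHED_LOAD_SCALE);
	printf("group load contribution: %lu / %lu\n", load_contrib, SCHED_LOAD_SCALE);
	return 0;
}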
Signed-off-by: Morten Rasmussen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Turner Cc: Ben Segall Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 ++ kernel/sched/fair.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index eea37e7d08c26d..e730fd648825b0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -89,8 +89,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); + P(se->avg.running_avg_sum); P(se->avg.avg_period); P(se->avg.load_avg_contrib); + P(se->avg.utilization_avg_contrib); P(se->avg.decay_count); #endif #undef PN diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a4098fbcc83588..b5dfa994a6f3c1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2545,6 +2545,9 @@ static long __update_entity_utilization_avg_contrib(struct sched_entity *se) if (entity_is_task(se)) __update_task_entity_utilization(se); + else + se->avg.utilization_avg_contrib = + group_cfs_rq(se)->utilization_load_avg; return se->avg.utilization_avg_contrib - old_contrib; } From 40bd2e2d34608aaaf8af5624bd5c1c24a2cd5895 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 4 Mar 2015 08:48:47 +0100 Subject: [PATCH 104/420] sched: Calculate CPU's usage statistic and put it into struct sg_lb_stats::group_usage Monitor the usage level of each group of each sched_domain level. The usage is the portion of cpu_capacity_orig that is currently used on a CPU or group of CPUs. We use the utilization_load_avg to evaluate the usage level of each group. The utilization_load_avg only takes into account the running time of the CFS tasks on a CPU with a maximum value of SCHED_LOAD_SCALE when the CPU is fully utilized. Nevertheless, we must cap utilization_load_avg which can be temporally greater than SCHED_LOAD_SCALE after the migration of a task on this CPU and until the metrics are stabilized. The utilization_load_avg is in the range [0..SCHED_LOAD_SCALE] to reflect the running load on the CPU whereas the available capacity for the CFS task is in the range [0..cpu_capacity_orig]. In order to test if a CPU is fully utilized by CFS tasks, we have to scale the utilization in the cpu_capacity_orig range of the CPU to get the usage of the latter. The usage can then be compared with the available capacity (ie cpu_capacity) to deduct the usage level of a CPU. The frequency scaling invariance of the usage is not taken into account in this patch, it will be solved in another patch which will deal with frequency scaling invariance on the utilization_load_avg. 
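[Editor's note] The capping described above is easy to model outside the kernel. The sketch below mirrors the get_cpu_usage() added by this patch, with SCHED_LOAD_SCALE and an example capacity_orig hard-coded as assumptions; in the kernel they come from the scheduler and topology code.

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/* Usage of a CPU in capacity units, capped at its original capacity. */
static unsigned long cpu_usage(unsigned long util_avg, unsigned long capacity_orig)
{
	if (util_avg >= SCHED_LOAD_SCALE)
		return capacity_orig;
	return (util_avg * capacity_orig) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
	unsigned long capacity_orig = 430;      /* e.g. a little core, out of 1024 */

	printf("%lu\n", cpu_usage(512, capacity_orig));   /* half busy -> 215 */
	printf("%lu\n", cpu_usage(1200, capacity_orig));  /* transient overshoot -> capped at 430 */
	return 0;
}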
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425455327-13508-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b5dfa994a6f3c1..aa87f60be1eadf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4739,6 +4739,33 @@ static int select_idle_sibling(struct task_struct *p, int target) done: return target; } +/* + * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the usage with the capacity of the CPU that is available for CFS + * task (ie cpu_capacity). + * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * CPU. It represents the amount of utilization of a CPU in the range + * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full + * capacity of the CPU because it's about the running time on this CPU. + * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in avg_period and running_load_avg or just + * after migrating tasks until the average stabilizes with the new running + * time. So we need to check that the usage stays into the range + * [0..cpu_capacity_orig] and cap if necessary. + * Without capping the usage, a group could be seen as overloaded (CPU0 usage + * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + */ +static int get_cpu_usage(int cpu) +{ + unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long capacity = capacity_orig_of(cpu); + + if (usage >= SCHED_LOAD_SCALE) + return capacity; + + return (usage * capacity) >> SCHED_LOAD_SHIFT; +} /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -5868,6 +5895,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; + unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int group_capacity_factor; unsigned int idle_cpus; @@ -6229,6 +6257,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; + sgs->group_usage += get_cpu_usage(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) From 399b0a222bf63088d5c794581139e930c4f483b3 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:37 +0800 Subject: [PATCH 105/420] sched/fair: Rewrite runnable load and utilization average tracking The idea of runnable load average (let runnable time contribute to weight) was proposed by Paul Turner and Ben Segall, and it is still followed by this rewrite. This rewrite aims to solve the following issues: 1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is updated at the granularity of an entity at a time, which results in the cfs_rq's load average is stale or partially updated: at any time, only one entity is up to date, all other entities are effectively lagging behind. This is undesirable. 
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:

  t0: cfs_rq { e1_old, e2_old, ..., en_old }

and when we update:

  t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
  t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
  ...

We solve this by combining all runnable entities' load averages together
in the cfs_rq's avg and updating the cfs_rq's avg as a whole. This is based
on the fact that, if we regard the update as a function, then:

  w * update(e) = update(w * e)  and
  update(e1) + update(e2) = update(e1 + e2)

and therefore

  w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)

so, with this rewrite, we have an entirely updated cfs_rq at the time we
update it:

  t1: update cfs_rq { e1_new, e2_new, ..., en_new }
  t2: update cfs_rq { e1_new, e2_new, ..., en_new }
  ...

2. The cfs_rq's load average differs between the top rq->cfs_rq and the
other task_groups' per-CPU cfs_rqs in whether or not blocked_load_avg
contributes to the load.

The basic idea behind the runnable load average (the same holds for
utilization) is that the blocked state is taken into account, as opposed to
only accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages. This
rewrite does that.

In addition, we also combine the runnable/running and blocked averages of
all entities into the cfs_rq's average and update it together at once.
This is based on the fact that:

  update(runnable) + update(blocked) = update(runnable + blocked)

This significantly reduces the code, as we no longer need to separately
maintain and update the runnable/running load and the blocked load.

3. How task_group entities' shares are calculated is complex and imprecise.

We reduce the complexity in this rewrite to allow a very simple rule: the
task_group's load_avg is aggregated from its per-CPU cfs_rqs' load_avgs,
and the group entity's weight is simply proportional to its own cfs_rq's
load_avg / task_group's load_avg. To illustrate, if a task_group has
{ cfs_rq1, cfs_rq2, ..., cfs_rqn }, then:

  task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg

and

  cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share

To sum up, this rewrite is in principle equivalent to the current code, but
fixes the issues described above. It turns out to significantly reduce the
code complexity and hence increase clarity and efficiency. In addition, the
new averages are smoother and more continuous (no spurious spikes and
valleys) and are updated more consistently and quickly to reflect the load
dynamics. As a result, we have less load tracking overhead, better
performance, and especially better power efficiency due to more balanced
load.
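[Editor's note] A tiny standalone check of the linearity the argument above relies on. As an assumption for this sketch, a decay factor of 0.5 per period stands in for the kernel's fixed-point y with y^32 = 1/2; the point is only that the per-period update is a linear map, so per-entity updates and the whole-cfs_rq update agree.

#include <stdio.h>

/* Decay a load value over 'periods' full periods with y = 0.5 per period. */
static double decay(double v, int periods)
{
	while (periods--)
		v *= 0.5;
	return v;
}

int main(void)
{
	double e1 = 300.0, e2 = 700.0;   /* two entities' load averages */
	int periods = 3;

	double per_entity = decay(e1, periods) + decay(e2, periods);
	double whole_rq   = decay(e1 + e2, periods);

	/* Both print 125.000000: update(e1) + update(e2) == update(e1 + e2). */
	printf("per-entity sum: %f\n", per_entity);
	printf("whole cfs_rq:   %f\n", whole_rq);
	return 0;
}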
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 9d89c257dfb9c51a532d69397f6eed75e5168c35) Signed-off-by: Ricky Liang --- include/linux/sched.h | 41 ++- kernel/sched/core.c | 3 - kernel/sched/debug.c | 41 ++- kernel/sched/fair.c | 630 ++++++++++++++---------------------------- kernel/sched/sched.h | 28 +- 5 files changed, 249 insertions(+), 494 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d7c1a7ac4756fa..4d55af38bf8f70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1076,29 +1076,24 @@ struct load_weight { u32 inv_weight; }; +/* + * The load_avg/util_avg accumulates an infinite geometric series. + * 1) load_avg factors the amount of time that a sched_entity is + * runnable on a rq into its weight. For cfs_rq, it is the aggregated + * such weights of all runnable and blocked sched_entities. + * 2) util_avg factors frequency scaling into the amount of time + * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. + * For cfs_rq, it is the aggregated such times of all runnable and + * blocked sched_entities. + * The 64 bit load_sum can: + * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with + * the highest weight (=88761) always runnable, we should not overflow + * 2) for entity, support any load.weight always runnable + */ struct sched_avg { - u64 last_runnable_update; - s64 decay_count; - /* - * utilization_avg_contrib describes the amount of time that a - * sched_entity is running on a CPU. It is based on running_avg_sum - * and is scaled in the range [0..SCHED_LOAD_SCALE]. - * load_avg_contrib described the amount of time that a sched_entity - * is runnable on a rq. It is based on both runnable_avg_sum and the - * weight of the task. - */ - unsigned long load_avg_contrib, utilization_avg_contrib; - /* - * These sums represent an infinite geometric series and so are bound - * above by 1024/(1-y). Thus we only need a u32 to store them for all - * choices of y < 1-2^(-32)*1024. - * running_avg_sum reflects the time that the sched_entity is - * effectively running on the CPU. - * runnable_avg_sum represents the amount of time a sched_entity is on - * a runqueue which includes the running time that is monitored by - * running_avg_sum. 
- */ - u32 runnable_avg_sum, avg_period, running_avg_sum; + u64 last_update_time, load_sum; + u32 util_sum, period_contrib; + unsigned long load_avg, util_avg; }; #ifdef CONFIG_SCHEDSTATS @@ -1164,7 +1159,7 @@ struct sched_entity { #endif #ifdef CONFIG_SMP - /* Per-entity load-tracking */ + /* Per entity load average tracking */ struct sched_avg avg; #endif }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1529a3583d04f1..f0c646c8d188f6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1995,9 +1995,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; -#ifdef CONFIG_SMP - p->se.avg.decay_count = 0; -#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e730fd648825b0..647f74108b7927 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -88,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif P(se->load.weight); #ifdef CONFIG_SMP - P(se->avg.runnable_avg_sum); - P(se->avg.running_avg_sum); - P(se->avg.avg_period); - P(se->avg.load_avg_contrib); - P(se->avg.utilization_avg_contrib); - P(se->avg.decay_count); + P(se->avg.load_avg); + P(se->avg.util_avg); #endif #undef PN #undef P @@ -207,21 +203,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", - cfs_rq->runnable_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", - cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", - cfs_rq->utilization_load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "load_avg", + cfs_rq->avg.load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "util_avg", + cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", + atomic_long_read(&cfs_rq->removed_load_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", + atomic_long_read(&cfs_rq->removed_util_avg)); #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", - cfs_rq->tg_load_contrib); - SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", - cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", + cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); - SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", - atomic_read(&cfs_rq->tg->runnable_avg)); #endif #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -626,12 +620,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP - P(se.avg.runnable_avg_sum); - P(se.avg.running_avg_sum); - P(se.avg.avg_period); - P(se.avg.load_avg_contrib); - P(se.avg.utilization_avg_contrib); - P(se.avg.decay_count); + P(se.avg.load_sum); + P(se.avg.util_sum); + P(se.avg.load_avg); + P(se.avg.util_avg); + P(se.avg.last_update_time); #endif P(policy); P(prio); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa87f60be1eadf..3680c758f35eab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update); - static inline void 
list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; - /* We should have no load, but we need to update last_decay. */ - update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -664,19 +659,31 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); -static inline void __update_task_entity_contrib(struct sched_entity *se); -static inline void __update_task_entity_utilization(struct sched_entity *se); +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ /* Give new task start runnable values to heavy its load in infant time */ void init_task_runnable_average(struct task_struct *p) { - u32 slice; + struct sched_avg *sa = &p->se.avg; - slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; - p->se.avg.avg_period = slice; - __update_task_entity_contrib(&p->se); - __update_task_entity_utilization(&p->se); + sa->last_update_time = 0; + /* + * sched_avg's period_contrib should be strictly less then 1024, so + * we give it 1023 to make sure it is almost a period (1024us), and + * will definitely be update (after enqueue). + */ + sa->period_contrib = 1023; + sa->load_avg = scale_load_down(p->se.load.weight); + sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_sum = LOAD_AVG_MAX; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } #else void init_task_runnable_average(struct task_struct *p) @@ -1566,8 +1573,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.avg_period; + delta = p->se.avg.load_sum / p->se.load.weight; + *period = LOAD_AVG_MAX; } p->last_sum_exec_runtime = runtime; @@ -2116,13 +2123,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) long tg_weight; /* - * Use this CPU's actual weight instead of the last load_contribution - * to gain a more accurate current total weight. See - * __update_cfs_rq_tg_load_contrib(). + * Use this CPU's real-time load instead of the last load contribution + * as the updating of the contribution is delayed, and we will use the + * the real-time load to calc the share. See update_tg_load_avg(). */ tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_contrib; - tg_weight += cfs_rq->load.weight; + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += cfs_rq->avg.load_avg; return tg_weight; } @@ -2132,7 +2139,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) long tg_weight, load, shares; tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + load = cfs_rq->avg.load_avg; shares = (tg->shares * load); if (tg_weight) @@ -2194,14 +2201,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* - * We choose a half-life close to 1 scheduling period. 
- * Note: The tables below are dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ - /* Precomputed fixed inverse multiplies for multiplication by y^n */ static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, @@ -2250,9 +2249,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) local_n %= LOAD_AVG_PERIOD; } - val *= runnable_avg_yN_inv[local_n]; - /* We don't use SRR here since we always want to round down. */ - return val >> 32; + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; } /* @@ -2313,23 +2311,22 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu); * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, - struct sched_avg *sa, - int runnable, - int running) +static __always_inline int +__update_load_avg(u64 now, int cpu, struct sched_avg *sa, + unsigned long weight, int running) { u64 delta, periods; - u32 runnable_contrib; + u32 contrib; int delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); - delta = now - sa->last_runnable_update; + delta = now - sa->last_update_time; /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. */ if ((s64)delta < 0) { - sa->last_runnable_update = now; + sa->last_update_time = now; return 0; } @@ -2340,26 +2337,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, delta >>= 10; if (!delta) return 0; - sa->last_runnable_update = now; + sa->last_update_time = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->avg_period % 1024; + delta_w = sa->period_contrib; if (delta + delta_w >= 1024) { - /* period roll-over */ decayed = 1; + /* how much left for next period will start over, we don't know yet */ + sa->period_contrib = 0; + /* * Now that we know we're crossing a period boundary, figure * out how much from delta we need to complete the current * period and accrue it. 
*/ delta_w = 1024 - delta_w; - if (runnable) - sa->runnable_avg_sum += delta_w; + if (weight) + sa->load_sum += weight * delta_w; if (running) - sa->running_avg_sum += delta_w * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta_w; + sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; delta -= delta_w; @@ -2367,334 +2364,156 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, periods = delta / 1024; delta %= 1024; - sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, - periods + 1); - sa->running_avg_sum = decay_load(sa->running_avg_sum, - periods + 1); - sa->avg_period = decay_load(sa->avg_period, - periods + 1); + sa->load_sum = decay_load(sa->load_sum, periods + 1); + sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - runnable_contrib = __compute_runnable_contrib(periods); - if (runnable) - sa->runnable_avg_sum += runnable_contrib; + contrib = __compute_runnable_contrib(periods); + if (weight) + sa->load_sum += weight * contrib; if (running) - sa->running_avg_sum += runnable_contrib * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += runnable_contrib; + sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; } /* Remainder of delta accrued against u_0` */ - if (runnable) - sa->runnable_avg_sum += delta; + if (weight) + sa->load_sum += weight * delta; if (running) - sa->running_avg_sum += delta * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta; - - return decayed; -} - -/* Synchronize an entity's decay with its parenting cfs_rq.*/ -static inline u64 __synchronize_entity_decay(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 decays = atomic64_read(&cfs_rq->decay_counter); + sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; - decays -= se->avg.decay_count; - se->avg.decay_count = 0; - if (!decays) - return 0; + sa->period_contrib += delta; - se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.utilization_avg_contrib = - decay_load(se->avg.utilization_avg_contrib, decays); + if (decayed) { + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; + } - return decays; + return decayed; } #ifdef CONFIG_FAIR_GROUP_SCHED -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) -{ - struct task_group *tg = cfs_rq->tg; - long tg_contrib; - - tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; - tg_contrib -= cfs_rq->tg_load_contrib; - - if (!tg_contrib) - return; - - if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { - atomic_long_add(tg_contrib, &tg->load_avg); - cfs_rq->tg_load_contrib += tg_contrib; - } -} - /* - * Aggregate cfs_rq runnable averages into an equivalent task_group - * representation for computing load contributions. + * Updating tg's load_avg is necessary before update_cfs_share (which is done) + * and effective_load (which is not done because it is too costly). 
*/ -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { - struct task_group *tg = cfs_rq->tg; - long contrib; - - /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->avg_period + 1); - contrib -= cfs_rq->tg_runnable_contrib; + long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; - if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { - atomic_add(contrib, &tg->runnable_avg); - cfs_rq->tg_runnable_contrib += contrib; - } -} - -static inline void __update_group_entity_contrib(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = group_cfs_rq(se); - struct task_group *tg = cfs_rq->tg; - int runnable_avg; - - u64 contrib; - - contrib = cfs_rq->tg_load_contrib * tg->shares; - se->avg.load_avg_contrib = div_u64(contrib, - atomic_long_read(&tg->load_avg) + 1); - - /* - * For group entities we need to compute a correction term in the case - * that they are consuming <1 cpu so that we would contribute the same - * load as a task of equal weight. - * - * Explicitly co-ordinating this measurement would be expensive, but - * fortunately the sum of each cpus contribution forms a usable - * lower-bound on the true value. - * - * Consider the aggregate of 2 contributions. Either they are disjoint - * (and the sum represents true value) or they are disjoint and we are - * understating by the aggregate of their overlap. - * - * Extending this to N cpus, for a given overlap, the maximum amount we - * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of - * cpus that overlap for this interval and w_i is the interval width. - * - * On a small machine; the first term is well-bounded which bounds the - * total error since w_i is a subset of the period. Whereas on a - * larger machine, while this first term can be larger, if w_i is the - * of consequential size guaranteed to see n_i*w_i quickly converge to - * our upper bound of 1-cpu. 
- */ - runnable_avg = atomic_read(&tg->runnable_avg); - if (runnable_avg < NICE_0_LOAD) { - se->avg.load_avg_contrib *= runnable_avg; - se->avg.load_avg_contrib >>= NICE_0_SHIFT; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) {} -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) {} -static inline void __update_group_entity_contrib(struct sched_entity *se) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_task_entity_contrib(struct sched_entity *se) -{ - u32 contrib; - - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.avg_period + 1); - se->avg.load_avg_contrib = scale_load(contrib); -} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -/* Compute the current contribution to load_avg by se, return any delta */ -static long __update_entity_load_avg_contrib(struct sched_entity *se) +/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - long old_contrib = se->avg.load_avg_contrib; + int decayed; + struct sched_avg *sa = &cfs_rq->avg; - if (entity_is_task(se)) { - __update_task_entity_contrib(se); - } else { - __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); - __update_group_entity_contrib(se); + if (atomic_long_read(&cfs_rq->removed_load_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + sa->load_avg = max_t(long, sa->load_avg - r, 0); + sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); } - return se->avg.load_avg_contrib - old_contrib; -} - - -static inline void __update_task_entity_utilization(struct sched_entity *se) -{ - u32 contrib; - - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); - contrib /= (se->avg.avg_period + 1); - se->avg.utilization_avg_contrib = scale_load(contrib); -} + if (atomic_long_read(&cfs_rq->removed_util_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + sa->util_avg = max_t(long, sa->util_avg - r, 0); + sa->util_sum = max_t(s32, sa->util_sum - + ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); + } -static long __update_entity_utilization_avg_contrib(struct sched_entity *se) -{ - long old_contrib = se->avg.utilization_avg_contrib; + decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL); - if (entity_is_task(se)) - __update_task_entity_utilization(se); - else - se->avg.utilization_avg_contrib = - group_cfs_rq(se)->utilization_load_avg; - - return se->avg.utilization_avg_contrib - old_contrib; -} +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->load_last_update_time_copy = sa->last_update_time; +#endif -static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, - long load_contrib) -{ - if (likely(load_contrib < cfs_rq->blocked_load_avg)) - cfs_rq->blocked_load_avg -= load_contrib; - else - cfs_rq->blocked_load_avg = 0; + return decayed; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - -/* Update a sched_entity's runnable average */ -static inline void 
update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta, utilization_delta; int cpu = cpu_of(rq_of(cfs_rq)); - u64 now; + u64 now = cfs_rq_clock_task(cfs_rq); /* - * For a group entity we need to use their owned cfs_rq_clock_task() in - * case they are the parent of a throttled hierarchy. + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration */ - if (entity_is_task(se)) - now = cfs_rq_clock_task(cfs_rq); - else - now = cfs_rq_clock_task(group_cfs_rq(se)); + __update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); - if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, - cfs_rq->curr == se)) - return; - - contrib_delta = __update_entity_load_avg_contrib(se); - utilization_delta = __update_entity_utilization_avg_contrib(se); - - if (!update_cfs_rq) - return; - - if (se->on_rq) { - cfs_rq->runnable_load_avg += contrib_delta; - cfs_rq->utilization_load_avg += utilization_delta; - } else { - subtract_blocked_load_contrib(cfs_rq, -contrib_delta); - } + if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + update_tg_load_avg(cfs_rq, 0); } -/* - * Decay the load contributed by all blocked children and account this so that - * their contribution may appropriately discounted when they wake up. - */ -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +/* Add the load generated by se into cfs_rq's load average */ +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 now = cfs_rq_clock_task(cfs_rq) >> 20; - u64 decays; - - decays = now - cfs_rq->last_decay; - if (!decays && !force_update) - return; + struct sched_avg *sa = &se->avg; + u64 now = cfs_rq_clock_task(cfs_rq); + int migrated = 0, decayed; - if (atomic_long_read(&cfs_rq->removed_load)) { - unsigned long removed_load; - removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); - subtract_blocked_load_contrib(cfs_rq, removed_load); + if (sa->last_update_time == 0) { + sa->last_update_time = now; + migrated = 1; } - - if (decays) { - cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, - decays); - atomic64_add(decays, &cfs_rq->decay_counter); - cfs_rq->last_decay = now; + else { + __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); } - __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); -} + decayed = update_cfs_rq_load_avg(now, cfs_rq); -/* Add the load generated by se into cfs_rq's child load-average */ -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) -{ - /* - * We track migrations using entity decay_count <= 0, on a wake-up - * migration we use a negative decay count to track the remote decays - * accumulated while sleeping. - * - * Newly forked tasks are enqueued with se->avg.decay_count == 0, they - * are seen by enqueue_entity_load_avg() as a migration with an already - * constructed load_avg_contrib. - */ - if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); - if (se->avg.decay_count) { - /* - * In a wake-up migration we have to approximate the - * time sleeping. 
This is because we can't synchronize - * clock_task between the two cpus, and it is not - * guaranteed to be read-safe. Instead, we can - * approximate this using our carried decays, which are - * explicitly atomically readable. - */ - se->avg.last_runnable_update -= (-se->avg.decay_count) - << 20; - update_entity_load_avg(se, 0); - /* Indicate that we're now synchronized and on-rq */ - se->avg.decay_count = 0; - } - wakeup = 0; - } else { - __synchronize_entity_decay(se); + if (migrated) { + cfs_rq->avg.load_avg += sa->load_avg; + cfs_rq->avg.load_sum += sa->load_sum; + cfs_rq->avg.util_avg += sa->util_avg; + cfs_rq->avg.util_sum += sa->util_sum; } - /* migrated tasks did not contribute to our blocked load */ - if (wakeup) { - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - update_entity_load_avg(se, 0); - } - - cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !wakeup); + if (decayed || migrated) + update_tg_load_avg(cfs_rq, 0); } /* - * Remove se's load from this cfs_rq child load-average, if the entity is - * transitioning to a blocked state we track its projected decay using - * blocked_load_avg. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) +void remove_entity_load_avg(struct sched_entity *se) { - update_entity_load_avg(se, 1); - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !sleep); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + +#ifndef CONFIG_64BIT + u64 last_update_time_copy; - cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; - if (sleep) { - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - } /* migrations, e.g. 
sleep=0 leave decay_count == 0 */ + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); +#else + last_update_time = cfs_rq->avg.last_update_time; +#endif + + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } /* @@ -2719,16 +2538,10 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) {} -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) {} -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) {} -static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update) {} +static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline int idle_balance(struct rq *rq) { @@ -2861,7 +2674,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -2936,7 +2749,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); + update_load_avg(se, 1); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -3026,7 +2839,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_entity_load_avg(se, 1); + update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -3126,7 +2939,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_entity_load_avg(prev, 1); + update_load_avg(prev, 0); } cfs_rq->curr = NULL; } @@ -3142,8 +2955,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. 
*/ - update_entity_load_avg(curr, 1); - update_cfs_rq_blocked_load(cfs_rq, 1); + update_load_avg(curr, 1); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4032,8 +3844,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } if (!se) @@ -4092,8 +3904,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } if (!se) @@ -4232,7 +4044,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, static void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = this_rq->cfs.avg.load_avg; unsigned long pending_updates; /* @@ -4278,7 +4090,7 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = this_rq->cfs.avg.load_avg; /* * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ @@ -4289,7 +4101,7 @@ void update_cpu_load_active(struct rq *this_rq) /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->cfs.runnable_load_avg; + return cpu_rq(cpu)->cfs.avg.load_avg; } /* @@ -4334,7 +4146,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = rq->cfs.runnable_load_avg; + unsigned long load_avg = rq->cfs.avg.load_avg; if (nr_running) return load_avg / nr_running; @@ -4453,7 +4265,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * w = rw_i + @wl */ - w = se->my_q->load.weight + wl; + w = se->my_q->avg.load_avg + wl; /* * wl = S * s'_i; see (2) @@ -4474,7 +4286,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * wl = dw_i = S * (s'_i - s_i); see (3) */ - wl -= se->load.weight; + wl -= se->avg.load_avg; /* * Recursively apply this logic to all parent groups to compute @@ -4544,14 +4356,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ if (sync) { tg = task_group(current); - weight = current->se.load.weight; + weight = current->se.avg.load_avg; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); - weight = p->se.load.weight; + weight = p->se.avg.load_avg; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -4744,12 +4556,12 @@ static int select_idle_sibling(struct task_struct *p, int target) * tasks. The unit of the return value must be the one of capacity so we can * compare the usage with the capacity of the CPU that is available for CFS * task (ie cpu_capacity). - * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * cfs.avg.util_avg is the sum of running time of runnable tasks on a * CPU. It represents the amount of utilization of a CPU in the range * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full * capacity of the CPU because it's about the running time on this CPU. 
- * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE - * because of unfortunate rounding in avg_period and running_load_avg or just + * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in util_avg or just * after migrating tasks until the average stabilizes with the new running * time. So we need to check that the usage stays into the range * [0..cpu_capacity_orig] and cap if necessary. @@ -4758,7 +4570,7 @@ static int select_idle_sibling(struct task_struct *p, int target) */ static int get_cpu_usage(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); if (usage >= SCHED_LOAD_SCALE) @@ -4870,26 +4682,22 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f * previous cpu. However, the caller only guarantees p->pi_lock is held; no * other assumptions, including the state of rq->lock, should be made. */ -static void -migrate_task_rq_fair(struct task_struct *p, int next_cpu) +static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - /* - * Load tracking: accumulate removed load so that it can be processed - * when we next update owning cfs_rq under rq->lock. Tasks contribute - * to blocked load iff they have a positive decay-count. It can never - * be negative here since on-rq tasks have decay-count == 0. + * We are supposed to update the task to "current" time, then its up to date + * and ready to go to new CPU/cfs_rq. But we have difficulty in getting + * what current time is, so simply throw away the out-of-date time. This + * will result in the wakee task is less decayed, but giving the wakee more + * load sounds not bad. */ - if (se->avg.decay_count) { - se->avg.decay_count = -__synchronize_entity_decay(se); - atomic_long_add(se->avg.load_avg_contrib, - &cfs_rq->removed_load); - } + remove_entity_load_avg(&p->se); + + /* Tell new CPU we are migrated */ + p->se.avg.last_update_time = 0; /* We have migrated, no longer consider this task hot */ - se->exec_start = 0; + p->se.exec_start = 0; } #endif /* CONFIG_SMP */ @@ -5767,36 +5575,6 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * update tg->load_weight by folding this cpu's load_avg - */ -static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) -{ - struct sched_entity *se = tg->se[cpu]; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - return; - - update_cfs_rq_blocked_load(cfs_rq, 1); - - if (se) { - update_entity_load_avg(se, 1); - /* - * We pivot on our runnable average having decayed to zero for - * list removal. This generally implies that all our children - * have also been removed (modulo rounding error or bandwidth - * control); however, such cases are rare and we can fix these - * at enqueue. - * - * TODO: fix up out-of-order children on enqueue. - */ - if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) - list_del_leaf_cfs_rq(cfs_rq); - } -} - static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -5805,19 +5583,19 @@ static void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. 
*/ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* - * Note: We may want to consider periodically releasing - * rq->lock about these updates so that creating many task - * groups does not result in continually extending hold time. - */ - __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); - } + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + update_tg_load_avg(cfs_rq, 0); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -5845,14 +5623,13 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = cfs_rq->runnable_load_avg; + cfs_rq->h_load = cfs_rq->avg.load_avg; cfs_rq->last_h_load_update = now; } while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; - load = div64_ul(load * se->avg.load_avg_contrib, - cfs_rq->runnable_load_avg + 1); + load = div64_ul(load * se->avg.load_avg, cfs_rq->avg.load_avg + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; @@ -5864,8 +5641,8 @@ static unsigned long task_h_load(struct task_struct *p) struct cfs_rq *cfs_rq = task_cfs_rq(p); update_cfs_rq_h_load(cfs_rq); - return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, - cfs_rq->runnable_load_avg + 1); + return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, + cfs_rq->avg.load_avg + 1); } #else static inline void update_blocked_averages(int cpu) @@ -5874,7 +5651,7 @@ static inline void update_blocked_averages(int cpu) static unsigned long task_h_load(struct task_struct *p) { - return p->se.avg.load_avg_contrib; + return p->se.avg.load_avg; } #endif @@ -7871,15 +7648,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) } #ifdef CONFIG_SMP - /* - * Remove our load from contribution when we leave sched_fair - * and ensure we don't carry in an old decay_count if we - * switch back. - */ - if (se->avg.decay_count) { - __synchronize_entity_decay(se); - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - } + /* Catch up with the cfs_rq and remove our load when we leave */ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); + + cfs_rq->avg.load_avg = + max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = + max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = + max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = + max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); #endif } @@ -7936,8 +7716,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP - atomic64_set(&cfs_rq->decay_counter, 1); - atomic_long_set(&cfs_rq->removed_load, 0); + atomic_long_set(&cfs_rq->removed_load_avg, 0); + atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif } @@ -7982,14 +7762,14 @@ static void task_move_group_fair(struct task_struct *p, int queued) if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; + #ifdef CONFIG_SMP - /* - * migrate_task_rq_fair() will have removed our previous - * contribution, but we must synchronize for ongoing future - * decay. 
- */ - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + /* Virtually synchronize task with its new cfs_rq */ + p->se.avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += p->se.avg.load_avg; + cfs_rq->avg.load_sum += p->se.avg.load_sum; + cfs_rq->avg.util_avg += p->se.avg.util_avg; + cfs_rq->avg.util_sum += p->se.avg.util_sum; #endif } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a28f6118fcaff..c3077e70f8d3e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -224,7 +224,6 @@ struct task_group { #ifdef CONFIG_SMP atomic_long_t load_avg; - atomic_t runnable_avg; #endif #endif @@ -345,27 +344,18 @@ struct cfs_rq { #ifdef CONFIG_SMP /* - * CFS Load tracking - * Under CFS, load is tracked on a per-entity basis and aggregated up. - * This allows for the description of both thread and group usage (in - * the FAIR_GROUP_SCHED case). - * runnable_load_avg is the sum of the load_avg_contrib of the - * sched_entities on the rq. - * blocked_load_avg is similar to runnable_load_avg except that its - * the blocked sched_entities on the rq. - * utilization_load_avg is the sum of the average running time of the - * sched_entities on the rq. + * CFS load tracking */ - unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; - atomic64_t decay_counter; - u64 last_decay; - atomic_long_t removed_load; - + struct sched_avg avg; #ifdef CONFIG_FAIR_GROUP_SCHED - /* Required to track per-cpu representation of a task_group */ - u32 tg_runnable_contrib; - unsigned long tg_load_contrib; + unsigned long tg_load_avg_contrib; +#endif + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED /* * h_load = weight * f(tg) * From 0ce955cdcaab5173cab79c962cfbb3a23d45799d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 15 Jul 2015 08:04:38 +0800 Subject: [PATCH 106/420] sched/fair: Implement update_blocked_averages() for CONFIG_FAIR_GROUP_SCHED=n The load and the utilization of idle CPUs must be updated periodically in order to decay the blocked part. If CONFIG_FAIR_GROUP_SCHED is not set, the load and util of idle cpus are not decayed and stay at the values set before becoming idle. Signed-off-by: Vincent Guittot Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/1436918682-4971-4-git-send-email-yuyang.du@intel.com [ Fixed up the SOB chain. 
] Signed-off-by: Ingo Molnar (cherry picked from commit 6c1d47c0827304949e0eb9479f4d587f226fac8b) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3680c758f35eab..9ec6705e538a7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5647,6 +5647,14 @@ static unsigned long task_h_load(struct task_struct *p) #else static inline void update_blocked_averages(int cpu) { + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static unsigned long task_h_load(struct task_struct *p) From 63633b31abd7adf94b2a161323a6a21368ae5a1f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:39 +0800 Subject: [PATCH 107/420] sched/fair: Init cfs_rq's sched_entity load average The runnable load and utilization averages of cfs_rq's sched_entity were not initiated. Like done to a task, give new cfs_rq' sched_entity start values to heavy its load in infant time. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-5-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 540247fb5ddf6d2364f90387fa1f8f428d15e683) Signed-off-by: Ricky Liang --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 11 ++++++----- kernel/sched/sched.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0c646c8d188f6..f80c9450ff5c94 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2300,7 +2300,7 @@ void wake_up_new_task(struct task_struct *p) #endif /* Initialize new task's runnable average */ - init_task_runnable_average(p); + init_entity_runnable_average(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9ec6705e538a7b..184bab98439668 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -667,10 +667,10 @@ static unsigned long task_h_load(struct task_struct *p); #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ -/* Give new task start runnable values to heavy its load in infant time */ -void init_task_runnable_average(struct task_struct *p) +/* Give new sched_entity start runnable values to heavy its load in infant time */ +void init_entity_runnable_average(struct sched_entity *se) { - struct sched_avg *sa = &p->se.avg; + struct sched_avg *sa = &se->avg; sa->last_update_time = 0; /* @@ -679,14 +679,14 @@ void init_task_runnable_average(struct task_struct *p) * will definitely be update (after enqueue). 
*/ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(p->se.load.weight); + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); sa->util_sum = LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } #else -void init_task_runnable_average(struct task_struct *p) +void init_entity_runnable_average(struct sched_entity *se) { } #endif @@ -7829,6 +7829,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + init_entity_runnable_average(se); } return 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c3077e70f8d3e8..b191ac97e3b521 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1231,7 +1231,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void init_task_runnable_average(struct task_struct *p); +extern void init_entity_runnable_average(struct sched_entity *se); static inline void add_nr_running(struct rq *rq, unsigned count) { From 66a5fcc813c063a9e8c5c6ba2efa39588a208571 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:40 +0800 Subject: [PATCH 108/420] sched/fair: Remove task and group entity load when they are dead When task exits or group is destroyed, the entity's load should be removed from its parent cfs_rq's load. Otherwise, it will take time for the parent cfs_rq to decay the dead entity's load to 0, which is not desired. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-6-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 1269557889b477e3e43ab99a21035ddf8f7cea4d) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 184bab98439668..8fe62a8fef67ac 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4699,6 +4699,11 @@ static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) /* We have migrated, no longer consider this task hot */ p->se.exec_start = 0; } + +static void task_dead_fair(struct task_struct *p) +{ + remove_entity_load_avg(&p->se); +} #endif /* CONFIG_SMP */ static unsigned long @@ -7791,8 +7796,11 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) + if (tg->se) { + if (tg->se[i]) + remove_entity_load_avg(tg->se[i]); kfree(tg->se[i]); + } } kfree(tg->cfs_rq); @@ -7979,6 +7987,7 @@ const struct sched_class fair_sched_class = { .rq_offline = rq_offline_fair, .task_waking = task_waking_fair, + .task_dead = task_dead_fair, #endif .set_curr_task = set_curr_task_fair, From 17e4c6a12d8d3f3dc9a0b601b4b3bfb9a4317a46 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:41 +0800 Subject: [PATCH 109/420] sched/fair: Provide runnable_load_avg back to cfs_rq The cfs_rq's load_avg is composed of runnable_load_avg and blocked_load_avg. 
Before this series, sometimes the runnable_load_avg is used, and sometimes the load_avg is used. Completely replacing all uses of runnable_load_avg with load_avg may be too big a leap, i.e., the blocked_load_avg is concerned to result in overrated load. Therefore, we get runnable_load_avg back. The new cfs_rq's runnable_load_avg is improved to be updated with all of the runnable sched_eneities at the same time, so the one sched_entity updated and the others stale problem is solved. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-7-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 139622343ef31941effc6de6a5a9320371a00e62) Signed-off-by: Ricky Liang --- kernel/sched/debug.c | 2 ++ kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++++-------- kernel/sched/sched.h | 2 ++ 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 647f74108b7927..edb1a41c686a9d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -205,6 +205,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8fe62a8fef67ac..55947bac824fef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2313,7 +2313,7 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu); */ static __always_inline int __update_load_avg(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running) + unsigned long weight, int running, struct cfs_rq *cfs_rq) { u64 delta, periods; u32 contrib; @@ -2353,8 +2353,11 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, * period and accrue it. 
*/ delta_w = 1024 - delta_w; - if (weight) + if (weight) { sa->load_sum += weight * delta_w; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * delta_w; + } if (running) sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; @@ -2365,19 +2368,29 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, delta %= 1024; sa->load_sum = decay_load(sa->load_sum, periods + 1); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods + 1); + } sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ contrib = __compute_runnable_contrib(periods); - if (weight) + if (weight) { sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } if (running) sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; } /* Remainder of delta accrued against u_0` */ - if (weight) + if (weight) { sa->load_sum += weight * delta; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * delta; + } if (running) sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; @@ -2385,6 +2398,10 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, if (decayed) { sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + } sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; } @@ -2432,7 +2449,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL); + scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -2454,7 +2471,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) * track group sched_entity load average for task_h_load calc in migration */ __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); @@ -2474,11 +2491,15 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) } else { __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); } decayed = update_cfs_rq_load_avg(now, cfs_rq); + cfs_rq->runnable_load_avg += sa->load_avg; + cfs_rq->runnable_load_sum += sa->load_sum; + if (migrated) { cfs_rq->avg.load_avg += sa->load_avg; cfs_rq->avg.load_sum += sa->load_sum; @@ -2490,6 +2511,18 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) update_tg_load_avg(cfs_rq, 0); } +/* Remove the runnable load generated by se from cfs_rq's runnable load average */ +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_avg(se, 1); + + cfs_rq->runnable_load_avg = + max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); + cfs_rq->runnable_load_sum = + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); +} + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). 
@@ -2511,7 +2544,7 @@ void remove_entity_load_avg(struct sched_entity *se) last_update_time = cfs_rq->avg.last_update_time; #endif - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -2541,6 +2574,8 @@ static int idle_balance(struct rq *this_rq); static inline void update_load_avg(struct sched_entity *se, int update_tg) {} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline int idle_balance(struct rq *rq) @@ -2749,7 +2784,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_load_avg(se, 1); + dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -7663,7 +7698,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP /* Catch up with the cfs_rq and remove our load when we leave */ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se); + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b191ac97e3b521..73e27ab1a00c9b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -347,6 +347,8 @@ struct cfs_rq { * CFS load tracking */ struct sched_avg avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; #endif From 7b0c9a97e9b4935e039361032b8be560b9257982 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 15 Jul 2015 08:04:42 +0800 Subject: [PATCH 110/420] sched/fair: Clean up load average references For cfs_rq, we have load.weight, runnable_load_avg, and load_avg. Clean up how they are used: - First, as group sched_entity already largely uses load_avg, we now expand to use load_avg in all cases. - Second, for CPU-wide load balancing, we choose to use runnable_load_avg in all cases, which is the same as before this series. 
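In code, the cleanup amounts to funneling every reader through a small set of accessors. The sketch below condenses the helpers that the diff underneath adds; it is an illustrative restatement of those hunks, not an additional change, and the comments simply restate the split described above:

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
	/* runnable portion only: what CPU-wide load balancing consumes */
	return cfs_rq->runnable_load_avg;
}

static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
	/* full average (runnable + blocked): what group-entity math consumes */
	return cfs_rq->avg.load_avg;
}

/* Used instead of source_load() when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
}

With these in place, update_idle_cpu_load(), update_cpu_load_active(), cpu_avg_load_per_task(), effective_load() and task_h_load() stop poking cfs_rq fields directly, which is exactly what the hunks below rewrite.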
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan@linux.intel.com Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: fengguang.wu@intel.com Cc: len.brown@intel.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: rafael.j.wysocki@intel.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1436918682-4971-8-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 7ea241afbf4924c58d41078599f7a32ba49fb985) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55947bac824fef..df67f9f0c53747 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -685,6 +685,9 @@ void init_entity_runnable_average(struct sched_entity *se) sa->util_sum = LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { @@ -2129,7 +2132,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) */ tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->avg.load_avg; + tg_weight += cfs_rq_load_avg(cfs_rq); return tg_weight; } @@ -2139,7 +2142,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) long tg_weight, load, shares; tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->avg.load_avg; + load = cfs_rq_load_avg(cfs_rq); shares = (tg->shares * load); if (tg_weight) @@ -2567,6 +2570,16 @@ void idle_exit_fair(struct rq *this_rq) { } +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->runnable_load_avg; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; +} + static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ @@ -4058,6 +4071,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); +} + #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the @@ -4079,7 +4098,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, static void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = this_rq->cfs.avg.load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); unsigned long pending_updates; /* @@ -4125,7 +4144,7 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { - unsigned long load = this_rq->cfs.avg.load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 
*/ @@ -4133,12 +4152,6 @@ void update_cpu_load_active(struct rq *this_rq) __update_cpu_load(this_rq, load, 1); } -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->cfs.avg.load_avg; -} - /* * Return a low guess at the load of a migration-source cpu weighted * according to the scheduling class and "nice" value. @@ -4181,7 +4194,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = rq->cfs.avg.load_avg; + unsigned long load_avg = weighted_cpuload(cpu); if (nr_running) return load_avg / nr_running; @@ -4300,7 +4313,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * w = rw_i + @wl */ - w = se->my_q->avg.load_avg + wl; + w = cfs_rq_load_avg(se->my_q) + wl; /* * wl = S * s'_i; see (2) @@ -5663,13 +5676,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = cfs_rq->avg.load_avg; + cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); cfs_rq->last_h_load_update = now; } while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; - load = div64_ul(load * se->avg.load_avg, cfs_rq->avg.load_avg + 1); + load = div64_ul(load * se->avg.load_avg, + cfs_rq_load_avg(cfs_rq) + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; @@ -5682,7 +5696,7 @@ static unsigned long task_h_load(struct task_struct *p) update_cfs_rq_h_load(cfs_rq); return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, - cfs_rq->avg.load_avg + 1); + cfs_rq_load_avg(cfs_rq) + 1); } #else static inline void update_blocked_averages(int cpu) From ec9311bba1326f4ad70f3138efae8c8e898fb958 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 10 Aug 2015 18:02:55 +0900 Subject: [PATCH 111/420] sched: Ensure a task has a non-normalized vruntime when returning back to CFS Current code ensures that a task has a normalized vruntime when switching away from the fair class, but it does not ensure the task has a non-normalized vruntime when switching back to the fair class. This is an example breaking this consistency: 1. a task is in fair class and !queued 2. changes its class to RT class (still !queued) 3. changes its class to fair class again (still !queued) Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439197375-27927-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry picked from commit 7855a35ac07a350e2cd26f09568a6d8e372be358) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df67f9f0c53747..a2ab03bef9f205 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7730,16 +7730,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity *se = &p->se; + +#ifdef CONFIG_FAIR_GROUP_SCHED /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? 
se->parent->depth + 1 : 0; #endif - if (!task_on_rq_queued(p)) + + if (!task_on_rq_queued(p)) { + + /* + * Ensure the task has a non-normalized vruntime when it is switched + * back to the fair class with !queued, so that enqueue_entity() at + * wake-up time will do the right thing. + * + * If it's queued, then the enqueue_entity(.flags=0) makes the task + * has non-normalized vruntime, if it's !queued, then it still has + * normalized vruntime. + */ + if (p->state != TASK_RUNNING) + se->vruntime += cfs_rq_of(se)->min_vruntime; return; + } /* * We were most likely switched from sched_rt, so From 1cc76fc0396c1c55b1998d7b50f376762e06c978 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2015 17:43:34 +0200 Subject: [PATCH 112/420] sched: Fix a race between __kthread_bind() and sched_setaffinity() Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY without locks, a caller might observe an old value and race with the set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo it: __kthread_bind() do_set_cpus_allowed() sched_setaffinity() if (p->flags & PF_NO_SETAFFINITIY) set_cpus_allowed_ptr() p->flags |= PF_NO_SETAFFINITY Fix the bug by putting everything under the regular scheduler locks. This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org Signed-off-by: Ingo Molnar (cherry picked from commit 25834c73f93af7f0712c98ca4593691592e6b360) Signed-off-by: Punit Agrawal --- include/linux/kthread.h | 1 + include/linux/sched.h | 7 ------- kernel/kthread.c | 20 +++++++++++++++++--- kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++++---- kernel/workqueue.c | 6 ++---- 5 files changed, 52 insertions(+), 18 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 13d55206ccf67a..869b21dcf503a8 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), }) void kthread_bind(struct task_struct *k, unsigned int cpu); +void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); int kthread_stop(struct task_struct *k); bool kthread_should_stop(void); bool kthread_should_park(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d55af38bf8f70..9e08d0a8332def 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2097,13 +2097,6 @@ static inline void calc_load_enter_idle(void) { } static inline void calc_load_exit_idle(void) { } #endif /* CONFIG_NO_HZ_COMMON */ -#ifndef CONFIG_CPUMASK_OFFSTACK -static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) -{ - return set_cpus_allowed_ptr(p, &new_mask); -} -#endif - /* * Do not use outside of architecture code which knows its limitations. 
* diff --git a/kernel/kthread.c b/kernel/kthread.c index 10e489c448fe4e..7c40a189becc5e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) { - /* Must have done schedule() in kthread() before we set_task_cpu */ + unsigned long flags; + if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); + raw_spin_lock_irqsave(&p->pi_lock, flags); + do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +{ + __kthread_bind_mask(p, cpumask_of(cpu), state); +} + +void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) +{ + __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f80c9450ff5c94..cfad7d47304d49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1168,6 +1168,8 @@ static int migration_cpu_stop(void *data) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { + lockdep_assert_held(&p->pi_lock); + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); @@ -1184,7 +1186,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) { unsigned long flags; struct rq *rq; @@ -1193,6 +1196,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) rq = task_rq_lock(p, &flags); + /* + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. 
+ */ + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; @@ -1222,6 +1234,11 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) return ret; } + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return __set_cpus_allowed_ptr(p, new_mask, false); +} EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); void set_task_cpu(struct task_struct *p, unsigned int new_cpu) @@ -1600,6 +1617,15 @@ static void update_avg(u64 *avg, u64 sample) s64 diff = sample - *avg; *avg += diff >> 3; } + +#else + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + return set_cpus_allowed_ptr(p, new_mask); +} + #endif /* CONFIG_SMP */ static void @@ -4266,7 +4292,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = set_cpus_allowed_ptr(p, new_mask); + retval = __set_cpus_allowed_ptr(p, new_mask, true); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -4805,7 +4831,8 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); __sched_fork(0, idle); idle->state = TASK_RUNNING; @@ -4831,7 +4858,8 @@ void init_idle(struct task_struct *idle, int cpu) #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 09b685daee3d8c..0305b69b6fa597 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1698,9 +1698,7 @@ static struct worker *create_worker(struct worker_pool *pool) goto fail; set_user_nice(worker->task, pool->attrs->nice); - - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(worker->task, pool->attrs->cpumask); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); @@ -4068,7 +4066,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } wq->rescuer = rescuer; - rescuer->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); } From cb8576f3f89d9aa9fe08d2fbb9e360bd39193ef1 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:56 +0900 Subject: [PATCH 113/420] sched/fair: Factor out the {at,de}taching of the per entity load {to,from} the runqueue Currently we open-code the addition/subtraction of the per entity load to/from the runqueue, factor this out into helper functions. Signed-off-by: Byungchul Park [ Rewrote the changelog. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry picked from commit a05e8c51ff097ff73ec2947631d9102283545f7c) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 75 ++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a2ab03bef9f205..b56ca55e6a6424 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2466,33 +2466,52 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - int cpu = cpu_of(rq_of(cfs_rq)); u64 now = cfs_rq_clock_task(cfs_rq); + int cpu = cpu_of(rq_of(cfs_rq)); /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); } +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + se->avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se->avg.load_sum; + cfs_rq->avg.util_avg += se->avg.util_avg; + cfs_rq->avg.util_sum += se->avg.util_sum; +} + +static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); + + cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); +} + /* Add the load generated by se into cfs_rq's load average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; u64 now = cfs_rq_clock_task(cfs_rq); - int migrated = 0, decayed; + int migrated, decayed; - if (sa->last_update_time == 0) { - sa->last_update_time = now; - migrated = 1; - } - else { + migrated = !sa->last_update_time; + if (!migrated) { __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); @@ -2503,12 +2522,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) { - cfs_rq->avg.load_avg += sa->load_avg; - cfs_rq->avg.load_sum += sa->load_sum; - cfs_rq->avg.util_avg += sa->util_avg; - cfs_rq->avg.util_sum += sa->util_sum; - } + if (migrated) + attach_entity_load_avg(cfs_rq, se); if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); @@ -2523,7 +2538,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = - max_t(s64, 
cfs_rq->runnable_load_sum - se->avg.load_sum, 0); + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } /* @@ -2591,6 +2606,11 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} +static inline void +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} + static inline int idle_balance(struct rq *rq) { return 0; @@ -7709,25 +7729,10 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } -#ifdef CONFIG_SMP /* Catch up with the cfs_rq and remove our load when we leave */ - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - - cfs_rq->avg.load_avg = - max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = - max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = - max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = - max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); -#endif + detach_entity_load_avg(cfs_rq, se); } -/* - * We switched to the sched_fair class. - */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -7840,14 +7845,8 @@ static void task_move_group_fair(struct task_struct *p, int queued) cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; -#ifdef CONFIG_SMP /* Virtually synchronize task with its new cfs_rq */ - p->se.avg.last_update_time = cfs_rq->avg.last_update_time; - cfs_rq->avg.load_avg += p->se.avg.load_avg; - cfs_rq->avg.load_sum += p->se.avg.load_sum; - cfs_rq->avg.util_avg += p->se.avg.util_avg; - cfs_rq->avg.util_sum += p->se.avg.util_sum; -#endif + attach_entity_load_avg(cfs_rq, se); } } From a087e1195c79c6352ed3eccec3040ca584a49bb1 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:57 +0900 Subject: [PATCH 114/420] sched/fair: Have task_move_group_fair() unconditionally add the entity load to the runqueue Currently we conditionally add the entity load to the rq when moving the task between cgroups. This doesn't make sense as we always 'migrate' the task between cgroups, so we should always migrate the load too. [ The history here is that we used to only migrate the blocked load which was only meaningfull when !queued. ] Signed-off-by: Byungchul Park [ Rewrote the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-3-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry picked from commit 50a2a3b246149d041065a67ccb3e98145f780a2f) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b56ca55e6a6424..c60602bf88333b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7841,13 +7841,12 @@ static void task_move_group_fair(struct task_struct *p, int queued) se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); se->depth = se->parent ? 
se->parent->depth + 1 : 0; - if (!queued) { - cfs_rq = cfs_rq_of(se); + cfs_rq = cfs_rq_of(se); + if (!queued) se->vruntime += cfs_rq->min_vruntime; - /* Virtually synchronize task with its new cfs_rq */ - attach_entity_load_avg(cfs_rq, se); - } + /* Virtually synchronize task with its new cfs_rq */ + attach_entity_load_avg(cfs_rq, se); } void free_fair_sched_group(struct task_group *tg) From 31fd4fd1ced3f8c1486beb8e5feb66a1b6bd8fff Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:58 +0900 Subject: [PATCH 115/420] sched/fair: Have task_move_group_fair() also detach entity load from the old runqueue Since we attach the entity load to the new runqueue, we should also detatch the entity load from the old runqueue, otherwise load can accumulate. Signed-off-by: Byungchul Park [ Rewrote the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-4-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry picked from commit 1746babbb15594ba2d8d8196589bbbc2b5ff51c9) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c60602bf88333b..14609fc9635209 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7837,8 +7837,12 @@ static void task_move_group_fair(struct task_struct *p, int queued) if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) queued = 1; + cfs_rq = cfs_rq_of(se); if (!queued) - se->vruntime -= cfs_rq_of(se)->min_vruntime; + se->vruntime -= cfs_rq->min_vruntime; + + /* Synchronize task with its prev cfs_rq */ + detach_entity_load_avg(cfs_rq, se); set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; cfs_rq = cfs_rq_of(se); From 9f04824502ef58d2e0de546e636083806c233075 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:59 +0900 Subject: [PATCH 116/420] sched/fair: Fix switched_to_fair()'s per entity load tracking Where switched_from_fair() will remove the entity's load from the runqueue, switched_to_fair() does not currently add it back. This means that when a task leaves the fair class for a short duration; say because of PI; we loose its load contribution. This can ripple forward and disturb the load tracking because other operations (enqueue, dequeue) assume its factored in. Only once the runqueue empties will the load tracking recover. When we add it back in, age the per entity average to match up with the runqueue age. This has the obvious problem that if the task leaves the fair class for a significant time, the load will age to 0. Employ the normal migration rule for inter-runqueue moves in task_move_group_fair(). Again, there is the obvious problem of the task migrating while not in the fair class. The alternative solution would be to to omit the chunk in attach_entity_load_avg(), which would effectively reset the timestamp and use whatever avg there was. Signed-off-by: Byungchul Park [ Rewrote the changelog and comments. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-5-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry picked from commit 6efdb105d392da3ad5cb4ef951aed373cd049813) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14609fc9635209..e42fe2c035a080 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2483,6 +2483,20 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* + * If we got migrated (either between CPUs or between cgroups) we'll + * have aged the average right before clearing @last_update_time. + */ + if (se->avg.last_update_time) { + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, 0, 0, NULL); + + /* + * XXX: we could have just aged the entire load away if we've been + * absent from the fair class for too long. + */ + } + se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; @@ -7745,6 +7759,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) se->depth = se->parent ? se->parent->depth + 1 : 0; #endif + /* Synchronize task with its cfs_rq */ + attach_entity_load_avg(cfs_rq_of(&p->se), &p->se); + if (!task_on_rq_queued(p)) { /* @@ -7844,6 +7861,12 @@ static void task_move_group_fair(struct task_struct *p, int queued) /* Synchronize task with its prev cfs_rq */ detach_entity_load_avg(cfs_rq, se); set_task_rq(p, task_cpu(p)); + +#ifdef CONFIG_SMP + /* Tell se's cfs_rq has been changed -- migrated */ + p->se.avg.last_update_time = 0; +#endif + se->depth = se->parent ? se->parent->depth + 1 : 0; cfs_rq = cfs_rq_of(se); if (!queued) From 412998e27f77bc7bb9bfd64d8db52b0f648d4601 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Sep 2015 16:10:59 +0200 Subject: [PATCH 117/420] sched/fair: Make the entity load aging on attaching tunable In case there are problems with the aging on attach, provide a debug knob to turn it off. Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit a9280514bf1e54775b8d7cd93d87c05c2b5273e6) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 4 ++++ kernel/sched/features.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e42fe2c035a080..cd25e96e727b76 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2483,6 +2483,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + if (!sched_feat(ATTACH_AGE_LOAD)) + goto skip_aging; + /* * If we got migrated (either between CPUs or between cgroups) we'll * have aged the average right before clearing @last_update_time. 
@@ -2497,6 +2500,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ } +skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe65f..44e2b9f1c9cb87 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -60,6 +60,8 @@ SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) + /* * Apply the automatic NUMA scheduling policy. Enabled automatically * at runtime if running on a NUMA machine. Can be controlled via From f9f666e834efbe48078d62622ec95d72ccc843b4 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:22:00 +0900 Subject: [PATCH 118/420] sched/fair: Unify switched_{from,to}_fair() and task_move_group_fair() By observing that switched_from_fair() detaches from a runqueue, and switched_to_fair() attaches to a runqueue, we can see that task_move_group_fair() is one followed by the other with flipping the runqueue in between. Therefore extract all the common bits and implement all three functions in terms of them. This should fix a few corner cases wrt. vruntime normalization; where, when we take a task off of a runqueue we convert to an approximation of lag by subtracting min_vruntime, and when placing a task on the a runqueue to the reverse. Suggested-by: Peter Zijlstra Signed-off-by: Byungchul Park [peterz: Changelog] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-6-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry picked from commit daa59407b558e6e621e9081a308d5db3ef991fb6) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 129 ++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 77 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cd25e96e727b76..4676acd4c1c456 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7724,21 +7724,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static void switched_from_fair(struct rq *rq, struct task_struct *p) +static inline bool vruntime_normalized(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when it's - * switched back to the fair class the enqueue_entity(.flags=0) will - * do the right thing. + * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, + * the dequeue_entity(.flags=0) will already have normalized the + * vruntime. + */ + if (p->on_rq) + return true; + + /* + * When !on_rq, vruntime of the task has usually NOT been normalized. + * But there are some cases where it has already been normalized: * - * If it's queued, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !queued, then only when - * the task is sleeping will it still have non-normalized vruntime. + * - A forked child which is waiting for being woken up by + * wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() and + * waiting for actually being woken up by sched_ttwu_pending(). 
*/ - if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { + if (!se->sum_exec_runtime || p->state == TASK_WAKING) + return true; + + return false; +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7751,9 +7769,10 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) detach_entity_load_avg(cfs_rq, se); } -static void switched_to_fair(struct rq *rq, struct task_struct *p) +static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -7764,33 +7783,32 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - attach_entity_load_avg(cfs_rq_of(&p->se), &p->se); + attach_entity_load_avg(cfs_rq, se); + + if (!vruntime_normalized(p)) + se->vruntime += cfs_rq->min_vruntime; +} - if (!task_on_rq_queued(p)) { +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + detach_task_cfs_rq(p); +} + +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ + attach_task_cfs_rq(p); + if (task_on_rq_queued(p)) { /* - * Ensure the task has a non-normalized vruntime when it is switched - * back to the fair class with !queued, so that enqueue_entity() at - * wake-up time will do the right thing. - * - * If it's queued, then the enqueue_entity(.flags=0) makes the task - * has non-normalized vruntime, if it's !queued, then it still has - * normalized vruntime. + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. */ - if (p->state != TASK_RUNNING) - se->vruntime += cfs_rq_of(se)->min_vruntime; - return; + if (rq->curr == p) + resched_curr(rq); + else + check_preempt_curr(rq, p, 0); } - - /* - * We were most likely switched from sched_rt, so - * kick off the schedule if running, otherwise just see - * if we can still preempt the current task. - */ - if (rq->curr == p) - resched_curr(rq); - else - check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. @@ -7827,57 +7845,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int queued) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq; - - /* - * If the task was not on the rq at the time of this cgroup movement - * it must have been asleep, sleeping tasks keep their ->vruntime - * absolute on their old rq until wakeup (needed for the fair sleeper - * bonus in place_entity()). - * - * If it was on the rq, we've just 'preempted' it, which does convert - * ->vruntime to a relative base. - * - * Make sure both cases convert their relative position when migrating - * to another cgroup's rq. This does somewhat interfere with the - * fair sleeper stuff for the first placement, but who cares. - */ - /* - * When !queued, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - Moving a forked child which is waiting for being woken up by - * wake_up_new_task(). - * - Moving a task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). 
- * - * To prevent boost or penalty in the new cfs_rq caused by delta - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. - */ - if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - queued = 1; - - cfs_rq = cfs_rq_of(se); - if (!queued) - se->vruntime -= cfs_rq->min_vruntime; - - /* Synchronize task with its prev cfs_rq */ - detach_entity_load_avg(cfs_rq, se); + detach_task_cfs_rq(p); set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif - - se->depth = se->parent ? se->parent->depth + 1 : 0; - cfs_rq = cfs_rq_of(se); - if (!queued) - se->vruntime += cfs_rq->min_vruntime; - - /* Virtually synchronize task with its new cfs_rq */ - attach_entity_load_avg(cfs_rq, se); + attach_task_cfs_rq(p); } void free_fair_sched_group(struct task_group *tg) From 4022833427881d3a7c88b227485f68f29cf3aa81 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 Aug 2015 17:13:55 +0200 Subject: [PATCH 119/420] sched/core: Remove unused argument from sched_class::task_move_group The previous patches made the second argument go unused, remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar (cherry picked from commit bc54da2176cd38cedea767eff637229a191a2383) Signed-off-by: Javi Merino --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cfad7d47304d49..baced1b9698a9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7494,7 +7494,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, queued); + tsk->sched_class->task_move_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4676acd4c1c456..dad728758b022c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7843,7 +7843,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int queued) +static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); set_task_rq(p, task_cpu(p)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 73e27ab1a00c9b..2bc7b0de197fcf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1152,7 +1152,7 @@ struct sched_class { void (*update_curr) (struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p, int on_rq); + void (*task_move_group) (struct task_struct *p); #endif }; From d8affd6b53d71ebe48015930f409b7c70d381f62 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 31 Aug 2015 15:12:56 +0300 Subject: [PATCH 120/420] sched/core: Delete PF_EXITING checks from cpu_cgroup_exit() callback cgroup_exit() is not called from copy_process() after commit: e8604cb43690 ("cgroup: fix spurious lockdep warning in cgroup_exit()") from do_exit(). So this check is useless and the comment is obsolete. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55E444C8.3020402@odin.com Signed-off-by: Ingo Molnar (cherry picked from commit 446685e9bfa11174332fbb0b3218b37015fbf4ff) Signed-off-by: Javi Merino --- kernel/sched/core.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index baced1b9698a9f..59768a42c62c0a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7952,14 +7952,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css, struct cgroup_subsys_state *old_css, struct task_struct *task) { - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - sched_move_task(task); } From 56c2a84f4a4c102b960b4081747b376a26bdab69 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 3 Aug 2015 11:55:50 +0200 Subject: [PATCH 121/420] sched/fair: Fix nohz.next_balance update Since commit: d4573c3e1c99 ("sched: Improve load balancing in the presence of idle CPUs") the ILB CPU starts with the idle load balancing of other idle CPUs and finishes with itself in order to speed up the spread of tasks in all idle CPUs. The this_rq->next_balance is still used in nohz_idle_balance() as an intermediate step to gather the shortest next balance before updating nohz.next_balance. But the former has not been updated yet and is likely to be set with the current jiffies. As a result, the nohz.next_balance will be set with current jiffies instead of the real next balance date. This generates spurious kicks of nohz ilde balance. nohz_idle_balance() must set the nohz.next_balance without taking into account this_rq->next_balance which is not updated yet. Then, this_rq will update nohz.next_update with its next_balance once updated and if necessary. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jason Low Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1438595750-20455-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit c5afb6a87f2386bcf09fa051e6ca390d43e2222e) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dad728758b022c..37a06f3ebf8a02 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7460,8 +7460,22 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) + if (likely(update_next_balance)) { rq->next_balance = next_balance; + +#ifdef CONFIG_NO_HZ_COMMON + /* + * If this CPU has been elected to perform the nohz idle + * balance. Other idle CPUs have already rebalanced with + * nohz_idle_balance() and nohz.next_balance has been + * updated accordingly. This CPU is now running the idle load + * balance for itself and we need to update the + * nohz.next_balance accordingly. 
+ */ + if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) + nohz.next_balance = rq->next_balance; +#endif + } } #ifdef CONFIG_NO_HZ_COMMON @@ -7474,6 +7488,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) @@ -7505,10 +7522,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rebalance_domains(rq, CPU_IDLE); } - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; + if (time_after(next_balance, rq->next_balance)) { + next_balance = rq->next_balance; + update_next_balance = 1; + } } - nohz.next_balance = this_rq->next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } From 93c675551d75d4d463a96f82d834a1c166387a8d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:08 +0100 Subject: [PATCH 122/420] sched: Make scale_rt invariant with frequency The average running time of RT tasks is used to estimate the remaining compute capacity for CFS tasks. This remaining capacity is the original capacity scaled down by a factor (aka scale_rt_capacity). This estimation of available capacity must also be invariant with frequency scaling. A frequency scaling factor is applied on the running time of the RT tasks for computing scale_rt_capacity. In sched_rt_avg_update(), we now scale the RT execution time like below: rq->rt_avg += rt_delta * arch_scale_freq_capacity() >> SCHED_CAPACITY_SHIFT Then, scale_rt_capacity can be summarized by: scale_rt_capacity = SCHED_CAPACITY_SCALE * available / total with available = total - rq->rt_avg This has been been optimized in current code by: scale_rt_capacity = available / (total >> SCHED_CAPACITY_SHIFT) But we can also developed the equation like below: scale_rt_capacity = SCHED_CAPACITY_SCALE - ((rq->rt_avg << SCHED_CAPACITY_SHIFT) / total) and we can optimize the equation by removing SCHED_CAPACITY_SHIFT shift in the computation of rq->rt_avg and scale_rt_capacity(). so rq->rt_avg += rt_delta * arch_scale_freq_capacity() and scale_rt_capacity = SCHED_CAPACITY_SCALE - (rq->rt_avg / total) arch_scale_frequency_capacity() will be called in the hot path of the scheduler which implies to have a short and efficient function. As an example, arch_scale_frequency_capacity() should return a cached value that is updated periodically outside of the hot path. 
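For illustration only (not part of the patch), the resulting computation can be sketched as below; plain integers stand in for the rq fields and 1024 stands in for SCHED_CAPACITY_SCALE:

	/*
	 * rt_avg already carries the frequency factor, so the remaining
	 * capacity available to CFS is a plain subtraction in capacity units.
	 */
	static unsigned long sketch_scale_rt_capacity(u64 rt_avg, u64 total)
	{
		u64 used = rt_avg / total;	/* in [0..1024] */

		if (used < 1024)
			return 1024 - used;
		return 1;			/* never report zero capacity */
	}

For example, RT tasks running 25% of the time at full frequency give used = 256 and a remaining CFS capacity of 768.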
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +++++------------ kernel/sched/sched.h | 4 +++- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 37a06f3ebf8a02..7b5a702e41aec9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5875,7 +5875,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available, age_stamp, avg; + u64 total, used, age_stamp, avg; s64 delta; /* @@ -5891,19 +5891,12 @@ static unsigned long scale_rt_capacity(int cpu) total = sched_avg_period() + delta; - if (unlikely(total < avg)) { - /* Ensures that capacity won't end up being negative */ - available = 0; - } else { - available = total - avg; - } + used = div_u64(avg, total); - if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) - total = SCHED_CAPACITY_SCALE; + if (likely(used < SCHED_CAPACITY_SCALE)) + return SCHED_CAPACITY_SCALE - used; - total >>= SCHED_CAPACITY_SHIFT; - - return div_u64(available, total); + return 1; } static void update_cpu_capacity(struct sched_domain *sd, int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2bc7b0de197fcf..6b5b096c7ff153 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1320,9 +1320,11 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); +extern unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta; + rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); sched_avg_update(rq); } #else From 3bc6f4402ea6645478d2197c3d502960c953083f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:14 +0100 Subject: [PATCH 123/420] sched: Move CFS tasks to CPUs with higher capacity When a CPU is used to handle a lot of IRQs or some RT tasks, the remaining capacity for CFS tasks can be significantly reduced. Once we detect such situation by comparing cpu_capacity_orig and cpu_capacity, we trig an idle load balance to check if it's worth moving its tasks on an idle CPU. It's worth trying to move the task before the CPU is fully utilized to minimize the preemption by irq or RT tasks. Once the idle load_balance has selected the busiest CPU, it will look for an active load balance for only two cases: - There is only 1 task on the busiest CPU. - We haven't been able to move a task of the busiest rq. A CPU with a reduced capacity is included in the 1st case, and it's worth to actively migrate its task if the idle CPU has got more available capacity for CFS tasks. This test has been added in need_active_balance. As a sidenote, this will not generate more spurious ilb because we already trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that has a task, we will trig the ilb once for migrating the task. 
The nohz_kick_needed function has been cleaned up a bit while adding the new test env.src_cpu and env.src_rq must be set unconditionnally because they are used in need_active_balance which is called even if busiest->nr_running equals 1 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-12-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 69 ++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7b5a702e41aec9..79f199577effb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6699,6 +6699,19 @@ static int need_active_balance(struct lb_env *env) return 1; } + /* + * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. + * It's worth migrating the task if the src_cpu's capacity is reduced + * because of other sched_class or IRQs if more capacity stays + * available on dst_cpu. + */ + if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->cfs.h_nr_running == 1)) { + if ((check_cpu_capacity(env->src_rq, sd)) && + (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -6798,6 +6811,9 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_add(sd, lb_imbalance[idle], env.imbalance); + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + ld_moved = 0; if (busiest->nr_running > 1) { /* @@ -6807,8 +6823,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -7534,22 +7548,25 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) /* * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu is the system. + * of an idle cpu in the system. * - This rq has more than one task. - * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's capacity. + * - This rq has at least one CFS task and the capacity of the CPU is + * significantly reduced because of RT tasks or IRQs. + * - At parent of LLC scheduler domain level, this cpu's scheduler group has + * multiple busy cpu. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; + bool kick = false; if (unlikely(rq->idle_balance)) - return 0; + return false; /* * We may be recently in ticked or tickless idle mode. At the first @@ -7563,38 +7580,46 @@ static inline int nohz_kick_needed(struct rq *rq) * balancing. 
*/ if (likely(!atomic_read(&nohz.nr_cpus))) - return 0; + return false; if (time_before(now, nohz.next_balance)) - return 0; + return false; if (rq->nr_running >= 2) - goto need_kick; + return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); - if (nr_busy > 1) - goto need_kick_unlock; + if (nr_busy > 1) { + kick = true; + goto unlock; + } + } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(rq->sd); + if (sd) { + if ((rq->cfs.h_nr_running >= 1) && + check_cpu_capacity(rq, sd)) { + kick = true; + goto unlock; + } + } + sd = rcu_dereference(per_cpu(sd_asym, cpu)); if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; - - rcu_read_unlock(); - return 0; + sched_domain_span(sd)) < cpu)) { + kick = true; + goto unlock; + } -need_kick_unlock: +unlock: rcu_read_unlock(); -need_kick: - return 1; + return kick; } #else static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } From e17250a5a99acbfc991cb06a3c23ff06967bc87a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2015 14:19:05 +0100 Subject: [PATCH 124/420] sched: Optimize freq invariant accounting Currently the freq invariant accounting (in __update_entity_runnable_avg() and sched_rt_avg_update()) get the scale factor from a weak function call, this means that even for archs that default on their implementation the compiler cannot see into this function and optimize the extra scaling math away. This is sad, esp. since its a 64-bit multiplication which can be quite costly on some platforms. So replace the weak function with #ifdef and __always_inline goo. This is not quite as nice from an arch support PoV but should at least result in compile time errors if done wrong. Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: Vincent Guittot Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/20150323131905.GF23123@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ------------ kernel/sched/sched.h | 9 ++++++++- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79f199577effb3..d11c9997036dbf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2284,8 +2284,6 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu); - /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. 
To do this we sub-divide our runnable @@ -5849,16 +5847,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_capacity(sd, cpu); -} - static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6b5b096c7ff153..158205d543c113 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1320,7 +1320,14 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); -extern unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { From e32ddebe2859fd49ec50a4b965e7048fa334730a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:06 +0100 Subject: [PATCH 125/420] sched: Remove frequency scaling from cpu_capacity Now that arch_scale_cpu_capacity has been introduced to scale the original capacity, the arch_scale_freq_capacity is no longer used (it was previously used by ARM arch). Remove arch_scale_freq_capacity from the computation of cpu_capacity. The frequency invariance will be handled in the load tracking and not in the CPU capacity. arch_scale_freq_capacity will be revisited for scaling load with the current frequency of the CPUs in a later patch. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d11c9997036dbf..57360ef90438fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5901,13 +5901,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_freq_capacity(sd, cpu); - else - capacity *= default_scale_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; From 6ffa0e8e13527c785c128729b7776c239216b1a8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:09 +0100 Subject: [PATCH 126/420] sched: Add struct rq::cpu_capacity_orig This new field 'cpu_capacity_orig' reflects the original capacity of a CPU before being altered by rt tasks and/or IRQ The cpu_capacity_orig will be used: - to detect when the capacity of a CPU has been noticeably reduced so we can trig load balance to look for a CPU with better capacity. As an example, we can detect when a CPU handles a significant amount of irq (with CONFIG_IRQ_TIME_ACCOUNTING) but this CPU is seen as an idle CPU by scheduler whereas CPUs, which are really idle, are available. 
- evaluate the available capacity for CFS tasks Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 8 +++++++- kernel/sched/sched.h | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 59768a42c62c0a..110e53afefbd9b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7152,7 +7152,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 57360ef90438fc..734aa4335c10d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4226,6 +4226,11 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } +static unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -5899,6 +5904,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = capacity; sdg->sgc->capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); @@ -5953,7 +5959,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += capacity_of(cpu); + capacity_orig += capacity_orig_of(cpu); capacity += capacity_of(cpu); continue; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 158205d543c113..bcd78916f320a7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -581,6 +581,7 @@ struct rq { struct sched_domain *sd; unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; unsigned char idle_balance; /* For active balancing */ From 2c3c2f303959ec68ba8315ea8de5dc83d5c11e63 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:09 +0100 Subject: [PATCH 127/420] sched/fair: Make load tracking frequency scale-invariant Apply frequency scaling correction factor to per-entity load tracking to make it frequency invariant. Currently, load appears bigger when the CPU is running slower which affects load-balancing decisions. Each segment of the sched_avg.load_sum geometric series is now scaled by the current frequency so that the sched_avg.load_avg of each sched entity will be invariant from frequency scaling. Moreover, cfs_rq.runnable_load_sum is scaled by the current frequency as well. 
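For illustration only (not part of the patch), the accumulation step can be sketched as below; it mirrors the scale() helper added by the hunk that follows, with 10 standing in for SCHED_CAPACITY_SHIFT and a made-up function name:

	#define scale(v, s)	((v) * (s) >> 10)	/* >> SCHED_CAPACITY_SHIFT */

	/*
	 * Each slice of runnable time is weighted by the current frequency
	 * factor (scale_freq, in [0..1024]) before it is accumulated, so
	 * 1ms of runnable time at half the maximum frequency contributes
	 * as much load as 0.5ms at full frequency.
	 */
	static void sketch_accumulate_load(u64 *load_sum, unsigned long weight,
					   u64 delta, unsigned long scale_freq)
	{
		*load_sum += weight * scale(delta, scale_freq);
	}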
Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit e0f5f3afd2cffa96291cd852056d83ff4e2e99c7) Signed-off-by: Ricky Liang --- include/linux/sched.h | 6 +++--- kernel/sched/fair.c | 27 +++++++++++++++++---------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9e08d0a8332def..85ed3c0e448a99 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1078,9 +1078,9 @@ struct load_weight { /* * The load_avg/util_avg accumulates an infinite geometric series. - * 1) load_avg factors the amount of time that a sched_entity is - * runnable on a rq into its weight. For cfs_rq, it is the aggregated - * such weights of all runnable and blocked sched_entities. + * 1) load_avg factors frequency scaling into the amount of time that a + * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the + * aggregated such weights of all runnable and blocked sched_entities. * 2) util_avg factors frequency scaling into the amount of time * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. * For cfs_rq, it is the aggregated such times of all runnable and diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 734aa4335c10d8..7a3d952c92518d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2284,6 +2284,8 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#define scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2316,9 +2318,9 @@ static __always_inline int __update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, periods; + u64 delta, scaled_delta, periods; u32 contrib; - int delta_w, decayed = 0; + int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_update_time; @@ -2354,13 +2356,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, * period and accrue it. 
*/ delta_w = 1024 - delta_w; + scaled_delta_w = scale(delta_w, scale_freq); if (weight) { - sa->load_sum += weight * delta_w; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * delta_w; + sa->load_sum += weight * scaled_delta_w; + if (cfs_rq) { + cfs_rq->runnable_load_sum += + weight * scaled_delta_w; + } } if (running) - sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += scaled_delta_w; delta -= delta_w; @@ -2377,23 +2382,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* Efficiently calculate \sum (1..n_period) 1024*y^i */ contrib = __compute_runnable_contrib(periods); + contrib = scale(contrib, scale_freq); if (weight) { sa->load_sum += weight * contrib; if (cfs_rq) cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += contrib; } /* Remainder of delta accrued against u_0` */ + scaled_delta = scale(delta, scale_freq); if (weight) { - sa->load_sum += weight * delta; + sa->load_sum += weight * scaled_delta; if (cfs_rq) - cfs_rq->runnable_load_sum += weight * delta; + cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += scaled_delta; sa->period_contrib += delta; From aa1a6cfe3cb6acd24b6622edaffe098f824acb7b Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Aug 2015 17:23:10 +0100 Subject: [PATCH 128/420] sched/fair: Convert arch_scale_cpu_capacity() from weak function to #define Bring arch_scale_cpu_capacity() in line with the recent change of its arch_scale_freq_capacity() sibling in commit dfbca41f3479 ("sched: Optimize freq invariant accounting") from weak function to #define to allow inlining of the function. While at it, remove the ARCH_CAPACITY sched_feature as well. With the change to #define there isn't a straightforward way to allow runtime switch between an arch implementation and the default implementation of arch_scale_cpu_capacity() using sched_feature. The default was to use the arch-specific implementation, but only the arm architecture provides one and that is essentially equivalent to the default implementation. 
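For illustration only (not part of the patch), the #define override pattern works roughly as below; the arch-side names are hypothetical and the generic fallback is simplified relative to the SMT-aware version added to sched.h:

	/* In an architecture header (hypothetical): */
	unsigned long my_arch_scale_cpu_capacity(struct sched_domain *sd, int cpu);
	#define arch_scale_cpu_capacity my_arch_scale_cpu_capacity

	/* Generic default, used only when no such #define is visible: */
	#ifndef arch_scale_cpu_capacity
	static __always_inline
	unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
	{
		return SCHED_CAPACITY_SCALE;
	}
	#endif

Because the call resolves at compile time, the default can be inlined and constant-folded, which a __weak function prevented.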
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 8cd5601c50603caa195ce86cc465cb04079ed488) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 22 +--------------------- kernel/sched/features.h | 5 ----- kernel/sched/sched.h | 11 +++++++++++ 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a3d952c92518d..05441e56e43b7a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5859,19 +5859,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_cpu_capacity(sd, cpu); -} - static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -5901,16 +5888,9 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = SCHED_CAPACITY_SCALE; + unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_cpu_capacity(sd, cpu); - else - capacity *= default_scale_cpu_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - cpu_rq(cpu)->cpu_capacity_orig = capacity; sdg->sgc->capacity_orig = capacity; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 44e2b9f1c9cb87..4bd24c5ff8d2be 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) */ SCHED_FEAT(WAKEUP_PREEMPTION, true) -/* - * Use arch dependent cpu capacity functions - */ -SCHED_FEAT(ARCH_CAPACITY, true) - SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bcd78916f320a7..d738b53f39b102 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1330,6 +1330,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) } #endif +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; + + return SCHED_CAPACITY_SCALE; +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); From f487b38cd7399e7418f0a913d72d76df55f8c9cf Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 15 Aug 2015 00:04:41 +0100 Subject: [PATCH 129/420] sched/fair: Make utilization tracking CPU scale-invariant Besides the existing frequency scale-invariance correction factor, apply CPU scale-invariance correction factor to utilization tracking to compensate for any differences in compute capacity. 
This could be due to micro-architectural differences (i.e. instructions per seconds) between cpus in HMP systems (e.g. big.LITTLE), and/or differences in the current maximum frequency supported by individual cpus in SMP systems. In the existing implementation utilization isn't comparable between cpus as it is relative to the capacity of each individual CPU. Each segment of the sched_avg.util_sum geometric series is now scaled by the CPU performance factor too so the sched_avg.util_avg of each sched entity will be invariant from the particular CPU of the HMP/SMP system on which the sched entity is scheduled. With this patch, the utilization of a CPU stays relative to the max CPU performance of the fastest CPU in the system. In contrast to utilization (sched_avg.util_sum), load (sched_avg.load_sum) should not be scaled by compute capacity. The utilization metric is based on running time which only makes sense when cpus are _not_ fully utilized (utilization cannot go beyond 100% even if more tasks are added), where load is runnable time which isn't limited by the capacity of the CPU and therefore is a better metric for overloaded scenarios. If we run two nice-0 busy loops on two cpus with different compute capacity their load should be similar since their compute demands are the same. We have to assume that the compute demand of any task running on a fully utilized CPU (no spare cycles = 100% utilization) is high and the same no matter of the compute capacity of its current CPU, hence we shouldn't scale load by CPU capacity. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55CE7409.1000700@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit e3279a2e6d697e00e74f905851ee7cf532f72b2d) Signed-off-by: Ricky Liang --- include/linux/sched.h | 2 +- kernel/sched/fair.c | 7 ++++--- kernel/sched/sched.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 85ed3c0e448a99..cc77b78b162d52 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1081,7 +1081,7 @@ struct load_weight { * 1) load_avg factors frequency scaling into the amount of time that a * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the * aggregated such weights of all runnable and blocked sched_entities. - * 2) util_avg factors frequency scaling into the amount of time + * 2) util_avg factors frequency and cpu scaling into the amount of time * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. * For cfs_rq, it is the aggregated such times of all runnable and * blocked sched_entities. 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 05441e56e43b7a..95b94ecdf96cc7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2322,6 +2322,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, u32 contrib; int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); delta = now - sa->last_update_time; /* @@ -2365,7 +2366,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } } if (running) - sa->util_sum += scaled_delta_w; + sa->util_sum += scale(scaled_delta_w, scale_cpu); delta -= delta_w; @@ -2389,7 +2390,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += contrib; + sa->util_sum += scale(contrib, scale_cpu); } /* Remainder of delta accrued against u_0` */ @@ -2400,7 +2401,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += scaled_delta; + sa->util_sum += scale(scaled_delta, scale_cpu); sa->period_contrib += delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d738b53f39b102..996154df45cc9c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1334,7 +1334,7 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) return sd->smt_gain / sd->span_weight; return SCHED_CAPACITY_SCALE; From f1b764660ec1731c7859bb5ccd55029a735119fd Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 5 Nov 2014 07:44:50 +0800 Subject: [PATCH 130/420] sched/fair: Fix stale overloaded status in the busiest group finding logic Commit caeb178c60f4 ("sched/fair: Make update_sd_pick_busiest() return 'true' on a busier sd") changes groups to be ranked in the order of overloaded > imbalance > other, and busiest group is picked according to this order. sgs->group_capacity_factor is used to check if the group is overloaded. When the child domain prefers tasks to go to siblings first, the sgs->group_capacity_factor will be set lower than one in order to move all the excess tasks away. However, group overloaded status is not updated when sgs->group_capacity_factor is set to lower than one, which leads to us missing to find the busiest group. This patch fixes it by updating group overloaded status when sg capacity factor is set to one, in order to find the busiest group accurately. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Vincent Guittot Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1415144690-25196-1-git-send-email-wanpeng.li@linux.intel.com [ Fixed the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95b94ecdf96cc7..672ae50820c8f2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6260,8 +6260,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * with a large weight task outweighs the tasks on the system). 
*/ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) + sds->local_stat.group_has_free_capacity) { sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); + sgs->group_type = group_classify(sg, sgs); + } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; From 384e8900b5ea2728f5d6aba11564133b26e626cb Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:11 +0100 Subject: [PATCH 131/420] sched: Replace capacity_factor by usage The scheduler tries to compute how many tasks a group of CPUs can handle by assuming that a task's load is SCHED_LOAD_SCALE and a CPU's capacity is SCHED_CAPACITY_SCALE. 'struct sg_lb_stats:group_capacity_factor' divides the capacity of the group by SCHED_LOAD_SCALE to estimate how many task can run in the group. Then, it compares this value with the sum of nr_running to decide if the group is overloaded or not. But the 'group_capacity_factor' concept is hardly working for SMT systems, it sometimes works for big cores but fails to do the right thing for little cores. Below are two examples to illustrate the problem that this patch solves: 1- If the original capacity of a CPU is less than SCHED_CAPACITY_SCALE (640 as an example), a group of 3 CPUS will have a max capacity_factor of 2 (div_round_closest(3x640/1024) = 2) which means that it will be seen as overloaded even if we have only one task per CPU. 2 - If the original capacity of a CPU is greater than SCHED_CAPACITY_SCALE (1512 as an example), a group of 4 CPUs will have a capacity_factor of 4 (at max and thanks to the fix [0] for SMT system that prevent the apparition of ghost CPUs) but if one CPU is fully used by rt tasks (and its capacity is reduced to nearly nothing), the capacity factor of the group will still be 4 (div_round_closest(3*1512/1024) = 5 which is cap to 4 with [0]). So, this patch tries to solve this issue by removing capacity_factor and replacing it with the 2 following metrics: - The available CPU's capacity for CFS tasks which is already used by load_balance(). - The usage of the CPU by the CFS tasks. For the latter, utilization_avg_contrib has been re-introduced to compute the usage of a CPU by CFS tasks. 'group_capacity_factor' and 'group_has_free_capacity' has been removed and replaced by 'group_no_capacity'. We compare the number of task with the number of CPUs and we evaluate the level of utilization of the CPUs to define if a group is overloaded or if a group has capacity to handle more tasks. For SD_PREFER_SIBLING, a group is tagged overloaded if it has more than 1 task so it will be selected in priority (among the overloaded groups). Since [1], SD_PREFER_SIBLING is no more concerned by the computation of 'load_above_capacity' because local is not overloaded. [1] 9a5d9ba6a363 ("sched/fair: Allow calculate_imbalance() to move idle cpus") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1425052454-25797-9-git-send-email-vincent.guittot@linaro.org [ Tidied up the changelog. 
] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 139 +++++++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 67 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 672ae50820c8f2..3fef22601fb709 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5785,11 +5785,10 @@ struct sg_lb_stats { unsigned long group_capacity; unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_has_free_capacity; + int group_no_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5975,28 +5974,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } /* - * Try and fix up capacity for tiny siblings, this is needed when - * things like SD_ASYM_PACKING need f_b_g to select another sibling - * which on its own isn't powerful enough. - * - * See update_sd_pick_busiest() and check_asym_packing(). + * Check whether the capacity of the rq has been noticeably reduced by side + * activity. The imbalance_pct is used for the threshold. + * Return true is the capacity is reduced */ static inline int -fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { - /* - * Only siblings can have significantly less than SCHED_CAPACITY_SCALE - */ - if (!(sd->flags & SD_SHARE_CPUCAPACITY)) - return 0; - - /* - * If ~90% of the cpu_capacity is still there, we're good. - */ - if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) - return 1; - - return 0; + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); } /* @@ -6034,37 +6020,56 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity factor. - * - * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit unit capacity with that. + * group_has_capacity returns true if the group has spare capacity that could + * be used by some tasks. + * We consider that a group has spare capacity if the * number of task is + * smaller than the number of CPUs or if the usage is lower than the available + * capacity for CFS tasks. + * For the latter, we use a threshold to stabilize the state, to take into + * account the variance of the tasks' load and to return true if the available + * capacity in meaningful for the load balancer. + * As an example, an available capacity of 1% can appear but it doesn't make + * any benefit for the load balance. 
*/ -static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) +static inline bool +group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - unsigned int capacity_factor, smt, cpus; - unsigned int capacity, capacity_orig; + if (sgs->sum_nr_running < sgs->group_weight) + return true; - capacity = group->sgc->capacity; - capacity_orig = group->sgc->capacity_orig; - cpus = group->group_weight; + if ((sgs->group_capacity * 100) > + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); - capacity_factor = cpus / smt; /* cores */ + return false; +} + +/* + * group_is_overloaded returns true if the group has more tasks than it can + * handle. + * group_is_overloaded is not equals to !group_has_capacity because a group + * with the exact right number of tasks, has no more spare capacity but is not + * overloaded so both group_has_capacity and group_is_overloaded return + * false. + */ +static inline bool +group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running <= sgs->group_weight) + return false; - capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); + if ((sgs->group_capacity * 100) < + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - return capacity_factor; + return false; } -static enum group_type -group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +static enum group_type group_classify(struct lb_env *env, + struct sched_group *group, + struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_no_capacity) return group_overloaded; if (sg_imbalanced(group)) @@ -6125,11 +6130,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - sgs->group_capacity_factor = sg_capacity_factor(env, group); - sgs->group_type = group_classify(group, sgs); - if (sgs->group_capacity_factor > sgs->sum_nr_running) - sgs->group_has_free_capacity = 1; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(env, group, sgs); } /** @@ -6251,18 +6254,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity factor to one so that we'll try + * first, lower the sg capacity so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity_factor. The - * extra check prevents the case where you always pull from the - * heaviest group when it is already under-utilized (possible - * with a large weight task outweighs the tasks on the system). + * these excess tasks. The extra check prevents the case where + * you always pull from the heaviest group when it is already + * under-utilized (possible with a large weight task outweighs + * the tasks on the system). 
*/ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) { - sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); - sgs->group_type = group_classify(sg, sgs); + group_has_capacity(env, &sds->local_stat) && + (sgs->sum_nr_running > 1)) { + sgs->group_no_capacity = 1; + sgs->group_type = group_overloaded; } if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -6442,11 +6446,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity_factor); - - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); - load_above_capacity /= busiest->group_capacity; + load_above_capacity = busiest->sum_nr_running * + SCHED_LOAD_SCALE; + if (load_above_capacity > busiest->group_capacity) + load_above_capacity -= busiest->group_capacity; + else + load_above_capacity = ~0UL; } /* @@ -6509,6 +6514,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; + /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; @@ -6529,8 +6535,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && - !busiest->group_has_free_capacity) + if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + busiest->group_no_capacity) goto force_balance; /* @@ -6589,7 +6595,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long capacity, capacity_factor, wl; + unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6618,9 +6624,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6628,7 +6631,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ - if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) + + if (rq->nr_running == 1 && wl > env->imbalance && + !check_cpu_capacity(rq, env->sd)) continue; /* From 04a1199ee30852a19690170de41c82a92342c1cd Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 3 Mar 2015 11:35:03 +0100 Subject: [PATCH 132/420] sched: Remove unused struct sched_group_capacity::capacity_orig The 'struct sched_group_capacity::capacity_orig' field is no longer used in the scheduler so we can remove it. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425378903-5349-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ------------ kernel/sched/fair.c | 13 +++---------- kernel/sched/sched.h | 2 +- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 110e53afefbd9b..1805fd21b56f57 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5434,17 +5434,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - /* - * Even though we initialize ->capacity to something semi-sane, - * we leave capacity_orig unset. This allows us to detect if - * domain iteration is still funny without causing /0 traps. - */ - if (!group->sgc->capacity_orig) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); - break; - } - if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); @@ -5929,7 +5918,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3fef22601fb709..3ac0e34612eb60 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5892,7 +5892,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = capacity; - sdg->sgc->capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -5908,7 +5907,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, capacity_orig; + unsigned long capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -5920,7 +5919,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) return; } - capacity_orig = capacity = 0; + capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5940,19 +5939,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Use capacity_of(), which is set irrespective of domains * in update_cpu_capacity(). * - * This avoids capacity/capacity_orig from being 0 and + * This avoids capacity from being 0 and * causing divide-by-zero issues on boot. - * - * Runtime updates will correct capacity_orig. 
*/ if (unlikely(!rq->sd)) { - capacity_orig += capacity_orig_of(cpu); capacity += capacity_of(cpu); continue; } sgc = rq->sd->groups->sgc; - capacity_orig += sgc->capacity_orig; capacity += sgc->capacity; } } else { @@ -5963,13 +5958,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity_orig += group->sgc->capacity_orig; capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgc->capacity_orig = capacity_orig; sdg->sgc->capacity = capacity; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 996154df45cc9c..8238d0a2f6e99d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -762,7 +762,7 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity, capacity_orig; + unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* From ebfe870504f4de0dcce58461fc6d03ca017e6dc9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:12 +0100 Subject: [PATCH 133/420] sched/fair: Name utilization related data and functions consistently Use the advent of the per-entity load tracking rewrite to streamline the naming of utilization related data and functions by using {prefix_}util{_suffix} consistently. Moreover call both signals ({se,cfs}.avg.util_avg) utilization. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 9e91d61d9b0ca8d865dbd59af8d0d5c5b68003e9) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3ac0e34612eb60..d03d9fd19eec35 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4649,31 +4649,32 @@ static int select_idle_sibling(struct task_struct *p, int target) return target; } /* - * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can - * compare the usage with the capacity of the CPU that is available for CFS - * task (ie cpu_capacity). + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). * cfs.avg.util_avg is the sum of running time of runnable tasks on a * CPU. It represents the amount of utilization of a CPU in the range - * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full - * capacity of the CPU because it's about the running time on this CPU. + * [0..SCHED_LOAD_SCALE]. The utilization of a CPU can't be higher than the + * full capacity of the CPU because it's about the running time on this CPU. * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE * because of unfortunate rounding in util_avg or just * after migrating tasks until the average stabilizes with the new running - * time. 
So we need to check that the usage stays into the range + * time. So we need to check that the utilization stays into the range * [0..cpu_capacity_orig] and cap if necessary. - * Without capping the usage, a group could be seen as overloaded (CPU0 usage - * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + * Without capping the utilization, a group could be seen as overloaded (CPU0 + * utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. */ -static int get_cpu_usage(int cpu) +static int cpu_util(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - if (usage >= SCHED_LOAD_SCALE) + if (util >= SCHED_LOAD_SCALE) return capacity; - return (usage * capacity) >> SCHED_LOAD_SHIFT; + return (util * capacity) >> SCHED_LOAD_SHIFT; } /* @@ -5783,7 +5784,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; - unsigned long group_usage; /* Total usage of the group */ + unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; @@ -6016,8 +6017,8 @@ static inline int sg_imbalanced(struct sched_group *group) * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. * We consider that a group has spare capacity if the * number of task is - * smaller than the number of CPUs or if the usage is lower than the available - * capacity for CFS tasks. + * smaller than the number of CPUs or if the utilization is lower than the + * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into * account the variance of the tasks' load and to return true if the available * capacity in meaningful for the load balancer. @@ -6031,7 +6032,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) return true; if ((sgs->group_capacity * 100) > - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; @@ -6052,7 +6053,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; if ((sgs->group_capacity * 100) < - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; @@ -6100,7 +6101,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->group_usage += get_cpu_usage(i); + sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) From 10e3160c90f6916cb7cb4e2c5899cac6b54b5d79 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:13 +0100 Subject: [PATCH 134/420] sched/fair: Get rid of scaling utilization by capacity_orig Utilization is currently scaled by capacity_orig, but since we now have frequency and cpu invariant cfs_rq.avg.util_avg, frequency and cpu scaling now happens as part of the utilization tracking itself. So cfs_rq.avg.util_avg should no longer be scaled in cpu_util(). 
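To illustrate the end result (a minimal sketch, not the hunk below; the helper name is made up): with an invariant util_avg expressed in the same unit as capacity, clamping is all that is left for cpu_util() to do.

/*
 * Illustrative sketch only: util_avg is assumed to already be frequency-
 * and cpu-invariant, i.e. expressed in the same unit as capacity_orig, so
 * no further (util * capacity) >> SCHED_LOAD_SHIFT rescaling is needed.
 */
static unsigned long cpu_util_sketch(unsigned long util_avg,
				     unsigned long capacity_orig)
{
	/* util_avg may transiently overshoot after migrations/rounding */
	return util_avg >= capacity_orig ? capacity_orig : util_avg;
}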
Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steve Muckle Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/55EDAF43.30500@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 231678b768da07d19ab5683a39eeb0c250631d02) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d03d9fd19eec35..d1be4d2f6418d7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4648,33 +4648,39 @@ static int select_idle_sibling(struct task_struct *p, int target) done: return target; } + /* * cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can * compare the utilization with the capacity of the CPU that is available for * CFS task (ie cpu_capacity). - * cfs.avg.util_avg is the sum of running time of runnable tasks on a - * CPU. It represents the amount of utilization of a CPU in the range - * [0..SCHED_LOAD_SCALE]. The utilization of a CPU can't be higher than the - * full capacity of the CPU because it's about the running time on this CPU. - * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE - * because of unfortunate rounding in util_avg or just - * after migrating tasks until the average stabilizes with the new running - * time. So we need to check that the utilization stays into the range - * [0..cpu_capacity_orig] and cap if necessary. - * Without capping the utilization, a group could be seen as overloaded (CPU0 - * utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). 
*/ static int cpu_util(int cpu) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - if (util >= SCHED_LOAD_SCALE) - return capacity; - - return (util * capacity) >> SCHED_LOAD_SHIFT; + return (util >= capacity) ? capacity : util; } /* From 07bcaa19df60f3f5340eae67615cbe9dd07fa5fa Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Aug 2015 17:23:14 +0100 Subject: [PATCH 135/420] sched/fair: Initialize task load and utilization before placing task on rq Task load or utilization is not currently considered in select_task_rq_fair(), but if we want that in the future we should make sure it is not zero for new tasks. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 98d8fd8126676f7ba6e133e65b2ca4b17989d32c) Signed-off-by: Ricky Liang --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1805fd21b56f57..0023561f962d87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2316,6 +2316,8 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + /* Initialize new task's runnable average */ + init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2325,8 +2327,6 @@ void wake_up_new_task(struct task_struct *p) set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; From 43b11f7b956cd6bf71fd419fa82e323d7d0661f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Sep 2015 15:05:42 +0200 Subject: [PATCH 136/420] sched/fair: Rename scale() to cap_scale() Rename scale() to cap_scale() to better reflect its purpose, it is after all not a general purpose scale function, it has SCHED_CAPACITY_SHIFT hardcoded in it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 54a21385facbdcd89a78e8c3e5025f04c5f2b59c) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d1be4d2f6418d7..42cd65e233093d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2284,7 +2284,7 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -#define scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* * We can represent the historical contribution to runnable average as the @@ -2357,7 +2357,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, * period and accrue it. 
*/ delta_w = 1024 - delta_w; - scaled_delta_w = scale(delta_w, scale_freq); + scaled_delta_w = cap_scale(delta_w, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta_w; if (cfs_rq) { @@ -2366,7 +2366,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } } if (running) - sa->util_sum += scale(scaled_delta_w, scale_cpu); + sa->util_sum += cap_scale(scaled_delta_w, scale_cpu); delta -= delta_w; @@ -2383,25 +2383,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* Efficiently calculate \sum (1..n_period) 1024*y^i */ contrib = __compute_runnable_contrib(periods); - contrib = scale(contrib, scale_freq); + contrib = cap_scale(contrib, scale_freq); if (weight) { sa->load_sum += weight * contrib; if (cfs_rq) cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += scale(contrib, scale_cpu); + sa->util_sum += cap_scale(contrib, scale_cpu); } /* Remainder of delta accrued against u_0` */ - scaled_delta = scale(delta, scale_freq); + scaled_delta = cap_scale(delta, scale_freq); if (weight) { sa->load_sum += weight * scaled_delta; if (cfs_rq) cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += scale(scaled_delta, scale_cpu); + sa->util_sum += cap_scale(scaled_delta, scale_cpu); sa->period_contrib += delta; From 617f4b26a898f5f7720b02c55b98993e1a5cff44 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Sep 2015 15:09:15 +0200 Subject: [PATCH 137/420] sched/fair: Optimize __update_load_avg() Prior to this patch; the line: scaled_delta_w = (delta_w * 1024) >> 10; which is the result of the default arch_scale_freq_capacity() function, turns into: 1b03: 49 89 d1 mov %rdx,%r9 1b06: 49 c1 e1 0a shl $0xa,%r9 1b0a: 49 c1 e9 0a shr $0xa,%r9 Which is silly; when made unsigned int, GCC recognises this as pointless ops and fails to emit them (confirmed on 4.9.3 and 5.1.1). Furthermore, afaict unsigned is actually the correct type for these fields anyway, as we've explicitly ruled out negative delta's earlier in this function. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 6115c793ca1a6e39c7c15159cbb47baa04009cb8) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42cd65e233093d..db61ff07861a4e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2320,7 +2320,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, { u64 delta, scaled_delta, periods; u32 contrib; - int delta_w, scaled_delta_w, decayed = 0; + unsigned int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); From 1f2a0fdc6bce90b107cfe36e576138d1921c1643 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 7 Sep 2015 14:57:22 +0100 Subject: [PATCH 138/420] sched/fair: Defer calling scaling functions Do not call the scaling functions in case time goes backwards or the last update of the sched_avg structure has happened less than 1024ns ago. 
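Schematically (a sketch of the control flow only, with a made-up function name and plain C types), the scale lookups simply move below the early returns so they are not paid for on the cheap paths:

/* Sketch: bail out before doing the arch_scale_*_capacity() lookups. */
static int update_load_avg_flow_sketch(unsigned long long now,
				       unsigned long long *last_update_time)
{
	unsigned long long delta = now - *last_update_time;

	if ((long long)delta < 0) {	/* time went backwards */
		*last_update_time = now;
		return 0;
	}

	delta >>= 10;			/* ~1us units; 1024ns granularity */
	if (!delta)			/* updated less than 1024ns ago */
		return 0;

	*last_update_time = now;

	/* only here would scale_freq/scale_cpu be fetched and applied */
	return 1;
}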
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/55EDA2E9.8040900@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 6f2b04524f0b38bfbb8413f98d2d6af234508309) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index db61ff07861a4e..c57c2e37b9d140 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2321,8 +2321,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, u64 delta, scaled_delta, periods; u32 contrib; unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); - unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + unsigned long scale_freq, scale_cpu; delta = now - sa->last_update_time; /* @@ -2343,6 +2342,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return 0; sa->last_update_time = now; + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; if (delta + delta_w >= 1024) { From fc5a5cfd95479c6d57b52990447440b6e8f74f16 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2015 09:06:17 +0200 Subject: [PATCH 139/420] sched/fair: Optimize per entity utilization tracking Currently the load_{sum,avg} and util_{sum,avg} tracking is asymmetric in that load tracking gets a 2^10 unit from the weight, but util gets no such factor. This results in more lost bits for util scaling and asymmetric scaling rules. Fix this by removing shifts, such that we gain the 2^10 factor from scaling. There is no risk of overflowing the u32 as the max value is now LOAD_AVG_MAX << 10, which is still well below UINT_MAX. This further entangles the assumption that both LOAD and CAPACITY shifts are the same (and 10) so put in an assertion for that. This fixes the math for the LOAD_RESOLUTION != 0 case. 
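A quick back-of-the-envelope check of the overflow claim (assuming LOAD_AVG_MAX is 47742 in this kernel; the helper exists only for illustration):

/*
 * With the shifts removed, util_sum carries the extra 2^10 factor from
 * scale_cpu, so its ceiling becomes LOAD_AVG_MAX << 10:
 *   47742 << 10 = 48,887,808, well below UINT_MAX (4,294,967,295),
 * and util_avg is then recovered as util_sum / LOAD_AVG_MAX.
 */
static unsigned int util_sum_ceiling_example(void)
{
	const unsigned int load_avg_max = 47742;	/* assumed value */

	return load_avg_max << 10;			/* 48887808 */
}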
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar (cherry picked from commit 006cdf025a33cb008c3d466bed311c2c347b458f) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c57c2e37b9d140..3cc6475cbdb6ad 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -682,7 +682,7 @@ void init_entity_runnable_average(struct sched_entity *se) sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = LOAD_AVG_MAX; + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -2284,6 +2284,10 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 +#error "load tracking assumes 2^10 as unit" +#endif + #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* @@ -2368,7 +2372,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } } if (running) - sa->util_sum += cap_scale(scaled_delta_w, scale_cpu); + sa->util_sum += scaled_delta_w * scale_cpu; delta -= delta_w; @@ -2392,7 +2396,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += cap_scale(contrib, scale_cpu); + sa->util_sum += contrib * scale_cpu; } /* Remainder of delta accrued against u_0` */ @@ -2403,7 +2407,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += cap_scale(scaled_delta, scale_cpu); + sa->util_sum += scaled_delta * scale_cpu; sa->period_contrib += delta; @@ -2413,7 +2417,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_avg = div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } - sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; } return decayed; @@ -2455,8 +2459,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - - ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); + sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, From a427e3cc013eaf968dffd6e2be85ae2d356e6fcf Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Tue, 13 Oct 2015 09:18:22 +0800 Subject: [PATCH 140/420] sched/fair: Fix overly small weight for interactive group entities Commit: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking") led to an overly small weight for interactive group entities. The bad case can be easily reproduced when a number of CPU hogs compete for the CPUs at the same time (thanks to Mike). This is largly because the task group's load average tracking cross CPUs lags behind the real changes. To fix this we accelerate the group share distribution process by using the load.weight of the cfs_rq. This may increase the entire group's share, but we have to do so to protect the (fragile) interactive tasks, especially from CPU hogs. 
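The distribution formula itself does not change; only its load input moves from the averaged signal to the instantaneous weight. A stripped-down sketch (hypothetical helper; the final clamping of the result is omitted):

/* Sketch of the share split: tg_shares * my_weight / total_weight. */
static long cfs_shares_sketch(long tg_shares, long cfs_rq_weight,
			      long tg_weight)
{
	long shares = tg_shares * cfs_rq_weight;

	if (tg_weight)
		shares /= tg_weight;

	return shares;
}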
Reported-by: Mike Galbraith Tested-by: Dietmar Eggemann Tested-by: Mike Galbraith Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1444699103-20272-1-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit fde7d22e01aa0d252fc5c95fa11f0dac35a4dd59) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3cc6475cbdb6ad..27d92e7d8a6d8a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2132,7 +2132,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) */ tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq_load_avg(cfs_rq); + tg_weight += cfs_rq->load.weight; return tg_weight; } @@ -2142,7 +2142,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) long tg_weight, load, shares; tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq_load_avg(cfs_rq); + load = cfs_rq->load.weight; shares = (tg->shares * load); if (tg_weight) From 9b9ab8af7e9a5defcdecf7f112f7dfeba2dc6225 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Tue, 13 Oct 2015 09:18:23 +0800 Subject: [PATCH 141/420] sched/fair: Update task group's load_avg after task migration When cfs_rq has cfs_rq->removed_load_avg set (when a task migrates from this cfs_rq), we need to update its contribution to the group's load_avg. This should not increase tg's update too much, because in most cases, the cfs_rq has already decayed its load_avg. Tested-by: Dietmar Eggemann Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1444699103-20272-2-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 3e386d56bafbb6d2540b49367444997fc671ea69) Signed-off-by: Ricky Liang --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 27d92e7d8a6d8a..f61bb4c9055383 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2447,13 +2447,14 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - int decayed; struct sched_avg *sa = &cfs_rq->avg; + int decayed, removed = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + removed = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -2470,7 +2471,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - return decayed; + return decayed || removed; } /* Update task and its cfs_rq load average */ From a5171bb9cc4fe28a54aee1219bf0f1c095932d81 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Oct 2015 13:49:30 +0200 Subject: [PATCH 142/420] sched/fair: Clean up the explanation around decaying load update misses Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas 
Gleixner Signed-off-by: Ingo Molnar (cherry picked from commit d937cdc59e363baf8d5c757d944b13ebfa33e729) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 53 ++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f61bb4c9055383..9fe2330d3046a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4019,42 +4019,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ /* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * The exact cpuload calculated at every tick would be: * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load + * + * If a cpu misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when cpu may be busy, then we have: + * + * load_n = (1 - 1/2^i)^n * load_0 + * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load * * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * load' = (1 - 1/2^i)^n * load + * + * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. + * This allows us to precompute the above in said factors, thereby allowing the + * reduction of an arbitrary n in O(log_2 n) steps. (See also + * fixed_power_int()) * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. + * The calculation is approximated on a 128 point scale. */ #define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; + +static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 64, 32, 8, 0, 0, 0, 0, 0 }, + { 96, 72, 40, 12, 1, 0, 0, 0 }, + { 112, 98, 75, 43, 15, 1, 0, 0 }, + { 120, 112, 98, 76, 45, 16, 2, 0 } +}; /* * Update cpu_load for any missed ticks, due to tickless idle. The backlog From 807874be7a96e9ffce86ae090407c1f3e158b22b Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 18 Nov 2015 09:34:59 +0900 Subject: [PATCH 143/420] sched/fair: Modify the comment about lock assumptions in migrate_task_rq_fair() The comment describing migrate_task_rq_fair() says that the caller should hold p->pi_lock. 
But in some cases the caller can hold task_rq(p)->lock instead of p->pi_lock. So the comment is broken and this patch fixes it. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447806899-20303-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry picked from commit 525628c73bd6af65f27d927e699e7460d7d55ed3) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9fe2330d3046a3..10ef8b40c01888 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4784,8 +4784,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. However, the caller only guarantees p->pi_lock is held; no - * other assumptions, including the state of rq->lock, should be made. + * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) { From 6a8d496f305e607a638f31f3896eafee9c6295c7 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Sat, 24 Oct 2015 01:16:19 +0900 Subject: [PATCH 144/420] sched/fair: Make it possible to account fair load avg consistently The current code accounts for the time a task was absent from the fair class (per ATTACH_AGE_LOAD). However it does not work correctly when a task got migrated or moved to another cgroup while outside of the fair class. This patch tries to address that by aging on migration. We locklessly read the 'last_update_time' stamp from both the old and new cfs_rq, ages the load upto the old time, and sets it to the new time. These timestamps should in general not be more than 1 tick apart from one another, so there is a definite bound on things. Signed-off-by: Byungchul Park [ Changelog, a few edits and !SMP build fix ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1445616981-29904-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry picked from commit ad936d8658fd348338cb7d42c577dac77892b074) Signed-off-by: Javi Merino --- kernel/sched/core.c | 4 ++++ kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 11 ++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0023561f962d87..aab70cdc423225 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2023,6 +2023,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10ef8b40c01888..371ede72f1990b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2438,6 +2438,52 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. 
+ */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then its up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task is less decayed, but giving + * the wakee more load sounds not bad. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8238d0a2f6e99d..eea3c77461e1d4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -310,7 +310,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -852,6 +860,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif From a4c85e5d9cbf6170721469fb4f91ef65c3c5fe1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Nov 2015 14:09:38 -0500 Subject: [PATCH 145/420] sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats() Part of the responsibility of the update_sg_lb_stats() function is to update the idle_cpus statistical counter in struct sg_lb_stats. This check is done by calling idle_cpu(). The idle_cpu() function, in turn, checks a number of fields within the run queue structure such as rq->curr and rq->nr_running. With the current layout of the run queue structure, rq->curr and rq->nr_running are in separate cachelines. The rq->curr variable is checked first followed by nr_running. As nr_running is also accessed by update_sg_lb_stats() earlier, it makes no sense to load another cacheline when nr_running is not 0 as idle_cpu() will always return false in this case. This patch eliminates this redundant cacheline load by checking the cached nr_running before calling idle_cpu(). 
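The change boils down to an ordering guarantee: with the cached value tested first, C's short-circuit && never evaluates idle_cpu(), and so never touches its cacheline, unless nr_running was 0. A trivial sketch of the pattern (names are illustrative, not the kernel loop):

/* Sketch: test the already-loaded counter before the expensive predicate. */
static unsigned int count_idle_sketch(const int nr_running[], int ncpu,
				      int (*is_idle)(int cpu))
{
	unsigned int idle_cpus = 0;
	int i;

	for (i = 0; i < ncpu; i++)
		if (!nr_running[i] && is_idle(i))	/* is_idle() skipped when busy */
			idle_cpus++;

	return idle_cpus;
}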
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas Hatch Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448478580-26467-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar (cherry picked from commit a426f99c91d1036767a7819aaaba6bd3191b7f06) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 371ede72f1990b..c0f0d149aef24c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6139,7 +6139,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -6156,7 +6156,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -6164,7 +6165,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; } From 6fa7a207ba3b26a35209917ea5da1586487e882e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 2 Dec 2015 13:41:49 -0500 Subject: [PATCH 146/420] sched/fair: Move the cache-hot 'load_avg' variable into its own cacheline If a system with large number of sockets was driven to full utilization, it was found that the clock tick handling occupied a rather significant proportion of CPU time when fair group scheduling and autogroup were enabled. Running a java benchmark on a 16-socket IvyBridge-EX system, the perf profile looked like: 10.52% 0.00% java [kernel.vmlinux] [k] smp_apic_timer_interrupt 9.66% 0.05% java [kernel.vmlinux] [k] hrtimer_interrupt 8.65% 0.03% java [kernel.vmlinux] [k] tick_sched_timer 8.56% 0.00% java [kernel.vmlinux] [k] update_process_times 8.07% 0.03% java [kernel.vmlinux] [k] scheduler_tick 6.91% 1.78% java [kernel.vmlinux] [k] task_tick_fair 5.24% 5.04% java [kernel.vmlinux] [k] update_cfs_shares In particular, the high CPU time consumed by update_cfs_shares() was mostly due to contention on the cacheline that contained the task_group's load_avg statistical counter. This cacheline may also contains variables like shares, cfs_rq & se which are accessed rather frequently during clock tick processing. This patch moves the load_avg variable into another cacheline separated from the other frequently accessed variables. It also creates a cacheline aligned kmemcache for task_group to make sure that all the allocated task_group's are cacheline aligned. By doing so, the perf profile became: 9.44% 0.00% java [kernel.vmlinux] [k] smp_apic_timer_interrupt 8.74% 0.01% java [kernel.vmlinux] [k] hrtimer_interrupt 7.83% 0.03% java [kernel.vmlinux] [k] tick_sched_timer 7.74% 0.00% java [kernel.vmlinux] [k] update_process_times 7.27% 0.03% java [kernel.vmlinux] [k] scheduler_tick 5.94% 1.74% java [kernel.vmlinux] [k] task_tick_fair 4.15% 3.92% java [kernel.vmlinux] [k] update_cfs_shares The %cpu time is still pretty high, but it is better than before. 
The benchmark results before and after the patch was as follows: Before patch - Max-jOPs: 907533 Critical-jOps: 134877 After patch - Max-jOPs: 916011 Critical-jOps: 142366 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Douglas Hatch Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Cc: Yuyang Du Link: http://lkml.kernel.org/r/1449081710-20185-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar (cherry picked from commit b0367629acf62a78404c467cd09df447c2fea804) Signed-off-by: Javi Merino --- kernel/sched/core.c | 10 +++++++--- kernel/sched/sched.h | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aab70cdc423225..4a1b28817417b7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7027,6 +7027,9 @@ int in_sched_functions(unsigned long addr) */ struct task_group root_task_group; LIST_HEAD(task_groups); + +/* Cacheline aligned slab cache for task_group */ +static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); @@ -7087,11 +7090,12 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED + task_group_cache = KMEM_CACHE(task_group, 0); + list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); - #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { @@ -7383,7 +7387,7 @@ static void free_sched_group(struct task_group *tg) free_fair_sched_group(tg); free_rt_sched_group(tg); autogroup_free(tg); - kfree(tg); + kmem_cache_free(task_group_cache, tg); } /* allocate runqueue etc for a new task group */ @@ -7391,7 +7395,7 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); + tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); if (!tg) return ERR_PTR(-ENOMEM); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eea3c77461e1d4..2f63ae6d760736 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -223,7 +223,12 @@ struct task_group { unsigned long shares; #ifdef CONFIG_SMP - atomic_long_t load_avg; + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; #endif #endif From 6ff9b8e68593f7bd07a50ae8d95efba76714a5dd Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 2 Dec 2015 13:41:50 -0500 Subject: [PATCH 147/420] sched/fair: Disable the task group load_avg update for the root_task_group Currently, the update_tg_load_avg() function attempts to update the tg's load_avg value whenever the load changes even for root_task_group where the load_avg value will never be used. This patch will disable the load_avg update when the given task group is the root_task_group. Running a Java benchmark with noautogroup and a 4.3 kernel on a 16-socket IvyBridge-EX system, the amount of CPU time (as reported by perf) consumed by task_tick_fair() which includes update_tg_load_avg() decreased from 0.71% to 0.22%, a more than 3X reduction. The Max-jOPs results also increased slightly from 983015 to 986449. 
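In sketch form (hypothetical helper; the 1/64 delta threshold is the pre-existing filter visible as context in the hunk below), the update policy becomes:

/*
 * Sketch of update_tg_load_avg()'s decision only: root-group updates are
 * skipped outright, every other group updates when forced or when the
 * delta exceeds ~1/64 of the previously published contribution.
 */
static int tg_load_avg_should_update(int is_root_tg, long delta,
				     long prev_contrib, int force)
{
	if (is_root_tg)
		return 0;

	if (delta < 0)
		delta = -delta;

	return force || delta > prev_contrib / 64;
}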
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas Hatch Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Cc: Yuyang Du Link: http://lkml.kernel.org/r/1449081710-20185-4-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar (cherry picked from commit aa0b7ae06387d40a988ce16a189082dee6e570bc) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c0f0d149aef24c..e2e796744e944a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2432,6 +2432,12 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; From 571180009759f0926cb9faee8bc39c2d359e2ca3 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Thu, 17 Dec 2015 07:34:27 +0800 Subject: [PATCH 148/420] sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() If a newly created task is selected to go to a different CPU in fork balance when it wakes up the first time, its load averages should not be removed from the source CPU since they are never added to it before. The same is also applicable to a never used group entity. Fix it in remove_entity_load_avg(): when entity's last_update_time is 0, simply return. This should precisely identify the case in question, because in other migrations, the last_update_time is set to 0 after remove_entity_load_avg(). Reported-by: Steve Muckle Signed-off-by: Yuyang Du [peterz: cfs_rq_last_update_time] Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20151216233427.GJ28098@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 0905f04eb21fc1c2e690bed5d0418a061d56c225) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e2e796744e944a..9813f65262f95c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2623,27 +2623,45 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -/* - * Task first catches up with cfs_rq, and then subtract - * itself from the cfs_rq (task must be off the queue now). 
- */ -void remove_entity_load_avg(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; - #ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ u64 last_update_time_copy; + u64 last_update_time; do { last_update_time_copy = cfs_rq->load_last_update_time_copy; smp_rmb(); last_update_time = cfs_rq->avg.last_update_time; } while (last_update_time != last_update_time_copy); + + return last_update_time; +} #else - last_update_time = cfs_rq->avg.last_update_time; +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} #endif +/* + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). + */ +void remove_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); From 7e14d7d52bb60ad70cc760f91894c694db59e613 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 17 Sep 2015 16:10:56 +0100 Subject: [PATCH 149/420] cpufreq: Frequency invariant scheduler load-tracking support Implements cpufreq_scale_freq_capacity() to provide the scheduler with a frequency scaling correction factor for more accurate load-tracking. The factor is: current_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, freq_scale should be a struct cpufreq_policy data member. But this would require that the scheduler hot path (__update_load_avg()) would have to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for freq_scale. Change-Id: I8aa1c424a33794db78c35a80c57184d992e71d2b Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 29 +++++++++++++++++++++++++++++ include/linux/cpufreq.h | 2 ++ 2 files changed, 31 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e3aabdf458e5f0..b12a7c2fc34292 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -278,6 +278,31 @@ static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) } #endif +/********************************************************************* + * FREQUENCY INVARIANT CPU CAPACITY * + *********************************************************************/ + +static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; + +static void +scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) +{ + unsigned long cur = freqs ? 
freqs->new : policy->cur; + unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + int cpu; + + pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", + cpumask_pr_args(policy->cpus), cur, policy->max, scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(freq_scale, cpu) = scale; +} + +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { @@ -381,6 +406,8 @@ void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, spin_unlock(&policy->transition_lock); + scale_freq_capacity(policy, freqs); + cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin); @@ -2200,6 +2227,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY, new_policy); + scale_freq_capacity(new_policy, NULL); + policy->min = new_policy->min; policy->max = new_policy->max; trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 56ea8a336016a6..640c6794e311c8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -609,4 +609,6 @@ int cpufreq_generic_init(struct cpufreq_policy *policy, void acct_update_power(struct task_struct *p, cputime_t cputime); +struct sched_domain; +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif /* _LINUX_CPUFREQ_H */ From 2bc3b001f14119814400af6cc1271b272ed08bf9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 25 Sep 2015 17:15:11 +0100 Subject: [PATCH 150/420] arm64: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Including in topology.h like for the arm arch doesn't work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's why cpufreq_scale_freq_capacity() has to be declared extern in topology.h. Change-Id: I492c1851063220336b4ef6834303f2d44a6daf7a Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 7ebcd31ce51cae..287c37ec78d500 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -24,6 +24,12 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +struct sched_domain; +extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +#endif + #else static inline void init_cpu_topology(void) { } From e9652ed9b812914ff6c3958482076041f86acc92 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 7 May 2015 18:46:15 +0100 Subject: [PATCH 151/420] sched: Store system-wide maximum cpu capacity in root domain To be able to compare the capacity of the target cpu with the highest cpu capacity of the system in the wakeup path, store the system-wide maximum cpu capacity in the root domain. 
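For a feel of the correction factor introduced in the cpufreq patch above (illustrative frequencies only; the helper is made up and not kernel code):

/*
 * freq_scale = (cur << SCHED_CAPACITY_SHIFT) / max, per the cpufreq patch
 * above. Example: cur = 1,200,000 kHz with max = 2,000,000 kHz gives
 * (1200000 << 10) / 2000000 = 614 out of 1024, i.e. load/util accrued at
 * that speed is weighted at roughly 60% of full capacity.
 */
static unsigned long freq_scale_example(unsigned long cur_khz,
					unsigned long max_khz)
{
	return (cur_khz << 10) / max_khz;	/* SCHED_CAPACITY_SHIFT == 10 */
}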
Change-Id: I9420f08940f3da18af4111e0b9c982d513a89e1a cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++++++++ kernel/sched/sched.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a1b28817417b7..b06344569f2937 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6658,6 +6658,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -6708,11 +6709,18 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); + + if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) + rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); + if (rq) + pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2f63ae6d760736..1bb98217b315ee 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -520,6 +520,9 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + /* Maximum cpu capacity in the system. */ + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; From 927ef969dcd593efd278f8b60f479023ed7efc40 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 19:53:49 +0100 Subject: [PATCH 152/420] sched: Add cpu capacity awareness to wakeup balancing Wakeup balancing is completely unaware of cpu capacity, cpu utilization and task utilization. The task is preferably placed on a cpu which is idle in the instant the wakeup happens. New tasks (SD_BALANCE_{FORK,EXEC} are placed on an idle cpu in the idlest group if such can be found, otherwise it goes on the least loaded one. Existing tasks (SD_BALANCE_WAKE) are placed on the previous cpu or an idle cpu sharing the same last level cache unless the wakee_flips heuristic in wake_wide() decides to fallback to considering cpus outside SD_LLC. Hence existing tasks are not guaranteed to get a chance to migrate to a different group at wakeup in case the current one has reduced cpu capacity (due RT/IRQ pressure or different uarch e.g. ARM big.LITTLE). They may eventually get pulled by other cpus doing periodic/idle/nohz_idle balance, but it may take quite a while before it happens. This patch adds capacity awareness to find_idlest_{group,queue} (used by SD_BALANCE_{FORK,EXEC} and SD_BALANCE_WAKE under certain circumstances) such that groups/cpus that can accommodate the waking task based on task utilization are preferred. In addition, wakeup of existing tasks (SD_BALANCE_WAKE) is sent through find_idlest_{group,queue} also if the task doesn't fit the capacity of the previous cpu to allow it to escape (override wake_affine) when necessary instead of relying on periodic/idle/nohz_idle balance to eventually sort it out. 
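Concretely, the fitness test added below reduces to a ~20% headroom requirement (capacity_margin = 1280). A sketch with made-up numbers:

/*
 * Sketch of __task_fits() from the hunk below, with capacity_margin = 1280:
 * a task fits iff capacity * 1024 > (cpu_util + task_util) * 1280.
 * Example (made-up numbers): cpu_util 300 + task_util 200 = 500 against a
 * capacity of 640: 640 * 1024 = 655360 > 500 * 1280 = 640000, so it fits;
 * once the cpu's utilization reaches 312 the same task no longer fits.
 */
static int task_fits_sketch(unsigned long cpu_util, unsigned long task_util,
			    unsigned long capacity)
{
	return capacity * 1024 > (cpu_util + task_util) * 1280;
}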
Change-Id: Icbf0126ba952152eede67297900d2c6b96ce6e0d cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9813f65262f95c..7879d850c9284a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4568,6 +4568,43 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) return 1; } +static inline unsigned long task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +static unsigned int capacity_margin = 1280; /* ~20% margin */ + +static inline bool __task_fits(struct task_struct *p, int cpu, int util) +{ + unsigned long capacity = capacity_of(cpu); + + util += task_util(p); + + return (capacity * 1024) > (util * capacity_margin); +} + +static inline bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + + if (capacity == max_capacity) + return true; + + if (capacity * capacity_margin > max_capacity * 1024) + return true; + + return __task_fits(p, cpu, 0); +} + +static int cpu_util(int cpu); + +static inline bool task_fits_spare(struct task_struct *p, int cpu) +{ + return __task_fits(p, cpu, cpu_util(cpu)); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -4577,7 +4614,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long fit_capacity = ULONG_MAX; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4608,6 +4647,15 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + /* + * Look for most energy-efficient group that can fit + * that can fit the task. 
+ */ + if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { + fit_capacity = capacity_of(i); + fit_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4621,6 +4669,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } } while (group = group->next, group != sd->groups); + if (fit_group) + return fit_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -4641,7 +4692,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (idle_cpu(i)) { + if (task_fits_spare(p, i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -4653,7 +4704,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if ((!idle || idle->exit_latency == min_exit_latency) && + } else if (idle_cpu(i) && + (!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -4662,6 +4714,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; + } else if (shallowest_idle_cpu == -1) { + /* + * If we haven't found an idle CPU yet + * pick a non-idle one that can fit the task as + * fallback. + */ + shallowest_idle_cpu = i; } } else { load = weighted_cpuload(i); @@ -4779,7 +4838,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { From d707992f9f561e22a2898472c827619b51760128 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 6 Jul 2015 15:01:10 +0100 Subject: [PATCH 153/420] sched: Consider spare cpu capacity at task wake-up find_idlest_group() selects the wake-up target group purely based on group load which leads to suboptimal choices in low load scenarios. An idle group with reduced capacity (due to RT tasks or different cpu type) isn't necessarily a better target than a lightly loaded group with higher capacity. The patch adds spare capacity as an additional group selection parameter. The target group is now selected based on the following criteria: 1. Return the group with the cpu with most spare capacity and this capacity is significant if such group exists. Significant spare capacity is currently at least 20% to spare. 2. Return the group with the lowest load, unless it is the local group in which case NULL is returned and the search is continued at the next (lower) level. 
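To make criterion 1 concrete (illustrative numbers only): the spare capacity of a cpu is capacity_of(cpu) - cpu_util(cpu), and it counts as significant when it exceeds capacity_margin - SCHED_LOAD_SCALE = 1280 - 1024 = 256, a quarter of SCHED_LOAD_SCALE. A group whose best cpu has capacity 1024 and utilization 600 offers 424 spare and qualifies; a reduced-capacity cpu of 430 with utilization 200 offers only 230 and does not.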
Change-Id: I50c1df2c5a5810e7b09875599d2267dc0138c809 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7879d850c9284a..dedf387d975c39 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4614,9 +4614,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL; + struct sched_group *fit_group = NULL, *spare_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; unsigned long fit_capacity = ULONG_MAX; + unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4624,7 +4625,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_capacity; int local_group; int i; @@ -4656,6 +4657,16 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, fit_capacity = capacity_of(i); fit_group = group; } + + /* + * Look for group which has most spare capacity on a + * single cpu. + */ + spare_capacity = capacity_of(i) - cpu_util(i); + if (spare_capacity > max_spare_capacity) { + max_spare_capacity = spare_capacity; + spare_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4672,6 +4683,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (fit_group) return fit_group; + if (spare_group) + return spare_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; From 0dc566cd1f0584bba977513d624efecf93421d17 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 26 Jan 2015 19:47:28 +0000 Subject: [PATCH 154/420] sched: Enable idle balance to pull single task towards cpu with higher capacity We do not want to miss out on the ability to pull a single remaining task from a potential source cpu towards an idle destination cpu. Add an extra criteria to need_active_balance() to kick off active load balance if the source cpu is over-utilized and has lower capacity than the destination cpu. Change-Id: I806386916122160260f0d564d39f885d6b04b723 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dedf387d975c39..2ca2655d4b526f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4605,6 +4605,11 @@ static inline bool task_fits_spare(struct task_struct *p, int cpu) return __task_fits(p, cpu, cpu_util(cpu)); } +static bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. 
@@ -6842,6 +6847,13 @@ static int need_active_balance(struct lb_env *env) return 1; } + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } From ab92f12e3664b97a8d573e6da381c3de012e6803 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 2 Jul 2015 17:16:34 +0100 Subject: [PATCH 155/420] sched: Prevent unnecessary active balance of single task in sched group Scenarios with the busiest group having just one task and the local being idle on topologies with sched groups with different numbers of cpus manage to dodge all load-balance bailout conditions resulting the nr_balance_failed counter to be incremented. This eventually causes a pointless active migration of the task. This patch prevents this by not incrementing the counter when the busiest group only has one task. ASYM_PACKING migrations and migrations due to reduced capacity should still take place as these are explicitly captured by need_active_balance(). A better solution would be to not attempt the load-balance in the first place, but that requires significant changes to the order of bailout conditions and statistics gathering. Change-Id: Ifb3e4305f31cb438ef2156278fed3bad501babc0 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2ca2655d4b526f..c6bb1fc215a297 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5446,6 +5446,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -6438,6 +6439,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + if (!env->sd->parent) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) @@ -7066,7 +7069,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); From 034e0aa148f1be164e3c273f66577a20dfc5a995 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:43:28 +0000 Subject: [PATCH 156/420] sched: Documentation for scheduler energy cost model This documentation patch provides an overview of the experimental scheduler energy costing model, associated data structures, and a reference recipe on how platforms can be characterized to derive energy models. 
Change-Id: I660906d9e23f79e2e0c4a0459c63ade62c61c4fa Signed-off-by: Morten Rasmussen --- Documentation/scheduler/sched-energy.txt | 362 +++++++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 Documentation/scheduler/sched-energy.txt diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt new file mode 100644 index 00000000000000..dab2f9088b336f --- /dev/null +++ b/Documentation/scheduler/sched-energy.txt @@ -0,0 +1,362 @@ +Energy cost model for energy-aware scheduling (EXPERIMENTAL) + +Introduction +============= + +The basic energy model uses platform energy data stored in sched_group_energy +data structures attached to the sched_groups in the sched_domain hierarchy. The +energy cost model offers two functions that can be used to guide scheduling +decisions: + +1. static unsigned int sched_group_energy(struct energy_env *eenv) +2. static int energy_diff(struct energy_env *eenv) + +sched_group_energy() estimates the energy consumed by all cpus in a specific +sched_group including any shared resources owned exclusively by this group of +cpus. Resources shared with other cpus are excluded (e.g. later level caches). + +energy_diff() estimates the total energy impact of a utilization change. That +is, adding, removing, or migrating utilization (tasks). + +Both functions use a struct energy_env to specify the scenario to be evaluated: + + struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; + }; + +sg_top: sched_group to be evaluated. Not used by energy_diff(). + +sg_cap: sched_group covering the cpus in the same frequency domain. Set by +sched_group_energy(). + +cap_idx: Capacity state to be used for energy calculations. Set by +find_new_capacity(). + +util_delta: Amount of utilization to be added, removed, or migrated. + +src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be +-1 if no source (e.g. task wake-up). + +dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1 +if utilization is removed (e.g. terminating tasks). + +energy: Result of sched_group_energy(). + +The metric used to represent utilization is the actual per-entity running time +averaged over time using a geometric series. Very similar to the existing +per-entity load-tracking, but _not_ scaled by task priority and capped by the +capacity of the cpu. The latter property does mean that utilization may +underestimate the compute requirements for task on fully/over utilized cpus. +The greatest potential for energy savings without affecting performance too much +is scenarios where the system isn't fully utilized. If the system is deemed +fully utilized load-balancing should be done with task load (includes task +priority) instead in the interest of fairness and performance. + + +Background and Terminology +=========================== + +To make it clear from the start: + +energy = [joule] (resource like a battery on powered devices) +power = energy/time = [joule/second] = [watt] + +The goal of energy-aware scheduling is to minimize energy, while still getting +the job done. That is, we want to maximize: + + performance [inst/s] + -------------------- + power [W] + +which is equivalent to minimizing: + + energy [J] + ----------- + instruction + +while still getting 'good' performance. It is essentially an alternative +optimization objective to the current performance-only objective for the +scheduler. 
This alternative considers two objectives: energy-efficiency and +performance. Hence, there needs to be a user controllable knob to switch the +objective. Since it is early days, this is currently a sched_feature +(ENERGY_AWARE). + +The idea behind introducing an energy cost model is to allow the scheduler to +evaluate the implications of its decisions rather than applying energy-saving +techniques blindly that may only have positive effects on some platforms. At +the same time, the energy cost model must be as simple as possible to minimize +the scheduler latency impact. + +Platform topology +------------------ + +The system topology (cpus, caches, and NUMA information, not peripherals) is +represented in the scheduler by the sched_domain hierarchy which has +sched_groups attached at each level that covers one or more cpus (see +sched-domains.txt for more details). To add energy awareness to the scheduler +we need to consider power and frequency domains. + +Power domain: + +A power domain is a part of the system that can be powered on/off +independently. Power domains are typically organized in a hierarchy where you +may be able to power down just a cpu or a group of cpus along with any +associated resources (e.g. shared caches). Powering up a cpu means that all +power domains it is a part of in the hierarchy must be powered up. Hence, it is +more expensive to power up the first cpu that belongs to a higher level power +domain than powering up additional cpus in the same high level domain. Two +level power domain hierarchy example: + + Power source + +-------------------------------+----... +per group PD G G + | +----------+ | + +--------+-------| Shared | (other groups) +per-cpu PD G G | resource | + | | +----------+ + +-------+ +-------+ + | CPU 0 | | CPU 1 | + +-------+ +-------+ + +Frequency domain: + +Frequency domains (P-states) typically cover the same group of cpus as one of +the power domain levels. That is, there might be several smaller power domains +sharing the same frequency (P-state) or there might be a power domain spanning +multiple frequency domains. + +From a scheduling point of view there is no need to know the actual frequencies +[Hz]. All the scheduler cares about is the compute capacity available at the +current state (P-state) the cpu is in and any other available states. For that +reason, and to also factor in any cpu micro-architecture differences, compute +capacity scaling states are called 'capacity states' in this document. For SMP +systems this is equivalent to P-states. For mixed micro-architecture systems +(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture +performance relative to the other cpus in the system. + +Energy modelling: +------------------ + +Due to the hierarchical nature of the power domains, the most obvious way to +model energy costs is therefore to associate power and energy costs with +domains (groups of cpus). Energy costs of shared resources are associated with +the group of cpus that share the resources, only the cost of powering the +cpu itself and any private resources (e.g. private L1 caches) is associated +with the per-cpu groups (lowest level). + +For example, for an SMP system with per-cpu power domains and a cluster level +(group of cpus) power domain we get the overall energy costs to be: + + energy = energy_cluster + n * energy_cpu + +where 'n' is the number of cpus powered up and energy_cluster is the cost paid +as soon as any cpu in the cluster is powered up. 
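A small worked example with made-up cost figures: if energy_cluster = 100 and energy_cpu = 50 (in whatever normalized unit the platform data uses), then one busy cpu costs 100 + 50 = 150 and a second busy cpu in the same cluster adds only 50, for a total of 200. Spreading the same two tasks across two clusters would instead cost 2 * (100 + 50) = 300, which is why packing work into an already powered-up domain can be the more energy-efficient choice.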
+ +The power and frequency domains can naturally be mapped onto the existing +sched_domain hierarchy and sched_groups by adding the necessary data to the +existing data structures. + +The energy model considers energy consumption from two contributors (shown in +the illustration below): + +1. Busy energy: Energy consumed while a cpu and the higher level groups that it +belongs to are busy running tasks. Busy energy is associated with the state of +the cpu, not an event. The time the cpu spends in this state varies. Thus, the +most obvious platform parameter for this contribution is busy power +(energy/time). + +2. Idle energy: Energy consumed while a cpu and higher level groups that it +belongs to are idle (in a C-state). Like busy energy, idle energy is associated +with the state of the cpu. Thus, the platform parameter for this contribution +is idle power (energy/time). + +Energy consumed during transitions from an idle-state (C-state) to a busy state +(P-state) or going the other way is ignored by the model to simplify the energy +model calculations. + + + Power + ^ + | busy->idle idle->busy + | transition transition + | + | _ __ + | / \ / \__________________ + |______________/ \ / + | \ / + | Busy \ Idle / Busy + | low P-state \____________/ high P-state + | + +------------------------------------------------------------> time + +Busy |--------------| |-----------------| + +Wakeup |------| |------| + +Idle |------------| + + +The basic algorithm +==================== + +The basic idea is to determine the total energy impact when utilization is +added or removed by estimating the impact at each level in the sched_domain +hierarchy starting from the bottom (sched_group contains just a single cpu). +The energy cost comes from busy time (sched_group is awake because one or more +cpus are busy) and idle time (in an idle-state). Energy model numbers account +for energy costs associated with all cpus in the sched_group as a group. + + for_each_domain(cpu, sd) { + sg = sched_group_of(cpu) + energy_before = curr_util(sg) * busy_power(sg) + + (1-curr_util(sg)) * idle_power(sg) + energy_after = new_util(sg) * busy_power(sg) + + (1-new_util(sg)) * idle_power(sg) + energy_diff += energy_before - energy_after + + } + + return energy_diff + +{curr, new}_util: The cpu utilization at the lowest level and the overall +non-idle time for the entire group for higher levels. Utilization is in the +range 0.0 to 1.0 in the pseudo-code. + +busy_power: The power consumption of the sched_group. + +idle_power: The power consumption of the sched_group when idle. + +Note: It is a fundamental assumption that the utilization is (roughly) scale +invariant. Task utilization tracking factors in any frequency scaling and +performance scaling differences due to difference cpu microarchitectures such +that task utilization can be used across the entire system. + + +Platform energy data +===================== + +struct sched_group_energy can be attached to sched_groups in the sched_domain +hierarchy and has the following members: + +cap_states: + List of struct capacity_state representing the supported capacity states + (P-states). struct capacity_state has two members: cap and power, which + represents the compute capacity and the busy_power of the state. The + list must be ordered by capacity low->high. + +nr_cap_states: + Number of capacity states in cap_states list. + +idle_states: + List of struct idle_state containing idle_state power cost for each + idle-state supported by the system orderd by shallowest state first. 
+ All states must be included at all level in the hierarchy, i.e. a + sched_group spanning just a single cpu must also include coupled + idle-states (cluster states). In addition to the cpuidle idle-states, + the list must also contain an entry for the idling using the arch + default idle (arch_idle_cpu()). Despite this state may not be a true + hardware idle-state it is considered the shallowest idle-state in the + energy model and must be the first entry. cpus may enter this state + (possibly 'active idling') if cpuidle decides not enter a cpuidle + idle-state. Default idle may not be used when cpuidle is enabled. + In this case, it should just be a copy of the first cpuidle idle-state. + +nr_idle_states: + Number of idle states in idle_states list. + +There are no unit requirements for the energy cost data. Data can be normalized +with any reference, however, the normalization must be consistent across all +energy cost data. That is, one bogo-joule/watt must be the same quantity for +data, but we don't care what it is. + +A recipe for platform characterization +======================================= + +Obtaining the actual model data for a particular platform requires some way of +measuring power/energy. There isn't a tool to help with this (yet). This +section provides a recipe for use as reference. It covers the steps used to +characterize the ARM TC2 development platform. This sort of measurements is +expected to be done anyway when tuning cpuidle and cpufreq for a given +platform. + +The energy model needs two types of data (struct sched_group_energy holds +these) for each sched_group where energy costs should be taken into account: + +1. Capacity state information + +A list containing the compute capacity and power consumption when fully +utilized attributed to the group as a whole for each available capacity state. +At the lowest level (group contains just a single cpu) this is the power of the +cpu alone without including power consumed by resources shared with other cpus. +It basically needs to fit the basic modelling approach described in "Background +and Terminology" section: + + energy_system = energy_shared + n * energy_cpu + +for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at +the lowest level. 'energy_shared' is included at the next level which +represents the group of cpus among which the resources are shared. + +This model is, of course, a simplification of reality. Thus, power/energy +attributions might not always exactly represent how the hardware is designed. +Also, busy power is likely to depend on the workload. It is therefore +recommended to use a representative mix of workloads when characterizing the +capacity states. + +If the group has no capacity scaling support, the list will contain a single +state where power is the busy power attributed to the group. The capacity +should be set to a default value (1024). + +When frequency domains include multiple power domains, the group representing +the frequency domain and all child groups share capacity states. This must be +indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at +all levels that share the capacity state must have the list of capacity states +with the power set to the contribution of the individual group. + +2. Idle power information + +Stored in the idle_states list. The power number is the group idle power +consumption in each idle state as well when the group is idle but has not +entered an idle-state ('active idle' as mentioned earlier). 
Due to the way the +energy model is defined, the idle power of the deepest group idle state can +alternatively be accounted for in the parent group busy power. In that case the +group idle state power values are offset such that the idle power of the +deepest state is zero. It is less intuitive, but it is easier to measure as +idle power consumed by the group and the busy/idle power of the parent group +cannot be distinguished without per group measurement points. + +Measuring capacity states and idle power: + +The capacity states' capacity and power can be estimated by running a benchmark +workload at each available capacity state. By restricting the benchmark to run +on subsets of cpus it is possible to extrapolate the power consumption of +shared resources. + +ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a +shared L2 cache. TC2 has on-chip energy counters per cluster. Running a +benchmark workload on just one cpu in a cluster means that power is consumed in +the cluster (higher level group) and a single cpu (lowest level group). Adding +another benchmark task to another cpu increases the power consumption by the +amount consumed by the additional cpu. Hence, it is possible to extrapolate the +cluster busy power. + +For platforms that don't have energy counters or equivalent instrumentation +built-in, it may be possible to use an external DAQ to acquire similar data. + +If the benchmark includes some performance score (for example sysbench cpu +benchmark), this can be used to record the compute capacity. + +Measuring idle power requires insight into the idle state implementation on the +particular platform. Specifically, if the platform has coupled idle-states (or +package states). To measure non-coupled per-cpu idle-states it is necessary to +keep one cpu busy to keep any shared resources alive to isolate the idle power +of the cpu from idle/busy power of the shared resources. The cpu can be tricked +into different per-cpu idle states by disabling the other states. Based on +various combinations of measurements with specific cpus busy and disabling +idle-states it is possible to extrapolate the idle-state power. From b99db932c71c46d2b38a367741cf97e5a63da1dc Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:45:51 +0000 Subject: [PATCH 157/420] sched: Make energy awareness a sched feature This patch introduces the ENERGY_AWARE sched feature, which is implemented using jump labels when SCHED_DEBUG is defined. It is statically set false when SCHED_DEBUG is not defined. Hence this doesn't allow energy awareness to be enabled without SCHED_DEBUG. This sched_feature knob will be replaced later with a more appropriate control knob when things have matured a bit. ENERGY_AWARE is based on per-entity load-tracking hence FAIR_GROUP_SCHED must be enable. This dependency isn't checked at compile time yet. Change-Id: I5f8e3ee3a1a3253ddce247a858850a29891c17f9 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++++ kernel/sched/features.h | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c6bb1fc215a297..4f6f21cf46528d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4477,6 +4477,11 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. 
* A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 4bd24c5ff8d2be..1b90a408d48331 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -80,3 +80,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) */ SCHED_FEAT(NUMA_RESIST_LOWER, false) #endif + +/* + * Energy aware scheduling. Use platform energy model to guide scheduling + * decisions optimizing for energy efficiency. + */ +SCHED_FEAT(ENERGY_AWARE, false) From d8cd6c04a4be97122f898dc5c9995b2e4cd6e5c8 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:08:45 +0000 Subject: [PATCH 158/420] sched: Introduce energy data structures The struct sched_group_energy represents the per sched_group related data which is needed for energy aware scheduling. It contains: (1) number of elements of the idle state array (2) pointer to the idle state array which comprises 'power consumption' for each idle state (3) number of elements of the capacity state array (4) pointer to the capacity state array which comprises 'compute capacity and power consumption' tuples for each capacity state The struct sched_group obtains a pointer to a struct sched_group_energy. The function pointer sched_domain_energy_f is introduced into struct sched_domain_topology_level which will allow the arch to pass a particular struct sched_group_energy from the topology shim layer into the scheduler core. The function pointer sched_domain_energy_f has an 'int cpu' parameter since the folding of two adjacent sd levels via sd degenerate doesn't work for all sd levels. I.e. it is not possible for example to use this feature to provide per-cpu energy in sd level DIE on ARM's TC2 platform. It was discussed that the folding of sd levels approach is preferable over the cpu parameter approach, simply because the user (the arch specifying the sd topology table) can introduce less errors. But since it is not working, the 'int cpu' parameter is the only way out. It's possible to use the folding of sd levels approach for sched_domain_flags_f and the cpu parameter approach for the sched_domain_energy_f at the same time though. With the use of the 'int cpu' parameter, an extra check function has to be provided to make sure that all cpus spanned by a sched group are provisioned with the same energy data. 
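To make the new data structures more concrete, below is a minimal sketch of how an arch shim layer might provide per-cluster energy data; every name and number in it is an illustrative placeholder, not part of this patch or of any real platform's model:

    static struct idle_state example_cluster_idle_states[] = {
            { .power = 25 },        /* arch default / active idle, shallowest */
            { .power =  5 },        /* cluster power-down */
    };

    static struct capacity_state example_cluster_cap_states[] = {
            { .cap = 150, .power =  60 },
            { .cap = 300, .power = 140 },
            { .cap = 430, .power = 250 },   /* list ordered by capacity low->high */
    };

    static struct sched_group_energy example_cluster_energy = {
            .nr_idle_states = ARRAY_SIZE(example_cluster_idle_states),
            .idle_states    = example_cluster_idle_states,
            .nr_cap_states  = ARRAY_SIZE(example_cluster_cap_states),
            .cap_states     = example_cluster_cap_states,
    };

    /* Matches the sched_domain_energy_f signature introduced below */
    static inline const struct sched_group_energy * const example_cluster_energy_fn(int cpu)
    {
            return &example_cluster_energy;
    }

Such a function would be wired into the arch's sched_domain_topology_level table through the new energy member so that the scheduler core can attach the data to the corresponding sched_groups.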
Change-Id: I334e67c803fbfc956ba339b770e21133a460a8dd cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- include/linux/sched.h | 19 +++++++++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 20 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index cc77b78b162d52..1c7efbd3f13b57 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -910,6 +910,22 @@ struct sched_domain_attr { extern int sched_domain_level_max; +struct capacity_state { + unsigned long cap; /* compute capacity */ + unsigned long power; /* power consumption at this compute capacity */ +}; + +struct idle_state { + unsigned long power; /* power consumption in this idle state */ +}; + +struct sched_group_energy { + unsigned int nr_idle_states; /* number of idle states */ + struct idle_state *idle_states; /* ptr to idle state array */ + unsigned int nr_cap_states; /* number of capacity states */ + struct capacity_state *cap_states; /* ptr to capacity state array */ +}; + struct sched_group; struct sched_domain { @@ -1008,6 +1024,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); +typedef +const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu); #define SDTL_OVERLAP 0x01 @@ -1020,6 +1038,7 @@ struct sd_data { struct sched_domain_topology_level { sched_domain_mask_f mask; sched_domain_flags_f sd_flags; + sched_domain_energy_f energy; int flags; int numa_level; struct sd_data data; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1bb98217b315ee..1ed2adc06b030c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -795,6 +795,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + const struct sched_group_energy const *sge; /* * The CPUs this group covers. From e57c23bd04f89508ff06904435d60a4e60c333e3 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:20:20 +0000 Subject: [PATCH 159/420] sched: Initialize energy data structures The sched_group_energy (sge) pointer of the first sched_group (sg) in the sched_domain (sd) is initialized to point to the appropriate (in terms of sd level and cpu) sge data defined in the arch and so to the correct part of the Energy Model (EM). Energy-aware scheduling allows that a system has only EM data up to a certain sd level (so called highest energy aware balancing sd level). A check in init_sched_energy() enforces that all sd's below this sd level contain EM data. The 'int cpu' parameter of sched_domain_energy_f requires that check_sched_energy_data() makes sure that all cpus spanned by a sg are provisioned with the same EM data. This patch has also been tested with feature FORCE_SD_OVERLAP enabled. Change-Id: I39bc0023c8c20e47b92bf56cc12ffb61b3609182 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b06344569f2937..8878937d6daa5c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6050,6 +6050,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } +/* + * Check that the per-cpu provided sd energy data is consistent for all cpus + * within the mask. 
+ */ +static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn, + const struct cpumask *cpumask) +{ + const struct sched_group_energy * const sge = fn(cpu); + struct cpumask mask; + int i; + + if (cpumask_weight(cpumask) <= 1) + return; + + cpumask_xor(&mask, cpumask, get_cpu_mask(cpu)); + + for_each_cpu(i, &mask) { + const struct sched_group_energy * const e = fn(i); + int y; + + BUG_ON(e->nr_idle_states != sge->nr_idle_states); + + for (y = 0; y < (e->nr_idle_states); y++) { + BUG_ON(e->idle_states[y].power != + sge->idle_states[y].power); + } + + BUG_ON(e->nr_cap_states != sge->nr_cap_states); + + for (y = 0; y < (e->nr_cap_states); y++) { + BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap); + BUG_ON(e->cap_states[y].power != + sge->cap_states[y].power); + } + } +} + +static void init_sched_energy(int cpu, struct sched_domain *sd, + sched_domain_energy_f fn) +{ + if (!(fn && fn(cpu))) + return; + + if (cpu != group_balance_cpu(sd->groups)) + return; + + if (sd->child && !sd->child->groups->sge) { + pr_err("BUG: EAS setup broken for CPU%d\n", cpu); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" energy data on %s but not on %s domain\n", + sd->name, sd->child->name); +#endif + return; + } + + check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups)); + + sd->groups->sge = fn(cpu); +} + /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -6697,10 +6757,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { + struct sched_domain_topology_level *tl = sched_domain_topology; + if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { + init_sched_energy(i, sd, tl->energy); claim_allocations(i, sd); init_sched_groups_capacity(i, sd); } From 7b75f97fc371932024353d5b58b6bdd83973b8bd Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:50:46 +0000 Subject: [PATCH 160/420] sched: Introduce SD_SHARE_CAP_STATES sched_domain flag cpufreq is currently keeping it a secret which cpus are sharing clock source. The scheduler needs to know about clock domains as well to become more energy aware. The SD_SHARE_CAP_STATES domain flag indicates whether cpus belonging to the sched_domain share capacity states (P-states). There is no connection with cpufreq (yet). The flag must be set by the arch specific topology code. 
Change-Id: Ief9ebe0c0e51eb55d4f0f0affe2e89731394bf3e cc: Russell King cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- arch/arm/kernel/topology.c | 3 ++- include/linux/sched.h | 1 + kernel/sched/core.c | 10 +++++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 89cfdd6e50cb1c..94655b3a741947 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -277,7 +277,8 @@ void store_cpu_topology(unsigned int cpuid) static inline int cpu_corepower_flags(void) { - return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN; + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; } static struct sched_domain_topology_level arm_topology[] = { diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c7efbd3f13b57..c23098aebed65d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -878,6 +878,7 @@ enum cpu_idle_type { #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ #define SD_NUMA 0x4000 /* cross-node balancing */ +#define SD_SHARE_CAP_STATES 0x8000 /* Domain members share capacity state */ #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8878937d6daa5c..cf2134063c2e90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5518,7 +5518,8 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { + SD_SHARE_POWERDOMAIN | + SD_SHARE_CAP_STATES)) { if (sd->groups != sd->groups->next) return 0; } @@ -5550,7 +5551,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + SD_SHARE_POWERDOMAIN | + SD_SHARE_CAP_STATES); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -6216,6 +6218,7 @@ static int sched_domains_curr_level; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_SHARE_CAP_STATES - describes shared capacity states * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -6225,7 +6228,8 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, int cpu) From 6498ce766cb6ca8f4643a4ab6dfbbb2f1fafca93 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 30 Apr 2015 11:53:48 +0100 Subject: [PATCH 161/420] arm64: Cpu invariant scheduler load-tracking and capacity support Provides the scheduler with a cpu scaling correction factor for more accurate load-tracking and cpu capacity handling. The Energy Model (EM) (in fact the capacity value of the last element of the capacity states vector of the core (MC) level sched_group_energy structure) is used as the source for this cpu scaling factor. The cpu capacity value depends on the micro-architecture and the maximum frequency of the cpu. The maximum frequency part should not be confused with the frequency invariant scheduler load-tracking support which deals with frequency related scaling due to DFVS functionality. 
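As an illustration of what this means in practice (the capacities here are placeholders, not measured data): if the core-level energy model lists a highest capacity state of cap = 1024 for the big cpus and cap = 430 for the LITTLE cpus, the per-cpu cpu_scale values become 1024 and 430 respectively, and arch_scale_cpu_capacity() then feeds those figures into the scheduler's load-tracking and capacity comparisons.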
Change-Id: Iddc0f792184ad9d3ad6cb1cfc405ff6e3778d194 Signed-off-by: Juri Lelli Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 4 +++- arch/arm64/kernel/topology.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 287c37ec78d500..131bdf4a6e6f3a 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -24,11 +24,13 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity -struct sched_domain; extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #else diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index b6ee26b0939a8a..e2b0e4aae1672f 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -23,6 +23,18 @@ #include #include +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; + +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -211,6 +223,21 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +static void update_cpu_capacity(unsigned int cpu) +{ + unsigned long capacity = SCHED_CAPACITY_SCALE; + + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } + + set_capacity_scale(cpu, capacity); + + pr_info("CPU%d: update cpu_capacity %lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); +} + static void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -269,6 +296,7 @@ void store_cpu_topology(unsigned int cpuid) topology_populated: update_siblings_masks(cpuid); + update_cpu_capacity(cpuid); } static void __init reset_cpu_topology(void) From 8b66011237dae6d019bf826456be54be534f508a Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 14:11:28 +0000 Subject: [PATCH 162/420] sched: Compute cpu capacity available at current frequency capacity_orig_of() returns the max available compute capacity of a cpu. For scale-invariant utilization tracking and energy-aware scheduling decisions it is useful to know the compute capacity available at the current OPP of a cpu. Change-Id: I95b37102016184b4252b8f370527cf332bcb5179 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen (am from https://patchwork.kernel.org/patch/7804931/) Signed-off-by: Punit Agrawal --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4f6f21cf46528d..4717b3002e09c7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4477,6 +4477,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. 
+ */ +static unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); From 5f2a663c98e13a2f9c39172407573c3b6d4a7eb7 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 11 Dec 2014 15:25:29 +0000 Subject: [PATCH 163/420] sched: Relocated cpu_util() and change return type Move cpu_util() to an earlier position in fair.c and change return type to unsigned long as negative usage doesn't make much sense. All other load and capacity related functions use unsigned long including the caller of cpu_util(). Change-Id: Iedd358ff4f5d5723c70ff480ca5d320748f6a05d cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 70 ++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4717b3002e09c7..3ff9ca2a4ed669 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4488,6 +4488,40 @@ static unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static unsigned long cpu_util(int cpu) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + return (util >= capacity) ? capacity : util; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); @@ -4614,8 +4648,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static int cpu_util(int cpu); - static inline bool task_fits_spare(struct task_struct *p, int cpu) { return __task_fits(p, cpu, cpu_util(cpu)); @@ -4814,40 +4846,6 @@ static int select_idle_sibling(struct task_struct *p, int target) return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. 
The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static int cpu_util(int cpu) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - return (util >= capacity) ? capacity : util; -} - /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, From f5bb8b64868341495c9e1493d219ccfd855f9cdc Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 17:08:52 +0000 Subject: [PATCH 164/420] sched: Highest energy aware balancing sched_domain level pointer Add another member to the family of per-cpu sched_domain shortcut pointers. This one, sd_ea, points to the highest level at which energy model is provided. At this level and all levels below all sched_groups have energy model data attached. Partial energy model information is possible but restricted to providing energy model data for lower level sched_domains (sd_ea and below) and leaving load-balancing on levels above to non-energy-aware load-balancing. For example, it is possible to apply energy-aware scheduling within each socket on a multi-socket system and let normal scheduling handle load-balancing between sockets. 
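A minimal sketch of how a later consumer of this shortcut might look; the helper name is hypothetical and not part of this patch:

    /*
     * Return the highest sched_domain at and below which energy model data
     * is available for @cpu, or NULL if there is none. Callers must hold
     * rcu_read_lock().
     */
    static inline struct sched_domain *energy_aware_domain(int cpu)
    {
            return rcu_dereference(per_cpu(sd_ea, cpu));
    }

Energy-aware balancing paths would start from this domain and leave everything above it to the regular load-based code, as described above.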
Change-Id: Ia215ae7812afde2f05c6f072476f906f82f4f84f cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 11 ++++++++++- kernel/sched/sched.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf2134063c2e90..f3f18443bce6fb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5736,11 +5736,12 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_PER_CPU(struct sched_domain *, sd_ea); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; + struct sched_domain *busy_sd = NULL, *ea_sd = NULL; int id = cpu; int size = 1; @@ -5761,6 +5762,14 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); + + for_each_domain(cpu, sd) { + if (sd->groups->sge) + ea_sd = sd; + else + break; + } + rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1ed2adc06b030c..b8dc41529e2876 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -771,6 +771,7 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +DECLARE_PER_CPU(struct sched_domain *, sd_ea); struct sched_group_capacity { atomic_t ref; From baa90020bf80c8151fe5ef0f138009d41339b41d Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 18 Dec 2014 14:47:18 +0000 Subject: [PATCH 165/420] sched: Calculate energy consumption of sched_group For energy-aware load-balancing decisions it is necessary to know the energy consumption estimates of groups of cpus. This patch introduces a basic function, sched_group_energy(), which estimates the energy consumption of the cpus in the group and any resources shared by the members of the group. NOTE: The function has five levels of identation and breaks the 80 character limit. Refactoring is necessary. Change-Id: Ia93c66bb272f082044c7b26f5d716cb99fe5bed5 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 4 ++ kernel/sched/fair.c | 156 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 1 + 3 files changed, 161 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3f18443bce6fb..4b54eff43e243a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5737,6 +5737,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); DEFINE_PER_CPU(struct sched_domain *, sd_ea); +DEFINE_PER_CPU(struct sched_domain *, sd_scs); static void update_top_cache_domain(int cpu) { @@ -5770,6 +5771,9 @@ static void update_top_cache_domain(int cpu) break; } rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd); + + sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES); + rcu_assign_pointer(per_cpu(sd_scs, cpu), sd); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3ff9ca2a4ed669..723f4d5ba724c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4527,6 +4527,162 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } +/* + * cpu_norm_util() returns the cpu util relative to a specific capacity, + * i.e. 
it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for + * energy calculations. Using the scale-invariant util returned by + * cpu_util() and approximating scale-invariant util by: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. + * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +{ + int util = cpu_util(cpu); + + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +static unsigned long group_max_util(struct sched_group *sg) +{ + int i; + unsigned long max_util = 0; + + for_each_cpu(i, sched_group_cpus(sg)) + max_util = max(max_util, cpu_util(i)); + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to it's + * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in + * energy calculations. Since task executions may or may not overlap in time in + * the group the true normalized util is between max(cpu_norm_util(i)) and + * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The + * latter is used as the estimate as it leads to a more pessimistic energy + * estimate (more busy). + */ +static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +{ + int i; + unsigned long util_sum = 0; + unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + + for_each_cpu(i, sched_group_cpus(sg)) + util_sum += cpu_norm_util(i, capacity); + + if (util_sum > SCHED_CAPACITY_SCALE) + return SCHED_CAPACITY_SCALE; + return util_sum; +} + +static int find_new_capacity(struct sched_group *sg, + const struct sched_group_energy const *sge) +{ + int idx; + unsigned long util = group_max_util(sg); + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) + return idx; + } + + return idx; +} + +/* + * sched_group_energy(): Computes the absolute energy consumption of cpus + * belonging to the sched_group including shared resources shared only by + * members of the group. Iterates over all cpus in the hierarchy below the + * sched_group starting from the bottom working it's way up before going to + * the next cpu until all cpus are covered at all levels. The current + * implementation is likely to gather the same util statistics multiple times. + * This can probably be done in a faster but more complex way. + * Note: sched_group_energy() may fail when racing with sched_domain updates. + */ +static int sched_group_energy(struct sched_group *sg_top) +{ + struct sched_domain *sd; + int cpu, total_energy = 0; + struct cpumask visit_cpus; + struct sched_group *sg; + + WARN_ON(!sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + + cpu = cpumask_first(&visit_cpus); + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + + if (!sd) + /* + * We most probably raced with hotplug; returning a + * wrong energy estimation is better than entering an + * infinite loop. + */ + return -EINVAL; + + if (sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + sg = sd->groups; + + /* Has this sched_domain already been visited? 
*/ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + struct sched_group *sg_cap_util; + unsigned long group_util; + int sg_busy_energy, sg_idle_energy, cap_idx; + + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + sg_cap_util = sg_shared_cap; + else + sg_cap_util = sg; + + cap_idx = find_new_capacity(sg_cap_util, sg->sge); + group_util = group_norm_util(sg, cap_idx); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) + >> SCHED_CAPACITY_SHIFT; + + total_energy += sg_busy_energy + sg_idle_energy; + + if (!sd->child) + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } +next_cpu: + continue; + } + + return total_energy; +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b8dc41529e2876..4f65c36f4b5ae0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -772,6 +772,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); DECLARE_PER_CPU(struct sched_domain *, sd_ea); +DECLARE_PER_CPU(struct sched_domain *, sd_scs); struct sched_group_capacity { atomic_t ref; From 03e6eef19bdecbee7d3a709df9e93f3ee66ae339 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 14:21:56 +0000 Subject: [PATCH 166/420] sched: Extend sched_group_energy to test load-balancing decisions Extended sched_group_energy() to support energy prediction with usage (tasks) added/removed from a specific cpu or migrated between a pair of cpus. Useful for load-balancing decision making. Change-Id: I5d583da3d47f13f84084c64012f869cfab14b08e cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 723f4d5ba724c3..02fbc0cccc3e99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4514,12 +4514,21 @@ static unsigned long capacity_curr_of(int cpu) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static unsigned long cpu_util(int cpu) +static unsigned long __cpu_util(int cpu, int delta) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - return (util >= capacity) ? capacity : util; + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); } static inline bool energy_aware(void) @@ -4527,8 +4536,18 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } +struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; +}; + /* - * cpu_norm_util() returns the cpu util relative to a specific capacity, + * __cpu_norm_util() returns the cpu util relative to a specific capacity, * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for * energy calculations. 
Using the scale-invariant util returned by * cpu_util() and approximating scale-invariant util by: @@ -4541,9 +4560,9 @@ static inline bool energy_aware(void) * * norm_util = running_time/time ~ util/capacity */ -static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) { - int util = cpu_util(cpu); + int util = __cpu_util(cpu, delta); if (util >= capacity) return SCHED_CAPACITY_SCALE; @@ -4551,13 +4570,25 @@ static unsigned long cpu_norm_util(int cpu, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static unsigned long group_max_util(struct sched_group *sg) +static int calc_util_delta(struct energy_env *eenv, int cpu) { - int i; + if (cpu == eenv->src_cpu) + return -eenv->util_delta; + if (cpu == eenv->dst_cpu) + return eenv->util_delta; + return 0; +} + +static +unsigned long group_max_util(struct energy_env *eenv) +{ + int i, delta; unsigned long max_util = 0; - for_each_cpu(i, sched_group_cpus(sg)) - max_util = max(max_util, cpu_util(i)); + for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { + delta = calc_util_delta(eenv, i); + max_util = max(max_util, __cpu_util(i, delta)); + } return max_util; } @@ -4571,31 +4602,36 @@ static unsigned long group_max_util(struct sched_group *sg) * latter is used as the estimate as it leads to a more pessimistic energy * estimate (more busy). */ -static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +static unsigned +long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i; + int i, delta; unsigned long util_sum = 0; - unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; - for_each_cpu(i, sched_group_cpus(sg)) - util_sum += cpu_norm_util(i, capacity); + for_each_cpu(i, sched_group_cpus(sg)) { + delta = calc_util_delta(eenv, i); + util_sum += __cpu_norm_util(i, capacity, delta); + } if (util_sum > SCHED_CAPACITY_SCALE) return SCHED_CAPACITY_SCALE; return util_sum; } -static int find_new_capacity(struct sched_group *sg, +static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy const *sge) { int idx; - unsigned long util = group_max_util(sg); + unsigned long util = group_max_util(eenv); for (idx = 0; idx < sge->nr_cap_states; idx++) { if (sge->cap_states[idx].cap >= util) - return idx; + break; } + eenv->cap_idx = idx; + return idx; } @@ -4609,16 +4645,16 @@ static int find_new_capacity(struct sched_group *sg, * This can probably be done in a faster but more complex way. * Note: sched_group_energy() may fail when racing with sched_domain updates. 
*/ -static int sched_group_energy(struct sched_group *sg_top) +static int sched_group_energy(struct energy_env *eenv) { struct sched_domain *sd; int cpu, total_energy = 0; struct cpumask visit_cpus; struct sched_group *sg; - WARN_ON(!sg_top->sge); + WARN_ON(!eenv->sg_top->sge); - cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; @@ -4650,17 +4686,16 @@ static int sched_group_energy(struct sched_group *sg_top) break; do { - struct sched_group *sg_cap_util; unsigned long group_util; int sg_busy_energy, sg_idle_energy, cap_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) - sg_cap_util = sg_shared_cap; + eenv->sg_cap = sg_shared_cap; else - sg_cap_util = sg; + eenv->sg_cap = sg; - cap_idx = find_new_capacity(sg_cap_util, sg->sge); - group_util = group_norm_util(sg, cap_idx); + cap_idx = find_new_capacity(eenv, sg->sge); + group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) @@ -4671,7 +4706,7 @@ static int sched_group_energy(struct sched_group *sg_top) if (!sd->child) cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); - if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) goto next_cpu; } while (sg = sg->next, sg != sd->groups); @@ -4680,7 +4715,8 @@ static int sched_group_energy(struct sched_group *sg_top) continue; } - return total_energy; + eenv->energy = total_energy; + return 0; } /* From 88297fadbd3662cc3649da8063a70553186b965f Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 6 Jan 2015 17:34:05 +0000 Subject: [PATCH 167/420] sched: Estimate energy impact of scheduling decisions Adds a generic energy-aware helper function, energy_diff(), that calculates energy impact of adding, removing, and migrating utilization in the system. Change-Id: I8b292301ae178d0463ca7293aa031b664d02c8bd cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02fbc0cccc3e99..5ab624cf2e9f99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4719,6 +4719,58 @@ static int sched_group_energy(struct energy_env *eenv) return 0; } +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); +} + +/* + * energy_diff(): Estimate the energy impact of changing the utilization + * distribution. eenv specifies the change: utilisation amount, source, and + * destination cpu. Source or destination cpu may be -1 in which case the + * utilization is removed from or added to the system (e.g. task wake-up). If + * both are specified, the utilization is migrated. + */ +static int energy_diff(struct energy_env *eenv) +{ + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1, energy_before = 0, energy_after = 0; + + struct energy_env eenv_before = { + .util_delta = 0, + .src_cpu = eenv->src_cpu, + .dst_cpu = eenv->dst_cpu, + }; + + if (eenv->src_cpu == eenv->dst_cpu) + return 0; + + sd_cpu = (eenv->src_cpu != -1) ? 
eenv->src_cpu : eenv->dst_cpu; + sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); + + if (!sd) + return 0; /* Error */ + + sg = sd->groups; + + do { + if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { + eenv_before.sg_top = eenv->sg_top = sg; + + if (sched_group_energy(&eenv_before)) + return 0; /* Invalid result abort */ + energy_before += eenv_before.energy; + + if (sched_group_energy(eenv)) + return 0; /* Invalid result abort */ + energy_after += eenv->energy; + } + } while (sg = sg->next, sg != sd->groups); + + return energy_after-energy_before; +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened From f3904dea022d6f547101a720cb7513af8d59c456 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 16:49:57 +0100 Subject: [PATCH 168/420] sched: Add over-utilization/tipping point indicator Energy-aware scheduling is only meant to be active while the system is _not_ over-utilized. That is, there are spare cycles available to shift tasks around based on their actual utilization to get a more energy-efficient task distribution without depriving any tasks. When above the tipping point, task placement is done the traditional way based on load_avg, spreading the tasks across as many cpus as possible based on priority scaled load to preserve smp_nice. Below the tipping point we want to use util_avg instead. We need to define a criterion for when we make the switch. The util_avg for each cpu converges towards 100% (1024) regardless of how many additional tasks we may put on it. If we define over-utilized as: sum_{cpus}(rq.cfs.avg.util_avg) + margin > sum_{cpus}(rq.capacity) some individual cpus may be over-utilized running multiple tasks even when the above condition is false. That should be okay as long as we try to spread the tasks out to avoid per-cpu over-utilization as much as possible and if all tasks have the _same_ priority. If the latter isn't true, we have to consider priority to preserve smp_nice. For example, we could have n_cpus nice=-10 util_avg=55% tasks and n_cpus/2 nice=0 util_avg=60% tasks. Balancing based on util_avg, we are likely to end up with nice=-10 tasks sharing cpus and nice=0 tasks getting their own, as we have 1.5*n_cpus tasks in total and 55%+55% is less over-utilized than 55%+60% for those cpus that have to be shared. The system utilization is only 85% of the system capacity, but we are breaking smp_nice. To be sure not to break smp_nice, we have defined over-utilization conservatively as when any cpu in the system is fully utilized at its highest frequency instead: cpu_rq(any).cfs.avg.util_avg + margin > cpu_rq(any).capacity IOW, as soon as one cpu is (nearly) 100% utilized, we switch to load_avg to factor in priority to preserve smp_nice. With this definition, we can skip periodic load-balance as no cpu has an always-running task when the system is not over-utilized. All tasks will be periodic and we can balance them at wake-up. This conservative condition does however mean that some scenarios that could benefit from energy-aware decisions even if one cpu is fully utilized would not get those benefits. For systems where some cpus might have reduced capacity (RT-pressure and/or big.LITTLE), we want periodic load-balance checks as soon as just a single cpu is fully utilized, as it might be one of those with reduced capacity, and in that case we want to migrate it.
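As a rough illustration of the per-cpu condition above (a sketch only, not necessarily the exact helper this series adds), one common formulation replaces the additive margin with a multiplicative one of about 25%, assuming capacity_of(), cpu_util() and a capacity_margin constant are available:

/*
 * Sketch: per-cpu tipping point test. With capacity_margin ~ 1280 a cpu is
 * treated as over-utilized once its utilization exceeds roughly 80% of its
 * current capacity.
 */
static bool cpu_overutilized(int cpu)
{
	return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
}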
Change-Id: Ifced3ecf6bc4d031ed6b366e00537aabcab825c1 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 31 +++++++++++++++++++++++++------ kernel/sched/sched.h | 3 +++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5ab624cf2e9f99..635b4420d5def7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3975,6 +3975,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static bool cpu_overutilized(int cpu); + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -3985,6 +3987,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_new = !(flags & ENQUEUE_WAKEUP); for_each_sched_entity(se) { if (se->on_rq) @@ -4016,9 +4019,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { add_nr_running(rq, 1); - + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) + rq->rd->overutilized = true; + } hrtick_update(rq); } @@ -6494,11 +6500,12 @@ static enum group_type group_classify(struct lb_env *env, * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; int i, nr_running; @@ -6532,6 +6539,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) + *overutilized = true; } /* Adjust by relative CPU capacity of the group */ @@ -6637,7 +6647,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6659,7 +6669,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -6703,8 +6713,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; - } + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) + env->dst_rq->rd->overutilized = overutilized; + } else { + if (!env->dst_rq->rd->overutilized && overutilized) + env->dst_rq->rd->overutilized = true; + } } /** @@ -8098,6 +8114,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (numabalancing_enabled) task_tick_numa(rq, curr); + + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + rq->rd->overutilized = true; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4f65c36f4b5ae0..f4b32fe3f69a9d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -505,6 +505,9 @@ struct root_domain { /* Indicate more than 
one runnable task for any CPU */ bool overload; + /* Indicate one or more cpus over-utilized (tipping point) */ + bool overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). From 65e9b801d0ea79512936902682d5e371b01fae49 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 27 Jan 2015 13:48:07 +0000 Subject: [PATCH 169/420] sched, cpuidle: Track cpuidle state index in the scheduler The idle-state of each cpu is currently pointed to by rq->idle_state but there isn't any information in the struct cpuidle_state that can used to look up the idle-state energy model data stored in struct sched_group_energy. For this purpose is necessary to store the idle state index as well. Ideally, the idle-state data should be unified. Change-Id: Ib013757dd192b6f6f0be098b3909f42cf2fb723b cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- drivers/cpuidle/cpuidle.c | 6 ++++++ include/linux/cpuidle.h | 3 +++ kernel/sched/idle.c | 10 ++++++++++ kernel/sched/sched.h | 21 +++++++++++++++++++++ 4 files changed, 40 insertions(+) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 125150dc6e81ed..f2d07a03330a9e 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -119,6 +119,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, ktime_t time_start, time_end; s64 diff; + /* Take note of the planned idle state. */ + sched_idle_set_state(target_state, index); + trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ktime_get(); @@ -127,6 +130,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, time_end = ktime_get(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + /* The cpu is no longer idle or about to enter idle. */ + sched_idle_set_state(NULL, -1); + if (!cpuidle_state_is_coupled(dev, drv, entered_state)) local_irq_enable(); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 25e0df6155a4c2..272170116754a8 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -182,6 +182,9 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif +/* kernel/sched/idle.c */ +extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index); + #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a); #else diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c47fce75e66648..1c8473949b7804 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -14,6 +14,16 @@ #include "sched.h" +/** + * sched_idle_set_state - Record idle state for the current CPU. + * @idle_state: State to record. 
+ */ +void sched_idle_set_state(struct cpuidle_state *idle_state, int index) +{ + idle_set_state(this_rq(), idle_state); + idle_set_state_idx(this_rq(), index); +} + static int __read_mostly cpu_idle_force_poll; void cpu_idle_poll_ctrl(bool enable) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f4b32fe3f69a9d..0df9a6c669ac6b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -670,6 +670,7 @@ struct rq { #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; + int idle_state_idx; #endif }; @@ -1224,6 +1225,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ + rq->idle_state_idx = idle_state_idx; +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state_idx; +} #else static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) @@ -1234,6 +1246,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + return -1; +} #endif extern void sysrq_sched_debug_show(void); From 6647d65060ca11ca4d2ceeee3a9f48d6293f7358 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 27 Jan 2015 14:04:17 +0000 Subject: [PATCH 170/420] sched: Determine the current sched_group idle-state To estimate the energy consumption of a sched_group in sched_group_energy() it is necessary to know which idle-state the group is in when it is idle. For now, it is assumed that this is the current idle-state (though it might be wrong). Based on the individual cpu idle-states group_idle_state() finds the group idle-state. Change-Id: Ic172164eab0b2b28e6a138d09e4d57dc19fccd05 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 635b4420d5def7..d212fa80343d11 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4641,6 +4641,20 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } +static int group_idle_state(struct sched_group *sg) +{ + int i, state = INT_MAX; + + /* Find the shallowest idle state in the sched group. 
*/ + for_each_cpu(i, sched_group_cpus(sg)) + state = min(state, idle_get_state_idx(cpu_rq(i))); + + /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ + state++; + + return state; +} + /* * sched_group_energy(): Computes the absolute energy consumption of cpus * belonging to the sched_group including shared resources shared only by @@ -4693,7 +4707,8 @@ static int sched_group_energy(struct energy_env *eenv) do { unsigned long group_util; - int sg_busy_energy, sg_idle_energy, cap_idx; + int sg_busy_energy, sg_idle_energy; + int cap_idx, idle_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) eenv->sg_cap = sg_shared_cap; @@ -4701,11 +4716,13 @@ static int sched_group_energy(struct energy_env *eenv) eenv->sg_cap = sg; cap_idx = find_new_capacity(eenv, sg->sge); + idle_idx = group_idle_state(sg); group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) - >> SCHED_CAPACITY_SHIFT; - sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) - >> SCHED_CAPACITY_SHIFT; + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) + * sg->sge->idle_states[idle_idx].power) + >> SCHED_CAPACITY_SHIFT; total_energy += sg_busy_energy + sg_idle_energy; From 3de391f3d1231ddf09efae2b6b826ff6f95c0608 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 20:03:19 +0100 Subject: [PATCH 171/420] sched: Energy-aware wake-up task placement Let available compute capacity and estimated energy impact select wake-up target cpu when energy-aware scheduling is enabled and the system in not over-utilized (above the tipping point). energy_aware_wake_cpu() attempts to find group of cpus with sufficient compute capacity to accommodate the task and find a cpu with enough spare capacity to handle the task within that group. Preference is given to cpus with enough spare capacity at the current OPP. Finally, the energy impact of the new target and the previous task cpu is compared to select the wake-up target cpu. Change-Id: Ia81ebf7ccb0c79bb1569e456ac772589d0be515b cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 89 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d212fa80343d11..c1db80244af399 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5113,6 +5113,86 @@ static int select_idle_sibling(struct task_struct *p, int target) return target; } +static int energy_aware_wake_cpu(struct task_struct *p, int target) +{ + struct sched_domain *sd; + struct sched_group *sg, *sg_target; + int target_max_cap = INT_MAX; + int target_cpu = task_cpu(p); + int i; + + sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); + + if (!sd) + return target; + + sg = sd->groups; + sg_target = sg; + + /* + * Find group with sufficient capacity. We only get here if no cpu is + * overutilized. We may end up overutilizing a cpu by adding the task, + * but that should not be any worse than select_idle_sibling(). + * load_balance() should sort it out later as we get above the tipping + * point. + */ + do { + /* Assuming all cpus are the same in group */ + int max_cap_cpu = group_first_cpu(sg); + + /* + * Assume smaller max capacity means more energy-efficient. + * Ideally we should query the energy model for the right + * answer but it easily ends up in an exhaustive search. 
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + + if (target_cpu != task_cpu(p)) { + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = task_cpu(p), + .dst_cpu = target_cpu, + }; + + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(task_cpu(p))) + return target_cpu; + + if (energy_diff(&eenv) >= 0) + return task_cpu(p); + } + + return target_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5138,8 +5218,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + energy_aware(); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5169,7 +5250,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } if (!sd) { - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) + new_cpu = energy_aware_wake_cpu(p, prev_cpu); + else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); } else while (sd) { From 25c6aa9d3fbe4dcaa61c606af98ce9bd61071c6b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 10 May 2015 15:17:32 +0100 Subject: [PATCH 172/420] sched: Consider a not over-utilized energy-aware system as balanced In case the system operates below the tipping point indicator, introduced in ("sched: Add over-utilization/tipping point indicator"), bail out in find_busiest_group after the dst and src group statistics have been checked. There is simply no need to move usage around because all involved cpus still have spare cycles available. For an energy-aware system below its tipping point, we rely on the task placement of the wakeup path. This works well for short running tasks. The existence of long running tasks on one of the involved cpus lets the system operate over its tipping point. To be able to move such a task (whose load can't be used to average the load among the cpus) from a src cpu with lower capacity than the dst_cpu, an additional rule has to be implemented in need_active_balance. 
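That additional rule is not part of this patch. As a hedged sketch of what such a need_active_balance() addition could look like (the exact condition and the use of capacity_of()/cpu_overutilized() here are assumptions for illustration):

	/*
	 * Sketch: allow active balance of a lone, too-big task from a
	 * lower-capacity src cpu to a higher-capacity dst cpu even though
	 * the load figures alone would not justify it.
	 */
	if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
	    env->src_rq->cfs.h_nr_running == 1 &&
	    cpu_overutilized(env->src_cpu) &&
	    !cpu_overutilized(env->dst_cpu))
		return 1;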
Change-Id: I02aa66bcdd10659ff2c748239fd8d22c218368e6 Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c1db80244af399..adc9a769d5ec04 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7041,6 +7041,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. */ update_sd_lb_stats(env, &sds); + + if (energy_aware() && !env->dst_rq->rd->overutilized) + goto out_balanced; + local = &sds.local_stat; busiest = &sds.busiest_stat; From 99f1185f792b1894ad5a23a8a38f0eecaf838344 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 3 Feb 2015 13:54:11 +0000 Subject: [PATCH 173/420] sched: Disable energy-unfriendly nohz kicks With energy-aware scheduling enabled nohz_kick_needed() generates many nohz idle-balance kicks which lead to nothing when multiple tasks get packed on a single cpu to save energy. This causes unnecessary wake-ups and hence wastes energy. Make these conditions depend on !energy_aware() for now until the energy-aware nohz story gets sorted out. Change-Id: Ieda9ec81c01795f8245c0dd3eb39ea42dba0fa32 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index adc9a769d5ec04..8d94def6893648 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8109,12 +8109,13 @@ static inline bool nohz_kick_needed(struct rq *rq) if (time_before(now, nohz.next_balance)) return false; - if (rq->nr_running >= 2) + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { + if (sd && !energy_aware()) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); From 7753e552ca707d421adff80ac380ae29240f2ebb Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 22 Sep 2015 16:47:48 +0100 Subject: [PATCH 174/420] cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support Implements cpufreq_scale_max_freq_capacity() to provide the scheduler with a maximum frequency scaling correction factor for more accurate load-tracking and cpu capacity handling by being able to deal with frequency capping. This scaling factor describes the influence of running a cpu with a current maximum frequency lower than the absolute possible maximum frequency on load tracking and cpu capacity. The factor is: current_max_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, max_freq_scale should be a struct cpufreq_policy data member. But this would require that the scheduler hot path (__update_load_avg()) would have to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for max_freq_scale. 
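As a worked example with hypothetical numbers: if policy->cpuinfo.max_freq is 2000000 kHz but thermal capping lowers policy->max to 1400000 kHz, the per-cpu factor becomes (1400000 << SCHED_CAPACITY_SHIFT) / 2000000 = 716, i.e. roughly 70% of SCHED_CAPACITY_SCALE (1024); with no cap in place it stays at 1024.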
Change-Id: I5bcf45a208deecd5d8d063adb9e1f06bda44ce98 Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 19 +++++++++++++++++++ include/linux/cpufreq.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b12a7c2fc34292..d1d9d60cfb9449 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -283,12 +283,14 @@ static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) *********************************************************************/ static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE; static void scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { unsigned long cur = freqs ? freqs->new : policy->cur; unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo; int cpu; pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", @@ -296,6 +298,18 @@ scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) for_each_cpu(cpu, policy->cpus) per_cpu(freq_scale, cpu) = scale; + + if (freqs) + return; + + scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq; + + pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n", + cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq, + scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(max_freq_scale, cpu) = scale; } unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) @@ -303,6 +317,11 @@ unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) return per_cpu(freq_scale, cpu); } +unsigned long cpufreq_scale_max_freq_capacity(int cpu) +{ + return per_cpu(max_freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 640c6794e311c8..b82824792b7c53 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -611,4 +611,5 @@ void acct_update_power(struct task_struct *p, cputime_t cputime); struct sched_domain; unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif /* _LINUX_CPUFREQ_H */ From f234d42266bc927fc7903ca2496f596e1bad119a Mon Sep 17 00:00:00 2001 From: Joseph Lo Date: Mon, 22 Apr 2013 14:39:18 +0800 Subject: [PATCH 175/420] CHROMIUM: sched: update the average of nr_running Doing a Exponential moving average per nr_running++/-- does not guarantee a fixed sample rate which induces errors if there are lots of threads being enqueued/dequeued from the rq (Linpack mt). Instead of keeping track of the avg, the scheduler now keeps track of the integral of nr_running and allows the readers to perform filtering on top. 
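As a sketch of how a reader-side filter on top of the exported integral could look (the snapshot bookkeeping and the use of div64_u64()/FSHIFT here are illustrative assumptions, not part of this patch):

/*
 * Sketch: derive an average nr_running over a sampling window from two
 * snapshots of the integral. The integral accumulates nr_running << FSHIFT
 * multiplied by time, so dividing the difference by the window length and
 * shifting down by FSHIFT yields the average task count.
 */
static unsigned int avg_nr_running(unsigned int cpu, u64 *prev_integral,
				   u64 *prev_time, u64 now)
{
	u64 integral = nr_running_integral(cpu);
	u64 delta = now - *prev_time;
	unsigned int avg = 0;

	if (delta)
		avg = div64_u64(integral - *prev_integral, delta) >> FSHIFT;

	*prev_integral = integral;
	*prev_time = now;

	return avg;
}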
Original-author: Sai Charan Gurrappadi Change-Id: Id946654f32fa8be0eaf9d8fa7c9a8039b5ef9fab Signed-off-by: Joseph Lo Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/174694 Reviewed-on: https://chromium-review.googlesource.com/272853 --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 30 +++++++++++++++++++++++++++ kernel/sched/sched.h | 48 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index c23098aebed65d..4643d4bc648aef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -172,6 +172,9 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); +#ifdef CONFIG_CPU_QUIET +extern u64 nr_running_integral(unsigned int cpu); +#endif extern void calc_global_load(unsigned long ticks); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b54eff43e243a..536419036af6d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2637,6 +2637,36 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +#ifdef CONFIG_CPU_QUIET +u64 nr_running_integral(unsigned int cpu) +{ + unsigned int seqcnt; + u64 integral; + struct rq *q; + + if (cpu >= nr_cpu_ids) + return 0; + + q = cpu_rq(cpu); + + /* + * Update average to avoid reading stalled value if there were + * no run-queue changes for a long time. On the other hand if + * the changes are happening right now, just read current value + * directly. + */ + + seqcnt = read_seqcount_begin(&q->ave_seqcnt); + integral = do_nr_running_integral(q); + if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) { + read_seqcount_begin(&q->ave_seqcnt); + integral = q->nr_running_integral; + } + + return integral; +} +#endif + void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { struct rq *rq = this_rq(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0df9a6c669ac6b..93e1abc799724c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -564,6 +564,13 @@ struct rq { #endif int skip_clock_update; +#ifdef CONFIG_CPU_QUIET + /* time-based average load */ + u64 nr_last_stamp; + u64 nr_running_integral; + seqcount_t ave_seqcnt; +#endif + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -1280,7 +1287,7 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -static inline void add_nr_running(struct rq *rq, unsigned count) +static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1308,11 +1315,48 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } } -static inline void sub_nr_running(struct rq *rq, unsigned count) +static inline void __sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; } +#ifdef CONFIG_CPU_QUIET +#define NR_AVE_SCALE(x) ((x) << FSHIFT) +static inline u64 do_nr_running_integral(struct rq *rq) +{ + s64 nr, deltax; + u64 nr_running_integral = rq->nr_running_integral; + + deltax = rq->clock_task - rq->nr_last_stamp; + nr = NR_AVE_SCALE(rq->nr_running); + + nr_running_integral += nr * deltax; + + return nr_running_integral; +} + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = 
do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __add_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __sub_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} +#else +#define add_nr_running __add_nr_running +#define sub_nr_running __sub_nr_running +#endif + static inline void rq_last_tick_reset(struct rq *rq) { #ifdef CONFIG_NO_HZ_FULL From ef4c3561f8849cd42ddc1c8f2c7fc9c850d9b224 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 26 Sep 2015 18:19:54 +0100 Subject: [PATCH 176/420] sched: Update max cpu capacity in case of max frequency constraints Wakeup balancing uses cpu capacity awareness and needs to know the system-wide maximum cpu capacity. Patch "sched: Store system-wide maximum cpu capacity in root domain" finds the system-wide maximum cpu capacity during scheduler domain hierarchy setup. This is sufficient as long as maximum frequency invariance is not enabled. If it is enabled, the system-wide maximum cpu capacity can change between scheduler domain hierarchy setups due to frequency capping. The cpu capacity is changed in update_cpu_capacity() which is called in load balance on the lowest scheduler domain hierarchy level. To be able to know if a change in cpu capacity for a certain cpu also has an effect on the system-wide maximum cpu capacity it is normally necessary to iterate over all cpus. This would be way too costly. That's why this patch follows a different approach. The unsigned long max_cpu_capacity value in struct root_domain is replaced with a struct max_cpu_capacity, containing value (the max_cpu_capacity) and cpu (the cpu index of the cpu providing the maximum cpu_capacity). Changes to the system-wide maximum cpu capacity and the cpu index are made if: 1 System-wide maximum cpu capacity < cpu capacity 2 System-wide maximum cpu capacity > cpu capacity and cpu index == cpu There are no changes to the system-wide maximum cpu capacity in all other cases. Atomic read and write access to the pair (max_cpu_capacity.val, max_cpu_capacity.cpu) is enforced by max_cpu_capacity.lock. The access to max_cpu_capacity.val in task_fits_max() is still performed without taking the max_cpu_capacity.lock. The code to set max cpu capacity in build_sched_domains() has been removed because the whole functionality is now provided by update_cpu_capacity() instead. This approach can introduce errors temporarily, e.g. in case the cpu currently providing the max cpu capacity has its cpu capacity lowered due to frequency capping and calls update_cpu_capacity() before any cpu which might provide the max cpu now. There is also an outstanding question: Should the cpu capacity of a cpu going idle be set to a very small value? 
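As a worked example (capacities hypothetical): on a big.LITTLE system the pair may start as {val=1024, cpu=4}. If cpu4 is then frequency-capped so update_cpu_capacity() sees capacity 768, rule 2 applies (1024 > 768 and cpu index == 4) and the pair becomes {768, 4}. When an uncapped big cpu, say cpu5 with capacity 1024, next runs update_cpu_capacity(), rule 1 applies (768 < 1024) and the pair is corrected to {1024, 5}. Between those two updates the system-wide value is temporarily too low, which is the transient error described above.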
Change-Id: I6d4afc21bd4b3594f0b2bc63450834bb907ea1df Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++------ kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- kernel/sched/sched.h | 10 +++++++++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 536419036af6d1..0e93948ab43be9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5661,6 +5661,8 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; + + init_max_cpu_capacity(&rd->max_cpu_capacity); return 0; free_rto_mask: @@ -6822,15 +6824,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); - - if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) - rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); - if (rq) - pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); - ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8d94def6893648..a52a9b12bc11f5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4904,7 +4904,7 @@ static inline bool __task_fits(struct task_struct *p, int cpu, int util) static inline bool task_fits_max(struct task_struct *p, int cpu) { unsigned long capacity = capacity_of(cpu); - unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; if (capacity == max_capacity) return true; @@ -6407,13 +6407,43 @@ static unsigned long scale_rt_capacity(int cpu) return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 93e1abc799724c..6f74d9e19ce423 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -487,6 +487,12 @@ struct dl_rq { #ifdef CONFIG_SMP +struct max_cpu_capacity { + raw_spinlock_t lock; + unsigned long val; + int cpu; +}; + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -525,7 +531,7 @@ struct root_domain { struct cpupri cpupri; /* Maximum cpu capacity in the system. 
*/ - unsigned long max_cpu_capacity; + struct max_cpu_capacity max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -1287,6 +1293,8 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); + static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; From eb7522e891472c45ebf94a0f491467a4db284b02 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 25 Sep 2015 17:34:15 +0100 Subject: [PATCH 177/420] arm64: Enable max freq invariant scheduler load-tracking and capacity support Maximum Frequency Invariance has to be part of Cpu Invariance because Frequency Invariance deals only with differences in load-tracking introduces by Dynamic Frequency Scaling and not with limiting the possible range of cpu frequency. By placing Maximum Frequency Invariance into Cpu Invariance, load-tracking is scaled via arch_scale_cpu_capacity() in __update_load_avg() and cpu capacity is scaled via arch_scale_cpu_capacity() in update_cpu_capacity(). To be able to save the extra multiplication in the scheduler hotpath (__update_load_avg()) we could: 1 Inform cpufreq about base cpu capacity at boot and let it handle scale_cpu_capacity() as well. 2 Use the cpufreq policy callback which would update a per-cpu current cpu_scale and this value would be return in scale_cpu_capacity(). 3 Use per-cpu current max_freq_scale and current cpu_scale with the current patch. Including in topology.h like for the arm arch doesn't work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's why cpufreq_scale_max_freq_capacity() has to be declared extern in topology.h. Change-Id: I2c25fbc7748865651975ba212e1d9b2976d8f7d6 Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 1 + arch/arm64/kernel/topology.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 131bdf4a6e6f3a..4ea11bad40025b 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -28,6 +28,7 @@ struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +extern unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif #define arch_scale_cpu_capacity scale_cpu_capacity extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index e2b0e4aae1672f..fdaf17c0625100 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -27,7 +27,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { +#ifdef CONFIG_CPU_FREQ + unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); + + return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; +#else return per_cpu(cpu_scale, cpu); +#endif } static void set_capacity_scale(unsigned int cpu, unsigned long capacity) From 7783c63b123c6ffbd412398e5522c2fcff669312 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 13 Jan 2016 15:49:44 +0000 Subject: [PATCH 178/420] sched: Do eas idle balance regardless of the rq avg idle value EAS relies on idle balance to migrate a misfit task towards a cpu with higher capacity. 
When such a cpu becomes idle, idle balance should happen even if the rq avg idle is smaller than the sched migration cost (default 500us). The rq avg idle is updated during the wakeup of a task in case the rq has a non-null idle_stamp. This value stays unchanged and valid until the next task wakes up on this cpu after an idle period. So rq avg idle could be smaller than sched migration cost, preventing the idle balance from happening. In this case we would be at the mercy of wakeup, periodic or nohz-idle load balancing to put another task on this cpu. To break this dependency on rq avg idle, make EAS idle balance independent of the requirement that rq avg idle be larger than the sched migration cost. Change-Id: I880a25180062444d72947461d976dc44f9672f13 Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a52a9b12bc11f5..ece04ba8c04d7d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7619,8 +7619,9 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + if (!energy_aware() && + (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) From 2772ed24491e97597d98ccb4c2d2df073fff4314 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 13 Jan 2016 16:09:26 +0000 Subject: [PATCH 179/420] sched: Add per-cpu max capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all cpus in the sched_group. Unless it is divided by the group_weight to get the average capacity per cpu, it hides differences in cpu capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers cpus of different capacities. Instead, by extending struct sched_group_capacity to indicate max per-cpu capacity in the group, a suitable group for a given task utilization can easily be found such that cpus with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). 
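As a small example with hypothetical numbers: a sched_group of four LITTLE cpus with capacity 430 each has sgc->capacity = 1720, which alone suggests plenty of room for a task with utilization around 900, while sgc->max_capacity = 430 makes it clear that no single cpu in the group can actually fit such a task.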
*/ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ece04ba8c04d7d..15bb7fb5d2ebb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6452,13 +6452,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6471,6 +6472,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6495,11 +6497,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + max_capacity = max(capacity, max_capacity); } } else { /* @@ -6509,12 +6512,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = max_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6f74d9e19ce423..bcf13796303b8c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -797,7 +797,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long max_capacity; /* Max per-cpu capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* From 36fc941afc2b22f1821c6295303e4a089d77027a Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 13 Jan 2016 16:38:52 +0000 Subject: [PATCH 180/420] sched: Add group_misfit_task load-balance type To maximize throughput in systems with reduced capacity cpus (e.g. high RT/IRQ load and/or ARM big.LITTLE) load-balancing has to consider task and cpu utilization as well as per-cpu compute capacity when load-balancing in addition to the current average load based load-balancing policy. Tasks that are scheduled on a reduced capacity cpu need to be identified and migrated to a higher capacity cpu if possible. To implement this additional policy an additional group_type (load-balance scenario) is added: group_misfit_task. This represents scenarios where a sched_group has tasks that are not suitable for its per-cpu capacity. group_misfit_task is only considered if the system is not overloaded in any other way (group_imbalanced or group_overloaded). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. 
Change-Id: I092a348ed0ff37eae123f0d8d6dcf1435d51bfb1 Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 30 +++++++++++++++++++++++------- kernel/sched/sched.h | 1 + 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 15bb7fb5d2ebb3..de92c33d1864e6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5564,6 +5564,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; simple: cfs_rq = &rq->cfs; @@ -5585,9 +5587,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; idle: + rq->misfit_task = 0; + new_tasks = idle_balance(rq); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is @@ -5792,6 +5798,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6289,12 +6302,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -6310,6 +6317,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + int group_misfit_task; /* A cpu has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6626,6 +6634,9 @@ static enum group_type group_classify(struct lb_env *env, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task) + return group_misfit_task; + return group_other; } @@ -6677,8 +6688,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; - if (cpu_overutilized(i)) + if (cpu_overutilized(i)) { *overutilized = true; + if (!sgs->group_misfit_task && rq->misfit_task) + sgs->group_misfit_task = capacity_of(i); + } } /* Adjust by relative CPU capacity of the group */ @@ -8260,6 +8274,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; + + rq->misfit_task = !task_fits_max(curr, rq->cpu); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bcf13796303b8c..c5d1981d17da71 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -561,6 +561,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; + unsigned int misfit_task; #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; From 53f0fb16c97ea2ad68fc25a39ca4817264881432 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 28 Jul 2015 15:42:47 +0100 Subject: [PATCH 181/420] sched: Consider misfit tasks when load-balancing With the new group_misfit_task load-balancing scenario additional policy conditions are needed when load-balancing. Misfit task balancing only makes sense between source group with lower capacity than the target group. If capacities are the same, fallback to normal group_other balancing. 
The aim is to balance tasks such that no task has its throughput hindered by compute capacity if a cpu with more capacity is available. Load-balancing is generally based on average load in the sched_groups, but for misfitting tasks it is necessary to introduce exceptions to migrate tasks against usual metrics and optimize throughput. This patch ensures the following load-balance for mixed capacity systems (e.g. ARM big.LITTLE) for always-running tasks: 1. Place a task on each cpu starting in order from cpus with highest capacity to lowest until all cpus are in use (i.e. one task on each cpu). 2. Once all cpus are in use balance according to compute capacity such that load per capacity is approximately the same regardless of the compute capacity (i.e. big cpus get more tasks than little cpus). Necessary changes are introduced in find_busiest_group(), calculate_imbalance(), and find_busiest_queue(). This includes passing the group_type on to find_busiest_queue() through struct lb_env, which is currently only considers imbalance and not the imbalance situation (group_type). To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. Change-Id: I458461cebf269d6d4eeac6f83e4c84f4e4d7a9dd Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 71 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index de92c33d1864e6..019060c69247d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5834,6 +5834,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type busiest_group_type; struct list_head tasks; }; @@ -6624,6 +6625,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } + +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-cpu capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE < + ref->sgc->max_capacity; +} + static enum group_type group_classify(struct lb_env *env, struct sched_group *group, struct sg_lb_stats *sgs) @@ -6734,9 +6747,25 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. + */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; + if (sgs->avg_load <= busiest->avg_load) return false; + /* + * Candiate sg has no more than one task per cpu and has higher + * per-cpu capacity. No reason to pull tasks to less capable cpus. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -6842,6 +6871,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs->group_type = group_overloaded; } + /* + * Ignore task groups with misfit tasks if local group has no + * capacity or if per-cpu capacity isn't higher. 
+ */ + if (sgs->group_type == group_misfit_task && + (!group_has_capacity(env, &sds->local_stat) || + !group_smaller_cpu_capacity(sg, sds->local))) + sgs->group_type = group_other; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; @@ -7018,6 +7056,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } + + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } + env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -7051,6 +7105,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task); + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -7124,6 +7183,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfitting tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) { + goto force_balance; + } + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -7147,7 +7211,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * might end up to just move the imbalance on another group */ if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + (local->idle_cpus <= (busiest->idle_cpus + 1)) && + !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; } else { /* @@ -7160,6 +7225,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) } force_balance: + env->busiest_group_type = busiest->group_type; /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); return sds.busiest; @@ -7218,7 +7284,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) + !check_cpu_capacity(rq, env->sd) && + env->busiest_group_type != group_misfit_task) continue; /* From 457e82e74fa94cdb5503fd532afdad20841d4a63 Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Mon, 29 Jun 2015 17:56:20 +0100 Subject: [PATCH 182/420] Documentation: DT bindings for energy model cost data required by EAS EAS (energy aware scheduling) provides the scheduler with an alternative objective - energy efficiency - as opposed to it's current performance oriented objectives. EAS relies on a simple platform energy cost model to guide scheduling decisions. The model only considers the CPU subsystem. This patch adds documentation describing DT bindings that should be used to supply the scheduler with an energy cost model. 
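For orientation while reading the binding text below: the busy-cost and idle-cost tuples end up populating per-sched-group energy structures of roughly the following shape. This is a sketch based on the structures consumed by later patches in this series; the authoritative definitions live in the scheduler headers and are not part of this patch.

struct capacity_state {
	unsigned long cap;	/* compute capacity at this operating point */
	unsigned long power;	/* busy power at that capacity (unit-free, but consistent) */
};

struct idle_state {
	unsigned long power;	/* power while in this idle (C-) state */
};

struct sched_group_energy {
	unsigned int nr_idle_states;
	struct idle_state *idle_states;
	unsigned int nr_cap_states;
	struct capacity_state *cap_states;
};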
Change-Id: I312c8d2f46d3aed0b8f39bd6e4f1739699bc5944 Signed-off-by: Robin Randhawa --- .../bindings/scheduler/sched-energy-costs.txt | 360 ++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt new file mode 100644 index 00000000000000..11216f09e596b2 --- /dev/null +++ b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt @@ -0,0 +1,360 @@ +=========================================================== +Energy cost bindings for Energy Aware Scheduling +=========================================================== + +=========================================================== +1 - Introduction +=========================================================== + +This note specifies bindings required for energy-aware scheduling +(EAS)[1]. Historically, the scheduler's primary objective has been +performance. EAS aims to provide an alternative objective - energy +efficiency. EAS relies on a simple platform energy cost model to +guide scheduling decisions. The model only considers the CPU +subsystem. + +This note is aligned with the definition of the layout of physical +CPUs in the system as described in the ARM topology binding +description [2]. The concept is applicable to any system so long as +the cost model data is provided for those processing elements in +that system's topology that EAS is required to service. + +Processing elements refer to hardware threads, CPUs and clusters of +related CPUs in increasing order of hierarchy. + +EAS requires two key cost metrics - busy costs and idle costs. Busy +costs comprise of a list of compute capacities for the processing +element in question and the corresponding power consumption at that +capacity. Idle costs comprise of a list of power consumption values +for each idle state [C-state] that the processing element supports. +For a detailed description of these metrics, their derivation and +their use see [3]. + +These cost metrics are required for processing elements in all +scheduling domain levels that EAS is required to service. + +=========================================================== +2 - energy-costs node +=========================================================== + +Energy costs for the processing elements in scheduling domains that +EAS is required to service are defined in the energy-costs node +which acts as a container for the actual per processing element cost +nodes. A single energy-costs node is required for a given system. + +- energy-costs node + + Usage: Required + + Description: The energy-costs node is a container node and + it's sub-nodes describe costs for each processing element at + all scheduling domain levels that EAS is required to + service. + + Node name must be "energy-costs". + + The energy-costs node's parent node must be the cpus node. + + The energy-costs node's child nodes can be: + + - one or more cost nodes. + + Any other configuration is considered invalid. + +The energy-costs node can only contain a single type of child node +whose bindings are described in paragraph 4. 
+ +=========================================================== +3 - energy-costs node child nodes naming convention +=========================================================== + +energy-costs child nodes must follow a naming convention where the +node name must be "thread-costN", "core-costN", "cluster-costN" +depending on whether the costs in the node are for a thread, core or +cluster. N (where N = {0, 1, ...}) is the node number and has no +bearing to the OS' logical thread, core or cluster index. + +=========================================================== +4 - cost node bindings +=========================================================== + +Bindings for cost nodes are defined as follows: + +- cluster-cost node + + Description: must be declared within an energy-costs node. A + system can contain multiple clusters and each cluster + serviced by EAS must have a corresponding cluster-costs + node. + + The cluster-cost node name must be "cluster-costN" as + described in 3 above. + + A cluster-cost node must be a leaf node with no children. + + Properties for cluster-cost nodes are described in paragraph + 5 below. + + Any other configuration is considered invalid. + +- core-cost node + + Description: must be declared within an energy-costs node. A + system can contain multiple cores and each core serviced by + EAS must have a corresponding core-cost node. + + The core-cost node name must be "core-costN" as described in + 3 above. + + A core-cost node must be a leaf node with no children. + + Properties for core-cost nodes are described in paragraph + 5 below. + + Any other configuration is considered invalid. + +- thread-cost node + + Description: must be declared within an energy-costs node. A + system can contain cores with multiple hardware threads and + each thread serviced by EAS must have a corresponding + thread-cost node. + + The core-cost node name must be "core-costN" as described in + 3 above. + + A core-cost node must be a leaf node with no children. + + Properties for thread-cost nodes are described in paragraph + 5 below. + + Any other configuration is considered invalid. + +=========================================================== +5 - Cost node properties +========================================================== + +All cost node types must have only the following properties: + +- busy-cost-data + + Usage: required + Value type: An array of 2-item tuples. Each item is of type + u32. + Definition: The first item in the tuple is the capacity + value as described in [3]. The second item in the tuple is + the energy cost value as described in [3]. + +- idle-cost-data + + Usage: required + Value type: An array of 1-item tuples. The item is of type + u32. + Definition: The item in the tuple is the energy cost value + as described in [3]. + +=========================================================== +4 - Extensions to the cpu node +=========================================================== + +The cpu node is extended with a property that establishes the +connection between the processing element represented by the cpu +node and the cost-nodes associated with this processing element. + +The connection is expressed in line with the topological hierarchy +that this processing element belongs to starting with the level in +the hierarchy that this processing element itself belongs to through +to the highest level that EAS is required to service. The +connection cannot be sparse and must be contiguous from the +processing element's level through to the highest desired level. 
The +highest desired level must be the same for all processing elements. + +Example: Given that a cpu node may represent a thread that is a part +of a core, this property may contain multiple elements which +associate the thread with cost nodes describing the costs for the +thread itself, the core the thread belongs to, the cluster the core +belongs to and so on. The elements must be ordered from the lowest +level nodes to the highest desired level that EAS must service. The +highest desired level must be the same for all cpu nodes. The +elements must not be sparse: there must be elements for the current +thread, the next level of hierarchy (core) and so on without any +'holes'. + +Example: Given that a cpu node may represent a core that is a part +of a cluster of related cpus this property may contain multiple +elements which associate the core with cost nodes describing the +costs for the core itself, the cluster the core belongs to and so +on. The elements must be ordered from the lowest level nodes to the +highest desired level that EAS must service. The highest desired +level must be the same for all cpu nodes. The elements must not be +sparse: there must be elements for the current thread, the next +level of hierarchy (core) and so on without any 'holes'. + +If the system comprises of hierarchical clusters of clusters, this +property will contain multiple associations with the relevant number +of cluster elements in hierarchical order. + +Property added to the cpu node: + +- sched-energy-costs + + Usage: required + Value type: List of phandles + Definition: a list of phandles to specific cost nodes in the + energy-costs parent node that correspond to the processing + element represented by this cpu node in hierarchical order + of topology. + + The order of phandles in the list is significant. The first + phandle is to the current processing element's own cost + node. Subsequent phandles are to higher hierarchical level + cost nodes up until the maximum level that EAS is to + service. + + All cpu nodes must have the same highest level cost node. + + The phandle list must not be sparsely populated with handles + to non-contiguous hierarchical levels. See commentary above + for clarity. + + Any other configuration is invalid. + +=========================================================== +5 - Example dts +=========================================================== + +Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one +cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus): + +cpus { + #address-cells = <2>; + #size-cells = <0>; + . + . + . 
+ A57_0: cpu@0 { + compatible = "arm,cortex-a57","arm,armv8"; + reg = <0x0 0x0>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A57_L2>; + clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; + }; + + A57_1: cpu@1 { + compatible = "arm,cortex-a57","arm,armv8"; + reg = <0x0 0x1>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A57_L2>; + clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; + }; + + A53_0: cpu@100 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x100>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + A53_1: cpu@101 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x101>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + A53_2: cpu@102 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x102>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + A53_3: cpu@103 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x103>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + energy-costs { + CPU_COST_0: core-cost0 { + busy-cost-data = < + 417 168 + 579 251 + 744 359 + 883 479 + 1024 616 + >; + idle-cost-data = < + 15 + 0 + >; + }; + CPU_COST_1: core-cost1 { + busy-cost-data = < + 235 33 + 302 46 + 368 61 + 406 76 + 447 93 + >; + idle-cost-data = < + 6 + 0 + >; + }; + CLUSTER_COST_0: cluster-cost0 { + busy-cost-data = < + 417 24 + 579 32 + 744 43 + 883 49 + 1024 64 + >; + idle-cost-data = < + 65 + 24 + >; + }; + CLUSTER_COST_1: cluster-cost1 { + busy-cost-data = < + 235 26 + 303 30 + 368 39 + 406 47 + 447 57 + >; + idle-cost-data = < + 56 + 17 + >; + }; + }; +}; + +=============================================================================== +[1] https://lkml.org/lkml/2015/5/12/728 +[2] Documentation/devicetree/bindings/topology.txt +[3] Documentation/scheduler/sched-energy.txt From 5a640356c3f8b28e42c583b0bbe76eb61b466749 Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Mon, 29 Jun 2015 18:01:58 +0100 Subject: [PATCH 183/420] sched: Support for extracting EAS energy costs from DT This patch implements support for extracting energy cost data from DT. The data should conform to the DT bindings for energy cost data needed by EAS (energy aware scheduling). 
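Once the parser added below has run, the cost data is indexed per cpu and per scheduling-domain level in sge_array[][]. The accessor below only illustrates how a consumer is expected to look the data up; the function name is invented, but the arm64 topology code later in this series follows exactly this pattern.

/*
 * Illustrative lookup: returns NULL when DT provided no cost data for this
 * cpu at this scheduling-domain level (SD_LEVEL0 = core, SD_LEVEL1 = cluster).
 */
static const struct sched_group_energy *cpu_energy_sketch(int cpu, int sd_level)
{
	return sge_array[cpu][sd_level];
}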
Change-Id: Ia435bd4d4b111bb6257ffb2f5385b5f4b70d5aa6 Signed-off-by: Robin Randhawa --- include/linux/sched_energy.h | 36 ++++++++++ kernel/sched/Makefile | 2 +- kernel/sched/energy.c | 124 +++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched_energy.h create mode 100644 kernel/sched/energy.c diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h new file mode 100644 index 00000000000000..a3f1627ac609e0 --- /dev/null +++ b/include/linux/sched_energy.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_SCHED_ENERGY_H +#define _LINUX_SCHED_ENERGY_H + +#include +#include + +/* + * There doesn't seem to be an NR_CPUS style max number of sched domain + * levels so here's an arbitrary constant one for the moment. + * + * The levels alluded to here correspond to entries in struct + * sched_domain_topology_level that are meant to be populated by arch + * specific code (topology.c). + */ +#define NR_SD_LEVELS 8 + +#define SD_LEVEL0 0 +#define SD_LEVEL1 1 +#define SD_LEVEL2 2 +#define SD_LEVEL3 3 +#define SD_LEVEL4 4 +#define SD_LEVEL5 5 +#define SD_LEVEL6 6 +#define SD_LEVEL7 7 + +/* + * Convenience macro for iterating through said sd levels. + */ +#define for_each_possible_sd_level(level) \ + for (level = 0; level < NR_SD_LEVELS; level++) + +extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; + +void init_sched_energy_costs(void); + +#endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 4b6ceef49530c6..1baa3f1be29b01 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,7 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c new file mode 100644 index 00000000000000..b0656b7a93e337 --- /dev/null +++ b/kernel/sched/energy.c @@ -0,0 +1,124 @@ +/* + * Obtain energy cost data from DT and populate relevant scheduler data + * structures. + * + * Copyright (C) 2015 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#define pr_fmt(fmt) "sched-energy: " fmt + +#define DEBUG + +#include +#include +#include +#include +#include +#include + +struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; + +static void free_resources(void) +{ + int cpu, sd_level; + struct sched_group_energy *sge; + + for_each_possible_cpu(cpu) { + for_each_possible_sd_level(sd_level) { + sge = sge_array[cpu][sd_level]; + if (sge) { + kfree(sge->cap_states); + kfree(sge->idle_states); + kfree(sge); + } + } + } +} + +void init_sched_energy_costs(void) +{ + struct device_node *cn, *cp; + struct capacity_state *cap_states; + struct idle_state *idle_states; + struct sched_group_energy *sge; + const struct property *prop; + int sd_level, i, nstates, cpu; + const __be32 *val; + + for_each_possible_cpu(cpu) { + cn = of_get_cpu_node(cpu, NULL); + if (!cn) { + pr_warn("CPU device node missing for CPU %d\n", cpu); + return; + } + + if (!of_find_property(cn, "sched-energy-costs", NULL)) { + pr_warn("CPU device node has no sched-energy-costs\n"); + return; + } + + for_each_possible_sd_level(sd_level) { + cp = of_parse_phandle(cn, "sched-energy-costs", sd_level); + if (!cp) + break; + + prop = of_find_property(cp, "busy-cost-data", NULL); + if (!prop || !prop->value) { + pr_warn("No busy-cost data, skipping sched_energy init\n"); + goto out; + } + + sge = kcalloc(1, sizeof(struct sched_group_energy), + GFP_NOWAIT); + + nstates = (prop->length / sizeof(u32)) / 2; + cap_states = kcalloc(nstates, + sizeof(struct capacity_state), + GFP_NOWAIT); + + for (i = 0, val = prop->value; i < nstates; i++) { + cap_states[i].cap = be32_to_cpup(val++); + cap_states[i].power = be32_to_cpup(val++); + } + + sge->nr_cap_states = nstates; + sge->cap_states = cap_states; + + prop = of_find_property(cp, "idle-cost-data", NULL); + if (!prop || !prop->value) { + pr_warn("No idle-cost data, skipping sched_energy init\n"); + goto out; + } + + nstates = (prop->length / sizeof(u32)); + idle_states = kcalloc(nstates, + sizeof(struct idle_state), + GFP_NOWAIT); + + for (i = 0, val = prop->value; i < nstates; i++) + idle_states[i].power = be32_to_cpup(val++); + + sge->nr_idle_states = nstates; + sge->idle_states = idle_states; + + sge_array[cpu][sd_level] = sge; + } + } + + pr_info("Sched-energy-costs installed from DT\n"); + return; + +out: + free_resources(); +} From 854e1e7f9f564cc1953fe539df3c4ade0483d47c Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Tue, 9 Jun 2015 15:10:00 +0100 Subject: [PATCH 184/420] arm64, topology: Updates to use DT bindings for EAS costing data With the bindings and the associated accessors to extract data from the bindings in place, remove the static hard-coded data from topology.c and use the accesors instead. 
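The arm64_topology[] table below relies on struct sched_domain_topology_level carrying an energy accessor; that member is introduced by an earlier patch in this series which is not shown here. The layout sketched below is inferred from the initializers in the diff, with unrelated members elided, and should not be read as the exact definition.

struct sched_domain_topology_level {
	sched_domain_mask_f	mask;		/* cpus spanned at this level */
	sched_domain_flags_f	sd_flags;	/* e.g. cpu_corepower_flags */
	sched_domain_energy_f	energy;		/* e.g. cpu_core_energy / cpu_cluster_energy */
	/* ... remaining members (sd_data, debug name, ...) elided ... */
};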
Change-Id: Id2e68b26a5a7b33ec0b3dba8779bf1a2451c4abe Signed-off-by: Robin Randhawa --- arch/arm64/kernel/topology.c | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index fdaf17c0625100..2f98601318c38d 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -224,11 +226,52 @@ static int __init parse_dt_topology(void) struct cpu_topology cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu); + return NULL; + } + + return sge; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for CPU%d\n", cpu); + return NULL; + } + + return sge; +} + const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_sibling; } +static inline int cpu_corepower_flags(void) +{ + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; +} + +static struct sched_domain_topology_level arm64_topology[] = { +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + static void update_cpu_capacity(unsigned int cpu) { unsigned long capacity = SCHED_CAPACITY_SCALE; @@ -333,4 +376,8 @@ void __init init_cpu_topology(void) */ if (parse_dt_topology()) reset_cpu_topology(); + else + set_sched_topology(arm64_topology); + + init_sched_energy_costs(); } From 269d6b1d96ac1654a959f49191d30f95b22b7d81 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 23 Sep 2015 12:47:48 +0100 Subject: [PATCH 185/420] arm: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Change-Id: I7b82e9935533aaf2f24903bb05e7f0b1bcdb6a88 Signed-off-by: Dietmar Eggemann --- arch/arm/include/asm/topology.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 2fe85fff5ccacd..6b0b85455a02a0 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -24,6 +24,11 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#include +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +#endif + #else static inline void init_cpu_topology(void) { } From 477fc2d08492fa39dee68315d3c2f8ca21f8c8c1 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 14 Apr 2015 16:25:31 +0100 Subject: [PATCH 186/420] arm: Update arch_scale_cpu_capacity() to reflect change to define arch_scale_cpu_capacity() is no longer a weak function but a #define instead. Include the #define in topology.h. 
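The mechanism relied on here: generic scheduler code only provides a default when the arch has not defined the macro, roughly as sketched below, so defining arch_scale_cpu_capacity in the arch topology header makes scale_cpu_capacity() the implementation seen by load tracking and capacity updates. The snippet is a sketch of that guard pattern in kernel/sched/sched.h, not a quotation of it.

#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
	/* default: full capacity unless the arch overrides the macro */
	return SCHED_CAPACITY_SCALE;
}
#endif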
Change-Id: I372bd5e4c1e203428d72b18c8a806b06f3567ef6 cc: Russell King Signed-off-by: Morten Rasmussen --- arch/arm/include/asm/topology.h | 2 ++ arch/arm/kernel/topology.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 6b0b85455a02a0..94d3265019cb5c 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -28,6 +28,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu); #include #define arch_scale_freq_capacity cpufreq_scale_freq_capacity #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #else diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 94655b3a741947..8e03494ab1e80f 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -42,7 +42,7 @@ */ static DEFINE_PER_CPU(unsigned long, cpu_scale); -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { return per_cpu(cpu_scale, cpu); } From cd2a2ec242cd7e7066c13aef220ec5553510901e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 17:16:41 +0000 Subject: [PATCH 187/420] arm: topology: Define TC2 energy and provide it to the scheduler This patch is only here to be able to test provisioning of energy related data from an arch topology shim layer to the scheduler. Since there is no code today which deals with extracting energy related data from the dtb or acpi, and process it in the topology shim layer, the content of the sched_group_energy structures as well as the idle_state and capacity_state arrays are hard-coded here. This patch defines the sched_group_energy structure as well as the idle_state and capacity_state array for the cluster (relates to sched groups (sgs) in DIE sched domain level) and for the core (relates to sgs in MC sd level) for a Cortex A7 as well as for a Cortex A15. It further provides related implementations of the sched_domain_energy_f functions (cpu_cluster_energy() and cpu_core_energy()). To be able to propagate this information from the topology shim layer to the scheduler, the elements of the arm_topology[] table have been provisioned with the appropriate sched_domain_energy_f functions. Change-Id: I8c014bbd04f6a1d57892be9bfa16affe07948dcf cc: Russell King Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 126 ++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 8e03494ab1e80f..5429523ac62726 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -275,6 +275,127 @@ void store_cpu_topology(unsigned int cpuid) cpu_topology[cpuid].socket_id, mpidr); } +/* + * ARM TC2 specific energy cost model data. There are no unit requirements for + * the data. Data can be normalized to any reference point, but the + * normalization must be consistent. That is, one bogo-joule/watt must be the + * same quantity for all data, but we don't care what it is. 
+ */ +static struct idle_state idle_states_cluster_a7[] = { + { .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 25 }, /* WFI */ + { .power = 10 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_cluster_a15[] = { + { .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 70 }, /* WFI */ + { .power = 25 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_cluster_a7[] = { + /* Cluster only power */ + { .cap = 150, .power = 2967, }, /* 350 MHz */ + { .cap = 172, .power = 2792, }, /* 400 MHz */ + { .cap = 215, .power = 2810, }, /* 500 MHz */ + { .cap = 258, .power = 2815, }, /* 600 MHz */ + { .cap = 301, .power = 2919, }, /* 700 MHz */ + { .cap = 344, .power = 2847, }, /* 800 MHz */ + { .cap = 387, .power = 3917, }, /* 900 MHz */ + { .cap = 430, .power = 4905, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_cluster_a15[] = { + /* Cluster only power */ + { .cap = 426, .power = 7920, }, /* 500 MHz */ + { .cap = 512, .power = 8165, }, /* 600 MHz */ + { .cap = 597, .power = 8172, }, /* 700 MHz */ + { .cap = 682, .power = 8195, }, /* 800 MHz */ + { .cap = 768, .power = 8265, }, /* 900 MHz */ + { .cap = 853, .power = 8446, }, /* 1000 MHz */ + { .cap = 938, .power = 11426, }, /* 1100 MHz */ + { .cap = 1024, .power = 15200, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_cluster_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7), + .idle_states = idle_states_cluster_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a7), + .cap_states = cap_states_cluster_a7, +}; + +static struct sched_group_energy energy_cluster_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15), + .idle_states = idle_states_cluster_a15, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a15), + .cap_states = cap_states_cluster_a15, +}; + +static struct idle_state idle_states_core_a7[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_core_a15[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_core_a7[] = { + /* Power per cpu */ + { .cap = 150, .power = 187, }, /* 350 MHz */ + { .cap = 172, .power = 275, }, /* 400 MHz */ + { .cap = 215, .power = 334, }, /* 500 MHz */ + { .cap = 258, .power = 407, }, /* 600 MHz */ + { .cap = 301, .power = 447, }, /* 700 MHz */ + { .cap = 344, .power = 549, }, /* 800 MHz */ + { .cap = 387, .power = 761, }, /* 900 MHz */ + { .cap = 430, .power = 1024, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_core_a15[] = { + /* Power per cpu */ + { .cap = 426, .power = 2021, }, /* 500 MHz */ + { .cap = 512, .power = 2312, }, /* 600 MHz */ + { .cap = 597, .power = 2756, }, /* 700 MHz */ + { .cap = 682, .power = 3125, }, /* 800 MHz */ + { .cap = 768, .power = 3524, }, /* 900 MHz */ + { .cap = 853, .power = 3846, }, /* 1000 MHz */ + { .cap = 938, .power = 5177, }, /* 1100 MHz */ + { .cap = 1024, .power = 6997, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_core_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a7), + .idle_states = idle_states_core_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_core_a7), + .cap_states = cap_states_core_a7, +}; + +static struct sched_group_energy energy_core_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a15), + .idle_states = idle_states_core_a15, + 
.nr_cap_states = ARRAY_SIZE(cap_states_core_a15), + .cap_states = cap_states_core_a15, +}; + +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_cluster_a7 : + &energy_cluster_a15; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_core_a7 : + &energy_core_a15; +} + static inline int cpu_corepower_flags(void) { return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ @@ -283,10 +404,9 @@ static inline int cpu_corepower_flags(void) static struct sched_domain_topology_level arm_topology[] = { #ifdef CONFIG_SCHED_MC - { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, { NULL, }, }; From f7b35b6407fee3b48d005e58f2dbd27457c56c5d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 10 Jul 2015 13:57:19 +0100 Subject: [PATCH 188/420] arm: Cpu invariant scheduler load-tracking and capacity support Provides the scheduler with a cpu scaling correction factor for more accurate load-tracking and cpu capacity handling. The Energy Model (EM) (in fact the capacity value of the last element of the capacity states vector of the core (MC) level sched_group_energy structure) is used instead of the arm arch specific cpu_efficiency and dtb property 'clock-frequency' values as the source for this cpu scaling factor. The cpu capacity value depends on the micro-architecture and the maximum frequency of the cpu. The maximum frequency part should not be confused with the frequency invariant scheduler load-tracking support which deals with frequency related scaling due to DFVS functionality. Change-Id: I7588caa23bf3603eeb42223258fcc7e31182de2a Signed-off-by: Juri Lelli Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 5429523ac62726..b2bbedeca6bf4f 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -153,6 +153,8 @@ static void __init parse_dt_topology(void) } +static const struct sched_group_energy * const cpu_core_energy(int cpu); + /* * Look for a customed capacity of a CPU in the cpu_capacity table during the * boot. 
The update of all CPUs is in O(n^2) for heteregeneous system but the @@ -160,10 +162,14 @@ static void __init parse_dt_topology(void) */ static void update_cpu_capacity(unsigned int cpu) { - if (!cpu_capacity(cpu)) - return; + unsigned long capacity = SCHED_CAPACITY_SCALE; + + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } - set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); + set_capacity_scale(cpu, capacity); printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n", cpu, arch_scale_cpu_capacity(NULL, cpu)); From 4e16e4f3598a0f32fddb8a7f8ee9f9cb85f5f780 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 23 Sep 2015 17:59:55 +0100 Subject: [PATCH 189/420] arm: Enable max freq invariant scheduler load-tracking and capacity support Maximum Frequency Invariance has to be part of Cpu Invariance because Frequency Invariance deals only with differences in load-tracking introduces by Dynamic Frequency Scaling and not with limiting the possible range of cpu frequency. By placing Maximum Frequency Invariance into Cpu Invariance, load-tracking is scaled via arch_scale_cpu_capacity() in __update_load_avg() and cpu capacity is scaled via arch_scale_cpu_capacity() in update_cpu_capacity(). To be able to save the extra multiplication in the scheduler hotpath (__update_load_avg()) we could: 1 Inform cpufreq about base cpu capacity at boot and let it handle scale_cpu_capacity() as well. 2 Use the cpufreq policy callback which would update a per-cpu current cpu_scale and this value would be return in scale_cpu_capacity(). 3 Use per-cpu current max_freq_scale and current cpu_scale with the current patch. Change-Id: If5e9e0ba8ff5a5d3236b373dbce8c72ea71b5e18 Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index b2bbedeca6bf4f..18ea2fd925fa56 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -44,7 +44,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale); unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { +#if CONFIG_CPU_FREQ + unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); + + return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; +#else return per_cpu(cpu_scale, cpu); +#endif } static void set_capacity_scale(unsigned int cpu, unsigned long capacity) From 57a409f63a3355e53ca4b1fcd6f265a895250827 Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:27 +0100 Subject: [PATCH 190/420] cpufreq: introduce cpufreq_driver_is_slow Some architectures and platforms perform CPU frequency transitions through a non-blocking method, while some might block or sleep. Even when frequency transitions do not block or sleep they may be very slow. This distinction is important when trying to change frequency from a non-interruptible context in a scheduler hot path. Describe this distinction with a cpufreq driver flag, CPUFREQ_DRIVER_FAST. The default is to not have this flag set, thus erring on the side of caution. cpufreq_driver_is_slow() is also introduced in this patch. Setting the above flag will allow this function to return false. [smuckle@linaro.org: change flag/API to include drivers that are too slow for scheduler hot paths, in addition to those that block/sleep] Change-Id: I6f1d929ee175da71d2939c00bded8aacbca37ec5 Cc: Rafael J. 
Wysocki Cc: Viresh Kumar Signed-off-by: Michael Turquette Signed-off-by: Steve Muckle (am from https://patchwork.kernel.org/patch/7804821/) Signed-off-by: Punit Agrawal --- drivers/cpufreq/cpufreq.c | 6 ++++++ include/linux/cpufreq.h | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d1d9d60cfb9449..1e321ca24f252b 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -102,6 +102,12 @@ bool have_governor_per_policy(void) } EXPORT_SYMBOL_GPL(have_governor_per_policy); +bool cpufreq_driver_is_slow(void) +{ + return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST); +} +EXPORT_SYMBOL_GPL(cpufreq_driver_is_slow); + struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) { if (have_governor_per_policy()) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b82824792b7c53..c055a7938c2b02 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -158,6 +158,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); +bool cpufreq_driver_is_slow(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else static inline unsigned int cpufreq_get(unsigned int cpu) @@ -311,6 +312,14 @@ struct cpufreq_driver { */ #define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5) +/* + * Indicates that it is safe to call cpufreq_driver_target from + * non-interruptable context in scheduler hot paths. Drivers must + * opt-in to this flag, as the safe default is that they might sleep + * or be too slow for hot path use. + */ +#define CPUFREQ_DRIVER_FAST (1 << 6) + int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); From be4260a97dc5e732f96700db3ebc3902783125c3 Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:48 +0100 Subject: [PATCH 191/420] sched: scheduler-driven cpu frequency selection Scheduler-driven CPU frequency selection hopes to exploit both per-task and global information in the scheduler to improve frequency selection policy, achieving lower power consumption, improved responsiveness/performance, and less reliance on heuristics and tunables. For further discussion on the motivation of this integration see [0]. This patch implements a shim layer between the Linux scheduler and the cpufreq subsystem. The interface accepts capacity requests from the CFS, RT and deadline sched classes. The requests from each sched class are summed on each CPU with a margin applied to the CFS and RT capacity requests to provide some headroom. Deadline requests are expected to be precise enough given their nature to not require headroom. The maximum total capacity request for a CPU in a frequency domain drives the requested frequency for that domain. Policy is determined by both the sched classes and this shim layer. Note that this algorithm is event-driven. There is no polling loop to check cpu idle time nor any other method which is unsynchronized with the scheduler, aside from a throttling mechanism to ensure frequency changes are not attempted faster than the hardware can accommodate them. Thanks to Juri Lelli for contributing design ideas, code and test results, and to Ricky Liang for initialization and static key inc/dec fixes. 
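A worked example of the request-to-frequency mapping described above, mirroring the arithmetic in update_cpu_capacity_request() and update_fdomain_capacity_request() in the patch below; the numbers are invented, and the governor then rounds the result up to the nearest frequency-table entry (CPUFREQ_RELATION_L).

/*
 * With SCHED_CAPACITY_SCALE = 1024 and capacity_margin = 1280 (~20% headroom),
 * per-class requests of cfs = 300, rt = 100 and dl = 50 yield:
 */
static unsigned int example_freq_request(unsigned int policy_max)
{
	unsigned long cfs = 300, rt = 100, dl = 50;
	unsigned long total = (cfs + rt) * 1280 / 1024 + dl;	/* 500 + 50 = 550 */

	/* the largest 'total' among the policy's cpus drives the request */
	return total * policy_max >> 10;			/* ~54% of policy->max */
}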
[0] http://article.gmane.org/gmane.linux.kernel/1499836 [smuckle@linaro.org: various additions and fixes, revised commit text] Change-Id: Idca9ccbd0da925a176289ce705eba20d396c2bc0 CC: Ricky Liang Signed-off-by: Michael Turquette Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle (am from https://patchwork.kernel.org/patch/7804841/) Signed-off-by: Punit Agrawal --- drivers/cpufreq/Kconfig | 21 ++ include/linux/cpufreq.h | 3 + include/linux/sched.h | 8 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched.c | 358 +++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 51 +++++ 7 files changed, 443 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/cpufreq_sched.c diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 548311fd04c022..9ffc79f14e76af 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -112,6 +112,13 @@ config CPU_FREQ_DEFAULT_GOV_INTERACTIVE loading your cpufreq low-level hardware driver, using the 'interactive' governor for latency-sensitive workloads. +config CPU_FREQ_DEFAULT_GOV_SCHED + bool "sched" + select CPU_FREQ_GOV_SCHED + help + Use the CPUfreq governor 'sched' as default. This scales + cpu frequency using CPU utilization estimates from the + scheduler. endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -210,6 +217,20 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_SCHED + bool "'sched' cpufreq governor" + depends on CPU_FREQ + select CPU_FREQ_GOV_COMMON + help + 'sched' - this governor scales cpu frequency from the + scheduler as a function of cpu capacity utilization. It does + not evaluate utilization on a periodic basis (as ondemand + does) but instead is event-driven by the scheduler. + + If in doubt, say N. + +comment "CPU frequency scaling drivers" + config CPUFREQ_DT tristate "Generic DT based cpufreq driver" depends on HAVE_CLK && OF diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c055a7938c2b02..b2fc177cc499ee 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -495,6 +495,9 @@ extern struct cpufreq_governor cpufreq_gov_conservative; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE) extern struct cpufreq_governor cpufreq_gov_interactive; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_interactive) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED) +extern struct cpufreq_governor cpufreq_gov_sched; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched) #endif /********************************************************************* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4643d4bc648aef..69e8ba15b6d8d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -863,6 +863,14 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +struct sched_capacity_reqs { + unsigned long cfs; + unsigned long rt; + unsigned long dl; + + unsigned long total; +}; + /* * sched-domains (multiprocessor balancing) declarations: */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 1baa3f1be29b01..6990634a518b59 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c new file mode 100644 index 00000000000000..58bca8d2ca653e --- 
/dev/null +++ b/kernel/sched/cpufreq_sched.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) 2015 Michael Turquette + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +#define THROTTLE_NSEC 50000000 /* 50ms default */ + +struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; +static bool __read_mostly cpufreq_driver_slow; + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static struct cpufreq_governor cpufreq_gov_sched; +#endif + +static DEFINE_PER_CPU(unsigned long, enabled); +DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); + +/** + * gov_data - per-policy data internal to the governor + * @throttle: next throttling period expiry. Derived from throttle_nsec + * @throttle_nsec: throttle period length in nanoseconds + * @task: worker thread for dvfs transition that may block/sleep + * @irq_work: callback used to wake up worker thread + * @requested_freq: last frequency requested by the sched governor + * + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A + * per-policy instance of it is created when the cpufreq_sched governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). + */ +struct gov_data { + ktime_t throttle; + unsigned int throttle_nsec; + struct task_struct *task; + struct irq_work irq_work; + unsigned int requested_freq; +}; + +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, + unsigned int freq) +{ + struct gov_data *gd = policy->governor_data; + + /* avoid race with cpufreq_sched_stop */ + if (!down_write_trylock(&policy->rwsem)) + return; + + __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); + + gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + up_write(&policy->rwsem); +} + +static bool finish_last_request(struct gov_data *gd) +{ + ktime_t now = ktime_get(); + + if (ktime_after(now, gd->throttle)) + return false; + + while (1) { + int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + + usec_left /= NSEC_PER_USEC; + usleep_range(usec_left, usec_left + 100); + now = ktime_get(); + if (ktime_after(now, gd->throttle)) + return true; + } +} + +/* + * we pass in struct cpufreq_policy. 
This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears down all of the data structures and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int cpufreq_sched_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned int new_request = 0; + unsigned int last_request = 0; + int ret; + + policy = (struct cpufreq_policy *) data; + gd = policy->governor_data; + + param.sched_priority = 50; + ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); + if (ret) { + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + do_exit(-EINVAL); + } else { + pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", + __func__, gd->task->pid); + } + + do { + set_current_state(TASK_INTERRUPTIBLE); + new_request = gd->requested_freq; + if (new_request == last_request) { + schedule(); + } else { + /* + * if the frequency thread sleeps while waiting to be + * unthrottled, start over to check for a newer request + */ + if (finish_last_request(gd)) + continue; + last_request = new_request; + cpufreq_sched_try_driver_target(policy, new_request); + } + } while (!kthread_should_stop()); + + return 0; +} + +static void cpufreq_sched_irq_work(struct irq_work *irq_work) +{ + struct gov_data *gd; + + gd = container_of(irq_work, struct gov_data, irq_work); + if (!gd) + return; + + wake_up_process(gd->task); +} + +static void update_fdomain_capacity_request(int cpu) +{ + unsigned int freq_new, index_new, cpu_tmp; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned long capacity = 0; + + /* + * Avoid grabbing the policy if possible. A test is still + * required after locking the CPU's policy to avoid racing + * with the governor changing. + */ + if (!per_cpu(enabled, cpu)) + return; + + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) + return; + + if (policy->governor != &cpufreq_gov_sched || + !policy->governor_data) + goto out; + + gd = policy->governor_data; + + /* find max capacity requested by cpus in this policy */ + for_each_cpu(cpu_tmp, policy->cpus) { + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp); + capacity = max(capacity, scr->total); + } + + /* Convert the new maximum capacity request into a cpu frequency */ + freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + if (cpufreq_frequency_table_target(policy, policy->freq_table, + freq_new, CPUFREQ_RELATION_L, + &index_new)) + goto out; + freq_new = policy->freq_table[index_new].frequency; + + if (freq_new == gd->requested_freq) + goto out; + + gd->requested_freq = freq_new; + + /* + * Throttling is not yet supported on platforms with fast cpufreq + * drivers. + */ + if (cpufreq_driver_slow) + irq_work_queue_on(&gd->irq_work, cpu); + else + cpufreq_sched_try_driver_target(policy, freq_new); + +out: + cpufreq_cpu_put(policy); +} + +void update_cpu_capacity_request(int cpu, bool request) +{ + unsigned long new_capacity; + struct sched_capacity_reqs *scr; + + /* The rq lock serializes access to the CPU's sched_capacity_reqs. 
*/ + lockdep_assert_held(&cpu_rq(cpu)->lock); + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + + new_capacity = scr->cfs + scr->rt; + new_capacity = new_capacity * capacity_margin + / SCHED_CAPACITY_SCALE; + new_capacity += scr->dl; + + if (new_capacity == scr->total) + return; + + scr->total = new_capacity; + if (request) + update_fdomain_capacity_request(cpu); +} + +static inline void set_sched_freq(void) +{ + static_key_slow_inc(&__sched_freq); +} + +static inline void clear_sched_freq(void) +{ + static_key_slow_dec(&__sched_freq); +} + +static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + int cpu; + + for_each_cpu(cpu, policy->cpus) + memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, + sizeof(struct sched_capacity_reqs)); + + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) + return -ENOMEM; + + gd->throttle_nsec = policy->cpuinfo.transition_latency ? + policy->cpuinfo.transition_latency : + THROTTLE_NSEC; + pr_debug("%s: throttle threshold = %u [ns]\n", + __func__, gd->throttle_nsec); + + if (cpufreq_driver_is_slow()) { + cpufreq_driver_slow = true; + gd->task = kthread_create(cpufreq_sched_thread, policy, + "kschedfreq:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR_OR_NULL(gd->task)) { + pr_err("%s: failed to create kschedfreq thread\n", + __func__); + goto err; + } + get_task_struct(gd->task); + kthread_bind_mask(gd->task, policy->related_cpus); + wake_up_process(gd->task); + init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); + } + + policy->governor_data = gd; + set_sched_freq(); + + return 0; + +err: + kfree(gd); + return -ENOMEM; +} + +static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) +{ + struct gov_data *gd = policy->governor_data; + + clear_sched_freq(); + if (cpufreq_driver_slow) { + kthread_stop(gd->task); + put_task_struct(gd->task); + } + + policy->governor_data = NULL; + + kfree(gd); + return 0; +} + +static int cpufreq_sched_start(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 1; + + return 0; +} + +static int cpufreq_sched_stop(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 0; + + return 0; +} + +static int cpufreq_sched_setup(struct cpufreq_policy *policy, + unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_POLICY_INIT: + return cpufreq_sched_policy_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return cpufreq_sched_policy_exit(policy); + case CPUFREQ_GOV_START: + return cpufreq_sched_start(policy); + case CPUFREQ_GOV_STOP: + return cpufreq_sched_stop(policy); + case CPUFREQ_GOV_LIMITS: + break; + } + return 0; +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static +#endif +struct cpufreq_governor cpufreq_gov_sched = { + .name = "sched", + .governor = cpufreq_sched_setup, + .owner = THIS_MODULE, +}; + +static int __init cpufreq_sched_init(void) +{ + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) + per_cpu(enabled, cpu) = 0; + return cpufreq_register_governor(&cpufreq_gov_sched); +} + +/* Try to make this the default governor */ +fs_initcall(cpufreq_sched_init); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 019060c69247d0..08f49023f8ddf0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4890,7 +4890,7 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -static unsigned int capacity_margin = 1280; /* ~20% margin */ +unsigned int capacity_margin = 1280; /* ~20% margin */ static inline bool 
__task_fits(struct task_struct *p, int cpu, int util) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c5d1981d17da71..467cb04675ef01 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1439,6 +1439,57 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +extern unsigned int capacity_margin; +extern struct static_key __sched_freq; + +static inline bool sched_freq(void) +{ + return static_key_false(&__sched_freq); +} + +DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +void update_cpu_capacity_request(int cpu, bool request); + +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity; + update_cpu_capacity_request(cpu, request); + } +} +#else +static inline bool sched_freq(void) { return false; } +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); From ffc079318bb11251054b6f6a8bbf17ad64b8bbdb Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Aug 2015 19:47:12 +0100 Subject: [PATCH 192/420] sched/fair: add triggers for OPP change requests Each time a task is {en,de}queued we might need to adapt the current frequency to the new usage. Add triggers on {en,de}queue_task_fair() for this purpose. Only trigger a freq request if we are effectively waking up or going to sleep. Filter out load balancing related calls to reduce the number of triggers. [smuckle@linaro.org: resolve merge conflicts, define task_new, use renamed static key sched_freq] Change-Id: I96b33d33f6d2bf78c536612c4acd529f7be0ccf8 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle (am from https://patchwork.kernel.org/patch/7805021/) Signed-off-by: Punit Agrawal --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08f49023f8ddf0..a98b0e4012e697 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3975,6 +3975,21 @@ static inline void hrtick_update(struct rq *rq) } #endif +static unsigned long capacity_orig_of(int cpu); +static int cpu_util(int cpu); + +static void update_capacity_of(int cpu) +{ + unsigned long req_cap; + + if (!sched_freq()) + return; + + /* Convert scale-invariant capacity to cpu. 
*/ + req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + set_cfs_cpu_capacity(cpu, true, req_cap); +} + static bool cpu_overutilized(int cpu); /* @@ -4024,6 +4039,20 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are waking up; this is + * because we get here also during load balancing, but + * in these cases it seems wise to trigger as single + * request after load balancing is done. + * + * XXX: how about fork()? Do we need a special + * flag/something to tell if we are here after a + * fork() (wakeup_task_new)? + */ + if (!task_new) + update_capacity_of(cpu_of(rq)); } hrtick_update(rq); } @@ -4082,9 +4111,24 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + /* + * We want to potentially trigger a freq switch + * request only for tasks that are going to sleep; + * this is because we get here also during load + * balancing, but in these cases it seems wise to + * trigger as single request after load balancing is + * done. + */ + if (task_sleep) { + if (rq->cfs.nr_running) + update_capacity_of(cpu_of(rq)); + else if (sched_freq()) + set_cfs_cpu_capacity(cpu_of(rq), false, 0); + } + } hrtick_update(rq); } From 863369e91676ed331c8d6018227cf0fd70761775 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 26 Jun 2015 12:14:23 +0100 Subject: [PATCH 193/420] sched/{core,fair}: trigger OPP change request on fork() Patch "sched/fair: add triggers for OPP change requests" introduced OPP change triggers for enqueue_task_fair(), but the trigger was operating only for wakeups. Fact is that it makes sense to consider wakeup_new also (i.e., fork()), as we don't know anything about a newly created task and thus we most certainly want to jump to max OPP to not harm performance too much. However, it is not currently possible (or at least it wasn't evident to me how to do so :/) to tell new wakeups from other (non wakeup) operations. This patch introduces an additional flag in sched.h that is only set at fork() time and it is then consumed in enqueue_task_fair() for our purpose. 
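Condensed, the mechanism added below is roughly the following (a simplified
sketch; see the actual hunks in the diff for the real code):

    /* sched.h: new enqueue flag, only set on the fork() path */
    #define ENQUEUE_WAKEUP_NEW	0x20

    /* core.c: wake_up_new_task() tags the very first enqueue */
    activate_task(rq, p, ENQUEUE_WAKEUP_NEW);

    /* fair.c: enqueue_task_fair() can now tell apart fork() wakeups,
     * regular wakeups and load-balance moves */
    int task_new    = flags & ENQUEUE_WAKEUP_NEW;
    int task_wakeup = flags & ENQUEUE_WAKEUP;
    ...
    if (task_new || task_wakeup)
        update_capacity_of(cpu_of(rq));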
Change-Id: I0e485e7a2e6386f276eefa7920b2fc34f7877c22 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle (am from https://patchwork.kernel.org/patch/7805001/) Signed-off-by: Punit Agrawal --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 9 +++------ kernel/sched/sched.h | 4 +++- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 38b9ab3a1352b8..b399df2def4915 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2332,7 +2332,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a98b0e4012e697..5c695563bbb2e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4002,7 +4002,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int task_new = !(flags & ENQUEUE_WAKEUP); + int task_new = flags & ENQUEUE_WAKEUP_NEW; + int task_wakeup = flags & ENQUEUE_WAKEUP; for_each_sched_entity(se) { if (se->on_rq) @@ -4046,12 +4047,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * because we get here also during load balancing, but * in these cases it seems wise to trigger as single * request after load balancing is done. - * - * XXX: how about fork()? Do we need a special - * flag/something to tell if we are here after a - * fork() (wakeup_task_new)? */ - if (!task_new) + if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } hrtick_update(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 467cb04675ef01..ce91ef821f71ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1128,7 +1128,9 @@ static const u32 prio_to_wmult[40] = { #else #define ENQUEUE_WAKING 0 #endif -#define ENQUEUE_REPLENISH 8 +#define ENQUEUE_REPLENISH 0x08 +#define ENQUEUE_RESTORE 0x10 +#define ENQUEUE_WAKEUP_NEW 0x20 #define DEQUEUE_SLEEP 1 From 7861cd8a17b63ce5299787f1056efa50e35c4b17 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 25 Jun 2015 14:37:27 +0100 Subject: [PATCH 194/420] sched/fair: cpufreq_sched triggers for load balancing As we don't trigger freq changes from {en,de}queue_task_fair() during load balancing, we need to do explicitly so on load balancing paths. [smuckle@linaro.org: move update_capacity_of calls so rq lock is held] Change-Id: Ief52383f752ee4b7394beaba0130690c0df95827 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle (am from https://patchwork.kernel.org/patch/7804991/) Signed-off-by: Punit Agrawal --- kernel/sched/fair.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c695563bbb2e9..0bf824c9d8bf9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6228,6 +6228,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); + /* + * We want to potentially raise target_cpu's OPP. + */ + update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -6249,6 +6253,11 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } + /* + * We want to potentially raise env.dst_cpu's OPP. 
+ */ + update_capacity_of(env->dst_cpu); + raw_spin_unlock(&env->dst_rq->lock); } @@ -7515,6 +7524,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + if (cur_ld_moved) + update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. Every @@ -7888,8 +7902,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + update_capacity_of(env.src_cpu); + } else schedstat_inc(sd, alb_failed); } From e63b9c27061371965d09897cfaabd3d681cdfc72 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 25 Jun 2015 14:12:33 +0100 Subject: [PATCH 195/420] sched/fair: jump to max OPP when crossing UP threshold Since the true utilization of a long running task is not detectable while it is running and might be bigger than the current cpu capacity, create the maximum cpu capacity head room by requesting the maximum cpu capacity once the cpu usage plus the capacity margin exceeds the current capacity. This is also done to try to harm the performance of a task the least. Original fair-class only version authored by Juri Lelli . Change-Id: I42c4eda2a98e8e73dbd3e1c41d1b81a6e3e37f2e cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle (am from https://patchwork.kernel.org/patch/7804971/) Signed-off-by: Punit Agrawal --- kernel/sched/core.c | 41 ++++++++++++++++++++++++++ kernel/sched/fair.c | 66 ------------------------------------------ kernel/sched/sched.h | 68 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 66 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b399df2def4915..a531cf93c8ae9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2753,6 +2753,45 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +static unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = cfs_cap + scr->rt; + + total = total * capacity_margin; + total /= SCHED_CAPACITY_SCALE; + total += scr->dl; + return total; +} + +static void sched_freq_tick(int cpu) +{ + struct sched_capacity_reqs *scr; + unsigned long capacity_orig, capacity_curr; + + if (!sched_freq()) + return; + + capacity_orig = capacity_orig_of(cpu); + capacity_curr = capacity_curr_of(cpu); + if (capacity_curr == capacity_orig) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to max OPP as soon as the margin of free capacity is + * impacted (specified by capacity_margin). + */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) + set_cfs_cpu_capacity(cpu, true, capacity_max); +} +#else +static inline void sched_freq_tick(int cpu) { } +#endif + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
@@ -2779,6 +2818,8 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); + + sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0bf824c9d8bf9e..b4aad64ea33180 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3975,9 +3975,6 @@ static inline void hrtick_update(struct rq *rq) } #endif -static unsigned long capacity_orig_of(int cpu); -static int cpu_util(int cpu); - static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4347,15 +4344,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} static unsigned long cpu_avg_load_per_task(int cpu) { @@ -4524,60 +4512,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static unsigned long __cpu_util(int cpu, int delta) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - delta += util; - if (delta < 0) - return 0; - - return (delta >= capacity) ? 
capacity : delta; -} - -static unsigned long cpu_util(int cpu) -{ - return __cpu_util(cpu, 0); -} - static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce91ef821f71ce..92e71c396ceaf4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1441,7 +1441,75 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static inline unsigned long __cpu_util(int cpu, int delta) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static inline unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); +} + +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static inline unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +#endif + #ifdef CONFIG_CPU_FREQ_GOV_SCHED +#define capacity_max SCHED_CAPACITY_SCALE extern unsigned int capacity_margin; extern struct static_key __sched_freq; From 215d9ba60afa08fea593138abdf5cac3a7747095 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 20 Oct 2015 10:46:26 +0200 Subject: [PATCH 196/420] sched: remove call of sched_avg_update from sched_rt_avg_update rt_avg is only used to scale the available CPU's capacity for CFS tasks. As the update of this scaling is done during periodic load balance, we only have to ensure that sched_avg_update has been called before any periodic load balancing. 
This requirement is already fulfilled by __update_cpu_load so the call in sched_rt_avg_update, which is part of the hotpath, is useless. Change-Id: I65a8c9be85164a7c1b39be5380e8807c49db850b Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle (am from https://patchwork.kernel.org/patch/7804851/) Signed-off-by: Punit Agrawal --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 92e71c396ceaf4..d59427ccdee1c3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1563,7 +1563,6 @@ static inline void set_dl_cpu_capacity(int cpu, bool request, static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); - sched_avg_update(rq); } #else static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } From 3cb8a18d9a32c5ed835910cbd6ff82e8dd9f6a17 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 3 Nov 2015 10:39:01 +0100 Subject: [PATCH 197/420] sched: deadline: use deadline bandwidth in scale_rt_capacity Instead of monitoring the exec time of deadline tasks to evaluate the CPU capacity consumed by deadline scheduler class, we can directly calculate it thanks to the sum of utilization of deadline tasks on the CPU. We can remove deadline tasks from rt_avg metric and directly use the average bandwidth of deadline scheduler in scale_rt_capacity. Based in part on a similar patch from Luca Abeni . Change-Id: Ida81c088f21496dec049da9ee39dd17319d357d2 Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle (am from https://patchwork.kernel.org/patch/7804861/) Signed-off-by: Punit Agrawal --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 8 ++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 19a26186e074c7..afb4df8f8655f6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw += se_bw; +} + +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw -= se_bw; + if (dl_rq->avg_bw < 0) { + WARN_ON(1); + dl_rq->avg_bw = 0; + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -477,6 +495,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + if (dl_se->dl_new) + add_average_bw(dl_se, dl_rq); + /* * The arrival of a new instance needs special treatment, i.e., * the actual scheduling parameters have to be "renewed". @@ -740,8 +761,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 
0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { dl_se->dl_throttled = 1; @@ -1218,6 +1237,8 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + struct dl_rq *dl_rq = dl_rq_of_se(&p->dl); + struct rq *rq = rq_of_dl_rq(dl_rq); /* * Since we are TASK_DEAD we won't slip out of the domain! @@ -1226,6 +1247,8 @@ static void task_dead_dl(struct task_struct *p) /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); + + clear_average_bw(&p->dl, &rq->dl); } static void set_curr_task_dl(struct rq *rq) @@ -1498,7 +1521,9 @@ static int push_dl_task(struct rq *rq) } deactivate_task(rq, next_task, 0); + clear_average_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_average_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); resched_curr(later_rq); @@ -1584,7 +1609,9 @@ static int pull_dl_task(struct rq *this_rq) ret = 1; deactivate_task(src_rq, p, 0); + clear_average_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); + add_average_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -1703,6 +1730,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (!start_dl_timer(p)) __dl_clear_params(p); + clear_average_bw(&p->dl, &rq->dl); + /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b4aad64ea33180..42f38e26ac9922 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6394,6 +6394,14 @@ static unsigned long scale_rt_capacity(int cpu) used = div_u64(avg, total); + /* + * deadline bandwidth is defined at system level so we must + * weight this bandwidth with the max capacity of the system. + * As a reminder, avg_bw is 20bits width and + * scale_cpu_capacity is 10 bits width + */ + used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu)); + if (likely(used < SCHED_CAPACITY_SCALE)) return SCHED_CAPACITY_SCALE - used; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d59427ccdee1c3..413d2c04be57a6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -483,6 +483,8 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* This is the "average utilization" for this runqueue */ + s64 avg_bw; }; #ifdef CONFIG_SMP From 69dd304c1bcb5c8314324c72ed687ba9d2a2ad99 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 26 Oct 2015 18:14:50 +0100 Subject: [PATCH 198/420] sched: rt scheduler sets capacity requirement RT tasks don't provide any running constraints like deadline ones except their running priority. The only current usable input to estimate the capacity needed by RT tasks is the rt_avg metric. We use it to estimate the CPU capacity needed for the RT scheduler class. In order to monitor the evolution for RT task load, we must peridiocally check it during the tick. Then, we use the estimated capacity of the last activity to estimate the next one which can not be that accurate but is a good starting point without any impact on the wake up path of RT tasks. 
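As a rough worked example of the estimate made below (numbers are purely
illustrative): rt_avg tracks recent RT execution time scaled by the
frequency-invariant capacity, so if the RT class consumed about a quarter of
the averaging window at full capacity we end up with

    used = div_u64(avg, total);         /* ~0.25 * 1024 = 256 */
    set_rt_cpu_capacity(rq->cpu, 1, used);

i.e. a request for roughly 25% of the CPU capacity, refreshed from the tick
and from pick_next_task_rt() once no RT task is queued anymore.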
Change-Id: I7acd9741adbe94a8a9763f77fb1d62de5d5eff8a Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle (am from https://patchwork.kernel.org/patch/7804941/) Signed-off-by: Punit Agrawal --- kernel/sched/rt.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6fba352232f92d..30d757853a535e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1400,6 +1400,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } +#ifdef CONFIG_SMP +static void sched_rt_update_capacity_req(struct rq *rq) +{ + u64 total, used, age_stamp, avg; + s64 delta; + + if (!sched_freq()) + return; + + sched_avg_update(rq); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); + delta = rq_clock(rq) - age_stamp; + + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; + + used = div_u64(avg, total); + if (unlikely(used > SCHED_CAPACITY_SCALE)) + used = SCHED_CAPACITY_SCALE; + + set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used)); +} +#else +static inline void sched_rt_update_capacity_req(struct rq *rq) +{ } + +#endif + static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { @@ -1460,8 +1495,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_queued) + if (!rt_rq->rt_queued) { + /* + * The next task to be picked on this rq will have a lower + * priority than rt tasks so we can spend some time to update + * the capacity used by rt tasks based on the last activity. + * This value will be the used as an estimation of the next + * activity. + */ + sched_rt_update_capacity_req(rq); return NULL; + } put_prev_task(rq, prev); @@ -2051,6 +2095,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) update_curr_rt(rq); + if (rq->rt.rt_nr_running) + sched_rt_update_capacity_req(rq); + watchdog(rq, p); /* From 695888e0286fe775738e71606ccabafac43c22f2 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 11 Dec 2015 11:55:51 +0000 Subject: [PATCH 199/420] FIXUP: sched: scheduler-driven cpu frequency selection Change-Id: I61162ccf7a5319c7c24f5a69e6a366305592f3a5 Signed-off-by: Juri Lelli --- kernel/sched/cpufreq_sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 58bca8d2ca653e..af27a9765f326b 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -115,9 +115,9 @@ static int cpufreq_sched_thread(void *data) } do { - set_current_state(TASK_INTERRUPTIBLE); new_request = gd->requested_freq; if (new_request == last_request) { + set_current_state(TASK_INTERRUPTIBLE); schedule(); } else { /* From ade9f9a60c13177e1af5ec4ca634313e23c8a35e Mon Sep 17 00:00:00 2001 From: Ricky Liang Date: Tue, 2 Feb 2016 01:12:06 +0800 Subject: [PATCH 200/420] FIXUP: sched: scheduler-driven cpu frequency selection Two fixups that have been reported on LKML. The next version of scheduler-driver cpu frequency selection patch set should include these fixes and we can drop this patch then. 
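Both fixes are ordering fixes. The wait loop now follows the usual kthread
sleep idiom (a simplified sketch of the code after this fixup):

    do {
        new_request = gd->requested_freq;
        if (new_request == last_request) {
            /* publish the sleeping state before the final checks so a
             * concurrent wake_up_process() cannot be missed */
            set_current_state(TASK_INTERRUPTIBLE);
            /* let kthread_stop() terminate the thread cleanly */
            if (kthread_should_stop())
                break;
            schedule();
        } else {
            ...
        }
    } while (...);

The second hunk moves the policy->governor_data assignment before the worker
thread is created and woken (and clears it again on the error path), which
looks intended to keep the freshly started thread from ever observing a NULL
governor_data.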
Signed-off-by: Ricky Liang Change-Id: Ia2f8b5c0dd5dac06580256eeb4b259929688af68 --- kernel/sched/cpufreq_sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index af27a9765f326b..9f21c308ef7200 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -118,6 +118,8 @@ static int cpufreq_sched_thread(void *data) new_request = gd->requested_freq; if (new_request == last_request) { set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; schedule(); } else { /* @@ -256,6 +258,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) pr_debug("%s: throttle threshold = %u [ns]\n", __func__, gd->throttle_nsec); + policy->governor_data = gd; if (cpufreq_driver_is_slow()) { cpufreq_driver_slow = true; gd->task = kthread_create(cpufreq_sched_thread, policy, @@ -272,12 +275,12 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); } - policy->governor_data = gd; set_sched_freq(); return 0; err: + policy->governor_data = NULL; kfree(gd); return -ENOMEM; } From 8e1c4cbe586e40422e50b9a607cfa01662686889 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 11 Dec 2015 11:58:05 +0000 Subject: [PATCH 201/420] FIXUP: sched/fair: jump to max OPP when crossing UP threshold Change-Id: I1e049d8fce980e265852c74977032f63704bfdf9 Signed-off-by: Juri Lelli --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a531cf93c8ae9f..1d307a0e898df0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2809,6 +2809,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); calc_global_load_tick(rq); + sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -2818,8 +2819,6 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); - - sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL From 47c06320a64b6090faa5676b2a6decfd56c4aef2 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 25 Nov 2015 15:59:25 -0800 Subject: [PATCH 202/420] sched/cpufreq_sched: add trace events Trace events will aid in debugging, profiling and tuning. Change-Id: I714e1875a6509e6da4308fa2e76a55ad107b35a5 Signed-off-by: Steve Muckle --- include/trace/events/cpufreq_sched.h | 87 ++++++++++++++++++++++++++++ kernel/sched/cpufreq_sched.c | 9 +++ 2 files changed, 96 insertions(+) create mode 100644 include/trace/events/cpufreq_sched.h diff --git a/include/trace/events/cpufreq_sched.h b/include/trace/events/cpufreq_sched.h new file mode 100644 index 00000000000000..a46cd088e96990 --- /dev/null +++ b/include/trace/events/cpufreq_sched.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2015 Steve Muckle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpufreq_sched + +#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUFREQ_SCHED_H + +#include +#include + +TRACE_EVENT(cpufreq_sched_throttled, + TP_PROTO(unsigned int rem), + TP_ARGS(rem), + TP_STRUCT__entry( + __field( unsigned int, rem) + ), + TP_fast_assign( + __entry->rem = rem; + ), + TP_printk("throttled - %d usec remaining", __entry->rem) +); + +TRACE_EVENT(cpufreq_sched_request_opp, + TP_PROTO(int cpu, + unsigned long capacity, + unsigned int freq_new, + unsigned int requested_freq), + TP_ARGS(cpu, capacity, freq_new, requested_freq), + TP_STRUCT__entry( + __field( int, cpu) + __field( unsigned long, capacity) + __field( unsigned int, freq_new) + __field( unsigned int, requested_freq) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->capacity = capacity; + __entry->freq_new = freq_new; + __entry->requested_freq = requested_freq; + ), + TP_printk("cpu %d cap change, cluster cap request %ld => OPP %d " + "(cur %d)", + __entry->cpu, __entry->capacity, __entry->freq_new, + __entry->requested_freq) +); + +TRACE_EVENT(cpufreq_sched_update_capacity, + TP_PROTO(int cpu, + bool request, + struct sched_capacity_reqs *scr, + unsigned long new_capacity), + TP_ARGS(cpu, request, scr, new_capacity), + TP_STRUCT__entry( + __field( int, cpu) + __field( bool, request) + __field( unsigned long, cfs) + __field( unsigned long, rt) + __field( unsigned long, dl) + __field( unsigned long, total) + __field( unsigned long, new_total) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->request = request; + __entry->cfs = scr->cfs; + __entry->rt = scr->rt; + __entry->dl = scr->dl; + __entry->total = scr->total; + __entry->new_total = new_capacity; + ), + TP_printk("cpu=%d set_cap=%d cfs=%ld rt=%ld dl=%ld old_tot=%ld " + "new_tot=%ld", + __entry->cpu, __entry->request, __entry->cfs, __entry->rt, + __entry->dl, __entry->total, __entry->new_total) +); + +#endif /* _TRACE_CPUFREQ_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 9f21c308ef7200..c72537c0c50019 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -14,6 +14,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "sched.h" #define THROTTLE_NSEC 50000000 /* 50ms default */ @@ -78,6 +81,7 @@ static bool finish_last_request(struct gov_data *gd) int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); usec_left /= NSEC_PER_USEC; + trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); if (ktime_after(now, gd->throttle)) @@ -188,6 +192,9 @@ static void update_fdomain_capacity_request(int cpu) goto out; freq_new = policy->freq_table[index_new].frequency; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, + gd->requested_freq); + if (freq_new == gd->requested_freq) goto out; @@ -224,6 +231,8 @@ void update_cpu_capacity_request(int cpu, bool request) if (new_capacity == scr->total) return; + trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity); + scr->total = new_capacity; if (request) update_fdomain_capacity_request(cpu); From 3b0bca53c3fa41db088a3e8d2f97ece1ab76a6b2 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 30 Jun 2015 12:03:26 +0100 Subject: [PATCH 203/420] sched/tune: add detailed documentation The topic of a single simple power-performance tunable, that is wholly scheduler centric, and has well defined and predictable 
properties has come up on several occasions in the past. With techniques such as a scheduler driven DVFS, we now have a good framework for implementing such a tunable. This patch provides a detailed description of the motivations and design decisions behind the implementation of the SchedTune. Change-Id: Ia3c269ecdaab80f1ea8f79a34b795a5c7504a881 cc: Jonathan Corbet cc: linux-doc@vger.kernel.org Signed-off-by: Patrick Bellasi --- Documentation/scheduler/sched-tune.txt | 367 +++++++++++++++++++++++++ 1 file changed, 367 insertions(+) create mode 100644 Documentation/scheduler/sched-tune.txt diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt new file mode 100644 index 00000000000000..cb795e643eba1f --- /dev/null +++ b/Documentation/scheduler/sched-tune.txt @@ -0,0 +1,367 @@ + Central, scheduler-driven, power-performance control + (EXPERIMENTAL) + +Abstract +======== + +The topic of a single simple power-performance tunable, that is wholly +scheduler centric, and has well defined and predictable properties has come up +on several occasions in the past [1,2]. With techniques such as a scheduler +driven DVFS [3], we now have a good framework for implementing such a tunable. +This document describes the overall ideas behind its design and implementation. + + +Table of Contents +================= + +1. Motivation +2. Introduction +3. Signal Boosting Strategy +4. OPP selection using boosted CPU utilization +5. Per task group boosting +6. Question and Answers + - What about "auto" mode? + - What about boosting on a congested system? + - How CPUs are boosted when we have tasks with multiple boost values? +7. References + + +1. Motivation +============= + +Sched-DVFS [3] is a new event-driven cpufreq governor which allows the +scheduler to select the optimal DVFS operating point (OPP) for running a task +allocated to a CPU. The introduction of sched-DVFS enables running workloads at +the most energy efficient OPPs. + +However, sometimes it may be desired to intentionally boost the performance of +a workload even if that could imply a reasonable increase in energy +consumption. For example, in order to reduce the response time of a task, we +may want to run the task at a higher OPP than the one that is actually required +by it's CPU bandwidth demand. + +This last requirement is especially important if we consider that one of the +main goals of the sched-DVFS component is to replace all currently available +CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling +driven governors we currently have, it is already more responsive at selecting +the optimal OPP to run tasks allocated to a CPU. However, just tracking the +actual task load demand may not be enough from a performance standpoint. For +example, it is not possible to get behaviors similar to those provided by the +"performance" and "interactive" CPUFreq governors. + +This document describes an implementation of a tunable, stacked on top of the +sched-DVFS which extends its functionality to support task performance +boosting. + +By "performance boosting" we mean the reduction of the time required to +complete a task activation, i.e. the time elapsed from a task wakeup to its +next deactivation (e.g. because it goes back to sleep or it terminates). For +example, if we consider a simple periodic task which executes the same workload +for 5[s] every 20[s] while running at a certain OPP, a boosted execution of +that task must complete each of its activations in less than 5[s]. 
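+
+As an idealized illustration (not a measured figure): if boosting moves that
+task to an OPP with twice the capacity and the task is fully CPU-bound, each
+activation would complete in roughly 2.5[s] instead of 5[s]; memory-bound
+workloads see smaller gains, but the activation time still shrinks.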
+ +A previous attempt [5] to introduce such a boosting feature has not been +successful mainly because of the complexity of the proposed solution. The +approach described in this document exposes a single simple interface to +user-space. This single tunable knob allows the tuning of system wide +scheduler behaviours ranging from energy efficiency at one end through to +incremental performance boosting at the other end. This first tunable affects +all tasks. However, a more advanced extension of the concept is also provided +which uses CGroups to boost the performance of only selected tasks while using +the energy efficient default for all others. + +The rest of this document introduces in more details the proposed solution +which has been named SchedTune. + + +2. Introduction +=============== + +SchedTune exposes a simple user-space interface with a single power-performance +tunable: + + /proc/sys/kernel/sched_cfs_boost + +This permits expressing a boost value as an integer in the range [0..100]. + +A value of 0 (default) configures the CFS scheduler for maximum energy +efficiency. This means that sched-DVFS runs the tasks at the minimum OPP +required to satisfy their workload demand. +A value of 100 configures scheduler for maximum performance, which translates +to the selection of the maximum OPP on that CPU. + +The range between 0 and 100 can be set to satisfy other scenarios suitably. For +example to satisfy interactive response or depending on other system events +(battery level etc). + +A CGroup based extension is also provided, which permits further user-space +defined task classification to tune the scheduler for different goals depending +on the specific nature of the task, e.g. background vs interactive vs +low-priority. + +The overall design of the SchedTune module is built on top of "Per-Entity Load +Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating +Performance Point (OPP) selection. +Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune +the operating frequency of that CPU to better match the workload demand. The +selection of the actual OPP being activated is influenced by the global boost +value, or the boost value for the task CGroup when in use. + +This simple biasing approach leverages existing frameworks, which means minimal +modifications to the scheduler, and yet it allows to achieve a range of +different behaviours all from a single simple tunable knob. +The only new concept introduced is that of signal boosting. + + +3. Signal Boosting Strategy +=========================== + +The whole PELT machinery works based on the value of a few load tracking signals +which basically track the CPU bandwidth requirements for tasks and the capacity +of CPUs. The basic idea behind the SchedTune knob is to artificially inflate +some of these load tracking signals to make a task or RQ appears more demanding +that it actually is. + +Which signals have to be inflated depends on the specific "consumer". However, +independently from the specific (signal, consumer) pair, it is important to +define a simple and possibly consistent strategy for the concept of boosting a +signal. 
+ +A boosting strategy defines how the "abstract" user-space defined +sched_cfs_boost value is translated into an internal "margin" value to be added +to a signal to get its inflated value: + + margin := boosting_strategy(sched_cfs_boost, signal) + boosted_signal := signal + margin + +Different boosting strategies were identified and analyzed before selecting the +one found to be most effective. + +Signal Proportional Compensation (SPC) +-------------------------------------- + +In this boosting strategy the sched_cfs_boost value is used to compute a +margin which is proportional to the complement of the original signal. +When a signal has a maximum possible value, its complement is defined as +the delta from the actual value and its possible maximum. + +Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as +the maximum possible value, the margin becomes: + + margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal) + +Using this boosting strategy: +- a 100% sched_cfs_boost means that the signal is scaled to the maximum value +- each value in the range of sched_cfs_boost effectively inflates the signal in + question by a quantity which is proportional to the maximum value. + +For example, by applying the SPC boosting strategy to the selection of the OPP +to run a task it is possible to achieve these behaviors: + +- 0% boosting: run the task at the minimum OPP required by its workload +- 100% boosting: run the task at the maximum OPP available for the CPU +- 50% boosting: run at the half-way OPP between minimum and maximum + +Which means that, at 50% boosting, a task will be scheduled to run at half of +the maximum theoretically achievable performance on the specific target +platform. + +A graphical representation of an SPC boosted signal is represented in the +following figure where: + a) "-" represents the original signal + b) "b" represents a 50% boosted signal + c) "p" represents a 100% boosted signal + + + ^ + | SCHED_LOAD_SCALE + +-----------------------------------------------------------------+ + |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp + | + | boosted_signal + | bbbbbbbbbbbbbbbbbbbbbbbb + | + | original signal + | bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+ + | | + |bbbbbbbbbbbbbbbbbb | + | | + | | + | | + | +-----------------------+ + | | + | | + | | + |------------------+ + | + | + +-----------------------------------------------------------------------> + +The plot above shows a ramped load signal (titled 'original_signal') and it's +boosted equivalent. For each step of the original signal the boosted signal +corresponding to a 50% boost is midway from the original signal and the upper +bound. Boosting by 100% generates a boosted signal which is always saturated to +the upper bound. + + +4. OPP selection using boosted CPU utilization +============================================== + +It is worth calling out that the implementation does not introduce any new load +signals. Instead, it provides an API to tune existing signals. This tuning is +done on demand and only in scheduler code paths where it is sensible to do so. +The new API calls are defined to return either the default signal or a boosted +one, depending on the value of sched_cfs_boost. This is a clean an non invasive +modification of the existing existing code paths. + +The signal representing a CPU's utilization is boosted according to the +previously described SPC boosting strategy. 
To sched-DVFS, this allows a CPU +(ie CFS run-queue) to appear more used then it actually is. + +Thus, with the sched_cfs_boost enabled we have the following main functions to +get the current utilization of a CPU: + + cpu_util() + boosted_cpu_util() + +The new boosted_cpu_util() is similar to the first but returns a boosted +utilization signal which is a function of the sched_cfs_boost value. + +This function is used in the CFS scheduler code paths where sched-DVFS needs to +decide the OPP to run a CPU at. +For example, this allows selecting the highest OPP for a CPU which has +the boost value set to 100%. + + +5. Per task group boosting +========================== + +The availability of a single knob which is used to boost all tasks in the +system is certainly a simple solution but it quite likely doesn't fit many +utilization scenarios, especially in the mobile device space. + +For example, on battery powered devices there usually are many background +services which are long running and need energy efficient scheduling. On the +other hand, some applications are more performance sensitive and require an +interactive response and/or maximum performance, regardless of the energy cost. +To better service such scenarios, the SchedTune implementation has an extension +that provides a more fine grained boosting interface. + +A new CGroup controller, namely "schedtune", could be enabled which allows to +defined and configure task groups with different boosting values. +Tasks that require special performance can be put into separate CGroups. +The value of the boost associated with the tasks in this group can be specified +using a single knob exposed by the CGroup controller: + + schedtune.boost + +This knob allows the definition of a boost value that is to be used for +SPC boosting of all tasks attached to this group. + +The current schedtune controller implementation is really simple and has these +main characteristics: + + 1) It is only possible to create 1 level depth hierarchies + + The root control groups define the system-wide boost value to be applied + by default to all tasks. Its direct subgroups are named "boost groups" and + they define the boost value for specific set of tasks. + Further nested subgroups are not allowed since they do not have a sensible + meaning from a user-space standpoint. + + 2) It is possible to define only a limited number of "boost groups" + + This number is defined at compile time and by default configured to 16. + This is a design decision motivated by two main reasons: + a) In a real system we do not expect utilization scenarios with more then few + boost groups. For example, a reasonable collection of groups could be + just "background", "interactive" and "performance". + b) It simplifies the implementation considerably, especially for the code + which has to compute the per CPU boosting once there are multiple + RUNNABLE tasks with different boost values. + +Such a simple design should allow servicing the main utilization scenarios identified +so far. It provides a simple interface which can be used to manage the +power-performance of all tasks or only selected tasks. +Moreover, this interface can be easily integrated by user-space run-times (e.g. +Android, ChromeOS) to implement a QoS solution for task boosting based on tasks +classification, which has been a long standing requirement. + +Setup and usage +--------------- + +0. Use a kernel with CGROUP_SCHEDTUNE support enabled + +1. 
Check that the "schedtune" CGroup controller is available: + + root@linaro-nano:~# cat /proc/cgroups + #subsys_name hierarchy num_cgroups enabled + cpuset 0 1 1 + cpu 0 1 1 + schedtune 0 1 1 + +2. Mount a tmpfs to create the CGroups mount point (Optional) + + root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup + +3. Mount the "schedtune" controller + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune + root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune + +4. Setup the system-wide boost value (Optional) + + If not configured the root control group has a 0% boost value, which + basically disables boosting for all tasks in the system thus running in + an energy-efficient mode. + + root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost + +5. Create task groups and configure their specific boost value (Optional) + + For example here we create a "performance" boost group configure to boost + all its tasks to 100% + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance + root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost + +6. Move tasks into the boost group + + For example, the following moves the tasks with PID $TASKPID (and all its + threads) into the "performance" boost group. + + root@linaro-nano:~# echo "TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs + +This simple configuration allows only the threads of the $TASKPID task to run, +when needed, at the highest OPP in the most capable CPU of the system. + + +6. Question and Answers +======================= + +What about "auto" mode? +----------------------- + +The 'auto' mode as described in [5] can be implemented by interfacing SchedTune +with some suitable user-space element. This element could use the exposed +system-wide or cgroup based interface. + +How are multiple groups of tasks with different boost values managed? +--------------------------------------------------------------------- + +The current SchedTune implementation keeps track of the boosted RUNNABLE tasks +on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU utilization +is boosted with a value which is the maximum of the boost values of the +currently RUNNABLE tasks in its RQ. + +This allows sched-DVFS to boost a CPU only while there are boosted tasks ready +to run and switch back to the energy efficient mode as soon as the last boosted +task is dequeued. + + +7. References +============= +[1] http://lwn.net/Articles/552889 +[2] http://lkml.org/lkml/2012/5/18/91 +[3] http://lkml.org/lkml/2015/6/26/620 + From 63c8fad2b06805ef88f1220551289f0a3c3529f1 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 18:11:44 +0100 Subject: [PATCH 204/420] sched/tune: add sysctl interface to define a boost value The current (CFS) scheduler implementation does not allow "to boost" tasks performance by running them at a higher OPP compared to the minimum required to meet their workload demands. To support tasks performance boosting the scheduler should provide a "knob" which allows to tune how much the system is going to be optimised for energy efficiency vs performance. This patch is the first of a series which provides a simple interface to define a tuning knob. 
One system-wide "boost" tunable is exposed via: /proc/sys/kernel/sched_cfs_boost which can be configured in the range [0..100], to define a percentage where: - 0% boost requires to operate in "standard" mode by scheduling tasks at the minimum capacities required by the workload demand - 100% boost requires to push at maximum the task performances, "regardless" of the incurred energy consumption A boost value in between these two boundaries is used to bias the power/performance trade-off, the higher the boost value the more the scheduler is biased toward performance boosting instead of energy efficiency. Change-Id: I59a41725e2d8f9238a61dfb0c909071b53560fc0 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/sched/sysctl.h | 16 ++++++++++++++++ init/Kconfig | 26 ++++++++++++++++++++++++++ kernel/sched/Makefile | 1 + kernel/sched/tune.c | 17 +++++++++++++++++ kernel/sysctl.c | 11 +++++++++++ 5 files changed, 71 insertions(+) create mode 100644 kernel/sched/tune.c diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 596a0e007c62d9..9cad78e74e2c04 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -89,6 +89,22 @@ extern int sysctl_sched_rt_runtime; extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif +#ifdef CONFIG_SCHED_TUNE +extern unsigned int sysctl_sched_cfs_boost; +int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos); +static inline unsigned int get_sysctl_sched_cfs_boost(void) +{ + return sysctl_sched_cfs_boost; +} +#else +static inline unsigned int get_sysctl_sched_cfs_boost(void) +{ + return 0; +} +#endif + #ifdef CONFIG_SCHED_AUTOGROUP extern unsigned int sysctl_sched_autogroup_enabled; #endif diff --git a/init/Kconfig b/init/Kconfig index 2081a4d3d9171f..02539cb00d0342 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1230,6 +1230,32 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_TUNE + bool "Boosting for CFS tasks (EXPERIMENTAL)" + help + This option enables the system-wide support for task boosting. + When this support is enabled a new sysctl interface is exposed to + userspace via: + /proc/sys/kernel/sched_cfs_boost + which allows to set a system-wide boost value in range [0..100]. + + The currently boosting strategy is implemented in such a way that: + - a 0% boost value requires to operate in "standard" mode by + scheduling all tasks at the minimum capacities required by their + workload demand + - a 100% boost value requires to push at maximum the task + performances, "regardless" of the incurred energy consumption + + A boost value in between these two boundaries is used to bias the + power/performance trade-off, the higher the boost value the more the + scheduler is biased toward performance boosting instead of energy + efficiency. + + Since this support exposes a single system-wide knob, the specified + boost value is applied to all (CFS) tasks in the system. + + If unsure, say N. 
+ config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 6990634a518b59..a17c2a58912e6d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c new file mode 100644 index 00000000000000..4c44b1a4ad98aa --- /dev/null +++ b/kernel/sched/tune.c @@ -0,0 +1,17 @@ +#include "sched.h" + +unsigned int sysctl_sched_cfs_boost __read_mostly; + +int +sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + return 0; +} + diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a1854266ed85a7..a76c38295a6a5c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -445,6 +445,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#ifdef CONFIG_SCHED_TUNE + { + .procname = "sched_cfs_boost", + .data = &sysctl_sched_cfs_boost, + .maxlen = sizeof(sysctl_sched_cfs_boost), + .mode = 0644, + .proc_handler = &sysctl_sched_cfs_boost_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", From 2ed8a2859829101abc85770033d33531a82fa5b3 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 18:32:36 +0100 Subject: [PATCH 205/420] sched/fair: add function to convert boost value into "margin" The basic idea of the boost knob is to "artificially inflate" a signal to make a task or logical CPU appears more demanding than it actually is. Independently from the specific signal, a consistent and possibly simple semantic for the concept of "signal boosting" must define: 1. how we translate the boost percentage into a "margin" value to be added to the original signal to inflate 2. what is the meaning of a boost value from a user-space perspective This patch provides the implementation of a possible boost semantic, named "Signal Proportional Compensation" (SPC), where the boost percentage (BP) is used to compute a margin (M) which is proportional to the complement of the original signal (OS): M = BP * (SCHED_LOAD_SCALE - OS) The computed margin then added to the OS to obtain the Boosted Signal (BS) BS = OS + M The proposed boost semantic has these main features: - each signal gets a boost which is proportional to its delta with respect to the maximum available capacity in the system (i.e. SCHED_LOAD_SCALE) - a 100% boosting has a clear understanding from a user-space perspective, since it means simply to run (possibly) "all" tasks at the max OPP - each boosting value means to improve the task performance by a quantity which is proportional to the maximum achievable performance on that system Thus this semantics is somehow forcing a behaviour which is: 50% boosting means to run at half-way between the current and the maximum performance which a task could achieve on that system This patch provides the code to implement a fast integer division to convert a boost percentage (BP) value into a margin (M). 
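A quick sanity check with illustrative numbers (SCHED_LOAD_SCALE = 1024,
original signal = 512, boost = 50%):

    margin  = 50 * (1024 - 512)      = 25600   /* still in percent units */
    margin  = (25600 * 1311) >> 17   =   256   /* ~= 25600 / 100 */
    boosted = 512 + 256              =   768   /* half-way to the maximum */

Since 1311 / 2^17 ~= 0.0100021, the relative error versus an exact division
by 100 is about 0.02%, well within the 0.1% precision targeted in the code
comment below.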
NOTE: this code is suitable for all signals operating in range [0..SCHED_LOAD_SCALE] Change-Id: Ic221df9a266b1ae1584a9dcae6d4d3dc79c96ebb cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42f38e26ac9922..55e02de776d1ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4900,6 +4900,44 @@ static bool cpu_overutilized(int cpu) return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); } +#ifdef CONFIG_SCHED_TUNE + +static unsigned long +schedtune_margin(unsigned long signal, unsigned long boost) +{ + unsigned long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_LOAD_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + + /* + * Fast integer division by constant: + * Constant : (C) = 100 + * Precision : 0.1% (P) = 0.1 + * Reference : C * 100 / P (R) = 100000 + * + * Thus: + * Shift bits : ceil(log(R,2)) (S) = 17 + * Mult const : round(2^S/C) (M) = 1311 + * + * + */ + margin *= 1311; + margin >>= 17; + + return margin; +} + +#endif /* CONFIG_SCHED_TUNE */ + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From 656ef9960f3443cd655bb6416464e171f237a1e5 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 26 Jun 2015 09:55:06 +0100 Subject: [PATCH 206/420] sched/fair: add boosted CPU usage The CPU usage signal is used by the scheduler as an estimation of the overall bandwidth currently allocated on a CPU. When SchedDVFS is in use, this signal affects the selection of the operating points (OPP) required to accommodate all the workload allocated in a CPU. A convenient way to boost the performance of tasks running on a CPU, which is also little intrusive, is to boost the CPU usage signal each time it is used to select an OPP. This patch introduces a new function: get_boosted_cpu_usage(cpu) to return a boosted value for the usage of a specified CPU. The margin added to the original usage is: 1. computed based on the "boosting strategy" in use 2. proportional to the system-wide boost value defined by provided user-space interface The boosted signal is used by SchedDVFS (transparently) each time it requires to get an estimation of the capacity required for a CPU. Change-Id: I92db4404eef236b736be8a9345e6e0018fbf489a cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55e02de776d1ab..3c562bdef560bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3975,6 +3975,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static inline unsigned long boosted_cpu_util(int cpu); + static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -3983,7 +3985,8 @@ static void update_capacity_of(int cpu) return; /* Convert scale-invariant capacity to cpu. 
*/ - req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + req_cap = boosted_cpu_util(cpu); + req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } @@ -4936,8 +4939,36 @@ schedtune_margin(unsigned long signal, unsigned long boost) return margin; } +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + unsigned int boost = get_sysctl_sched_cfs_boost(); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ +static inline unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util(cpu); + unsigned long margin = schedtune_cpu_margin(util); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From 3bd2d37830bc81fbfbbcb1d518898c7317962b42 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 23 Jun 2015 09:17:54 +0100 Subject: [PATCH 207/420] sched/tune: add initial support for CGroups based boosting To support task performance boosting, the usage of a single knob has the advantage to be a simple solution, both from the implementation and the usability standpoint. However, on a real system it can be difficult to identify a single value for the knob which fits the needs of multiple different tasks. For example, some kernel threads and/or user-space background services should be better managed the "standard" way while we still want to be able to boost the performance of specific workloads. In order to improve the flexibility of the task boosting mechanism this patch is the first of a small series which extends the previous implementation to introduce a "per task group" support. This first patch introduces just the basic CGroups support, a new "schedtune" CGroups controller is added which allows to configure different boost value for different groups of tasks. To keep the implementation simple but still effective for a boosting strategy, the new controller: 1. allows only a two layer hierarchy 2. supports only a limited number of boost groups A two layer hierarchy allows to place each task either: a) in the root control group thus being subject to a system-wide boosting value b) in a child of the root group thus being subject to the specific boost value defined by that "boost group" The limited number of "boost groups" supported is mainly motivated by the observation that in a real system it could be useful to have only few classes of tasks which deserve different treatment. For example, background vs foreground or interactive vs low-priority. As an additional benefit, a limited number of boost groups allows also to have a simpler implementation especially for the code required to compute the boost value for CPUs which have runnable tasks belonging to different boost groups. 
Change-Id: I1304e33a8440bfdad9c8bcf8129ff390216f2e32 cc: Tejun Heo cc: Li Zefan cc: Johannes Weiner cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/cgroup_subsys.h | 4 + init/Kconfig | 17 +++ kernel/sched/tune.c | 223 ++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 4 + 4 files changed, 248 insertions(+) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e4a96fb1440356..23befa049e6525 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -15,6 +15,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE) +SUBSYS(schedtune) +#endif + #if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(blkio) #endif diff --git a/init/Kconfig b/init/Kconfig index 02539cb00d0342..7c9e6210c3883b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -985,6 +985,23 @@ config RESOURCE_COUNTERS This option enables controller independent resource accounting infrastructure that works with cgroups. +config CGROUP_SCHEDTUNE + bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)" + depends on SCHED_TUNE + help + This option provides the "schedtune" controller which improves the + flexibility of the task boosting mechanism by introducing the support + to define "per task" boost values. + + This new controller: + 1. allows only a two layers hierarchy, where the root defines the + system-wide boost value and its direct childrens define each one a + different "class of tasks" to be boosted with a different value + 2. supports up to 16 different task classes, each one which could be + configured with a different boost value + + Say N if unsure. + config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 4c44b1a4ad98aa..c4b611424376d4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,230 @@ +#include +#include +#include +#include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; +#ifdef CONFIG_CGROUP_SCHEDTUNE + +/* + * EAS scheduler tunables for task groups. + */ + +/* SchdTune tunables for a group of tasks */ +struct schedtune { + /* SchedTune CGroup subsystem */ + struct cgroup_subsys_state css; + + /* Boost group allocated ID */ + int idx; + + /* Boost value for tasks on that SchedTune CGroup */ + int boost; + +}; + +static inline struct schedtune *css_st(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct schedtune, css) : NULL; +} + +static inline struct schedtune *task_schedtune(struct task_struct *tsk) +{ + return css_st(task_css(tsk, schedtune_cgrp_id)); +} + +static inline struct schedtune *parent_st(struct schedtune *st) +{ + return css_st(st->css.parent); +} + +/* + * SchedTune root control group + * The root control group is used to defined a system-wide boosting tuning, + * which is applied to all tasks in the system. + * Task specific boost tuning could be specified by creating and + * configuring a child control group under the root one. + * By default, system-wide boosting is disabled, i.e. no boosting is applied + * to tasks which are not into a child control group. + */ +static struct schedtune +root_schedtune = { + .boost = 0, +}; + +/* + * Maximum number of boost groups to support + * When per-task boosting is used we still allow only limited number of + * boost groups for two main reasons: + * 1. on a real system we usually have only few classes of workloads which + * make sense to boost with different values (e.g. 
background vs foreground + * tasks, interactive vs low-priority tasks) + * 2. a limited number allows for a simpler and more memory/time efficient + * implementation especially for the computation of the per-CPU boost + * value + */ +#define BOOSTGROUPS_COUNT 4 + +/* Array of configured boostgroups */ +static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { + &root_schedtune, + NULL, +}; + +/* SchedTune boost groups + * Keep track of all the boost groups which impact on CPU, for example when a + * CPU has two RUNNABLE tasks belonging to two different boost groups and thus + * likely with different boost values. + * Since on each system we expect only a limited number of boost groups, here + * we use a simple array to keep track of the metrics required to compute the + * maximum per-CPU boosting value. + */ +struct boost_groups { + /* Maximum boost value for all RUNNABLE tasks on a CPU */ + unsigned boost_max; + struct { + /* The boost for tasks on that boost group */ + unsigned boost; + /* Count of RUNNABLE tasks on that boost group */ + unsigned tasks; + } group[BOOSTGROUPS_COUNT]; +}; + +/* Boost groups affecting each CPU in the system */ +DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); + +static u64 +boost_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->boost; +} + +static int +boost_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 boost) +{ + struct schedtune *st = css_st(css); + + if (boost < 0 || boost > 100) + return -EINVAL; + + st->boost = boost; + if (css == &root_schedtune.css) + sysctl_sched_cfs_boost = boost; + + return 0; +} + +static struct cftype files[] = { + { + .name = "boost", + .read_u64 = boost_read, + .write_u64 = boost_write, + }, + { } /* terminate */ +}; + +static int +schedtune_boostgroup_init(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = st; + + return 0; +} + +static int +schedtune_init(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info(" schedtune configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); + return 0; +} + +static struct cgroup_subsys_state * +schedtune_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct schedtune *st; + int idx; + + if (!parent_css) { + schedtune_init(); + return &root_schedtune.css; + } + + /* Allow only single level hierachies */ + if (parent_css != &root_schedtune.css) { + pr_err("Nested SchedTune boosting groups not allowed\n"); + return ERR_PTR(-ENOMEM); + } + + /* Allow only a limited number of boosting groups */ + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) + if (!allocated_group[idx]) + break; + if (idx == BOOSTGROUPS_COUNT) { + pr_err("Trying to create more than %d SchedTune boosting groups\n", + BOOSTGROUPS_COUNT); + return ERR_PTR(-ENOSPC); + } + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto out; + + /* Initialize per CPUs boost group support */ + st->idx = idx; + if (schedtune_boostgroup_init(st)) + goto release; + + return &st->css; + +release: + kfree(st); +out: + return ERR_PTR(-ENOMEM); +} + +static void +schedtune_boostgroup_release(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = NULL; +} + +static void +schedtune_css_free(struct cgroup_subsys_state *css) +{ + struct schedtune *st = css_st(css); + + 
schedtune_boostgroup_release(st); + kfree(st); +} + +struct cgroup_subsys schedtune_cgrp_subsys = { + .css_alloc = schedtune_css_alloc, + .css_free = schedtune_css_free, + .legacy_cftypes = files, + .early_init = 1, +}; + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a76c38295a6a5c..5630f4282f9552 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -450,7 +450,11 @@ static struct ctl_table kern_table[] = { .procname = "sched_cfs_boost", .data = &sysctl_sched_cfs_boost, .maxlen = sizeof(sysctl_sched_cfs_boost), +#ifdef CONFIG_CGROUP_SCHEDTUNE + .mode = 0444, +#else .mode = 0644, +#endif .proc_handler = &sysctl_sched_cfs_boost_handler, .extra1 = &zero, .extra2 = &one_hundred, From 727e36b8daaeb49668df9b9dae6567ad76feb755 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 12:31:35 +0000 Subject: [PATCH 208/420] sched/tune: compute and keep track of per CPU boost value When per task boosting is enabled, we could have multiple RUNNABLE tasks which are concurrently scheduled on the same CPU but each one with a different boost value. For example, we could have a scenarios like this: Task SchedTune CGroup Boost Value T1 root 0 T2 low-priority 10 T3 interactive 90 In these conditions we expect a CPU to be configured according to a proper "aggregation" of the required boost values for all the tasks currently scheduled on this CPU. A suitable aggregation function is the one which tracks the MAX boost value for all the tasks RUNNABLE on a CPU. This approach allows to always satisfy the most boost demanding task while at the same time: a) boosting all the concurrently scheduled tasks thus reducing potential co-scheduling side-effects on demanding tasks b) reduce the number of frequency switch requested towards SchedDVFS, thus being more friendly to architectures with slow frequency switching times Every time a task enters/exits the RQ of a CPU the max boost value should be updated considering all the boost groups currently "affecting" that CPU, i.e. which have at least one RUNNABLE task currently allocated on that CPU. This patch introduces the required support to keep track of the boost groups currently affecting CPUs. Thanks to the limited number of boost groups, a small and memory efficient per-cpu array of boost groups values (cpu_boost_groups) is used which is updated for each CPU entry by schedtune_boostgroup_update() but only when a schedtune CGroup boost value is updated. However, this is expected to be a rare operation, perhaps done just one time at system boot time. 
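Taking the T1/T2/T3 example above, the MAX aggregation reduces to the following stand-alone sketch (illustrative only, not part of the patch; the layout mirrors the per-CPU struct boost_groups and the task counts are arbitrary):

#include <stdio.h>

#define BOOSTGROUPS_COUNT 4

struct group { unsigned int boost; unsigned int tasks; };

static unsigned int cpu_boost_max(const struct group *g)
{
        unsigned int boost_max = g[0].boost;    /* the root group is always active */
        int idx;

        for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
                /* a group affects the CPU only if it has RUNNABLE tasks there */
                if (!g[idx].tasks)
                        continue;
                if (g[idx].boost > boost_max)
                        boost_max = g[idx].boost;
        }
        return boost_max;
}

int main(void)
{
        /* T1 in root (boost 0), T2 in low-priority (10), T3 in interactive (90) */
        struct group g[BOOSTGROUPS_COUNT] = { {0, 1}, {10, 1}, {90, 1}, {0, 0} };

        printf("boost_max=%u\n", cpu_boost_max(g));     /* 90: T3 drives the CPU */

        g[2].tasks = 0;                                 /* T3 is dequeued */
        printf("boost_max=%u\n", cpu_boost_max(g));     /* 10: T2 now drives it */
        return 0;
}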
Change-Id: I018301814216f0db98f8c7eed2898e2c6d10ef02 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 77 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index c4b611424376d4..be60b8d97dc4de 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -97,6 +97,67 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +static void +schedtune_cpu_update(int cpu) +{ + struct boost_groups *bg; + unsigned boost_max; + int idx; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* The root boost group is always active */ + boost_max = bg->group[0].boost; + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { + /* + * A boost group affects a CPU only if it has + * RUNNABLE tasks on that CPU + */ + if (bg->group[idx].tasks == 0) + continue; + boost_max = max(boost_max, bg->group[idx].boost); + } + + bg->boost_max = boost_max; +} + +static int +schedtune_boostgroup_update(int idx, int boost) +{ + struct boost_groups *bg; + int cur_boost_max; + int old_boost; + int cpu; + + /* Update per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + + /* + * Keep track of current boost values to compute the per CPU + * maximum only when it has been affected by the new value of + * the updated boost group + */ + cur_boost_max = bg->boost_max; + old_boost = bg->group[idx].boost; + + /* Update the boost value of this boost group */ + bg->group[idx].boost = boost; + + /* Check if this update increase current max */ + if (boost > cur_boost_max && bg->group[idx].tasks) { + bg->boost_max = boost; + continue; + } + + /* Check if this update has decreased current max */ + if (cur_boost_max == old_boost && old_boost > boost) + schedtune_cpu_update(cpu); + } + + return 0; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -118,6 +179,9 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, if (css == &root_schedtune.css) sysctl_sched_cfs_boost = boost; + /* Update CPU boost */ + schedtune_boostgroup_update(st->idx, st->boost); + return 0; } @@ -133,9 +197,19 @@ static struct cftype files[] = { static int schedtune_boostgroup_init(struct schedtune *st) { + struct boost_groups *bg; + int cpu; + /* Keep track of allocated boost groups */ allocated_group[st->idx] = st; + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + bg->group[st->idx].boost = 0; + bg->group[st->idx].tasks = 0; + } + return 0; } @@ -203,6 +277,9 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) static void schedtune_boostgroup_release(struct schedtune *st) { + /* Reset this boost group */ + schedtune_boostgroup_update(st->idx, 0); + /* Keep track of allocated boost groups */ allocated_group[st->idx] = NULL; } From 6725b9e9cf439438f7833c5feb1ea081e812d7dc Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 7 Jul 2015 15:33:20 +0100 Subject: [PATCH 209/420] sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value When per-task boosting is enabled, every time a task enters/exits a CPU its boost value could impact the currently selected OPP for that CPU. Thus, the "aggregated" boost value for that CPU potentially needs to be updated to match the current maximum boost value among all the tasks currently RUNNABLE on that CPU. 
This patch introduces the required support to keep track of which boost groups are impacting a CPU. Each time a task is enqueued/dequeued to/from a CPU its boost group is used to increment a per-cpu counter of RUNNABLE tasks on that CPU. Only when the number of runnable tasks for a specific boost group becomes 1 or 0 the corresponding boost group changes its effects on that CPU, specifically: a) boost_group::tasks == 1: this boost group starts to impact the CPU b) boost_group::tasks == 0: this boost group stops to impact the CPU In each of these two conditions the aggregation function: sched_cpu_update(cpu) could be required to run in order to identify the new maximum boost value required for the CPU. The proposed patch minimizes the number of times the aggregation function is executed while still providing the required support to always boost a CPU to the maximum boost value required by all its currently RUNNABLE tasks. Change-Id: I345e36f349381a88f0caa14abb12f9f6b06b822a cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 17 +++++++--- kernel/sched/tune.c | 82 +++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 23 +++++++++++++ 3 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 kernel/sched/tune.h diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c562bdef560bf..79f5355e532829 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -34,6 +34,7 @@ #include #include "sched.h" +#include "tune.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -4041,6 +4042,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + schedtune_enqueue_task(p, cpu_of(rq)); + /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4110,6 +4113,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { sub_nr_running(rq, 1); + schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -4940,10 +4944,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) } static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost = get_sysctl_sched_cfs_boost(); + unsigned int boost; +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_cpu_boost(cpu); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif if (boost == 0) return 0; @@ -4953,7 +4962,7 @@ schedtune_cpu_margin(unsigned long util) #else /* CONFIG_SCHED_TUNE */ static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } @@ -4964,7 +4973,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util); + unsigned long margin = schedtune_cpu_margin(util, cpu); return util + margin; } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index be60b8d97dc4de..ccc3540dcaf2b0 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "sched.h" @@ -158,6 +159,87 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +static inline void +schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) +{ + struct boost_groups *bg; + int tasks; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* Update boosted tasks count while avoiding to make it 
negative */ + if (task_count < 0 && bg->group[idx].tasks <= -task_count) + bg->group[idx].tasks = 0; + else + bg->group[idx].tasks += task_count; + + /* Boost group activation or deactivation on that RQ */ + tasks = bg->group[idx].tasks; + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_enqueue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, 1); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_dequeue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + * The last dequeue will be done by cgroup exit() callback. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, -1); +} + +int schedtune_cpu_boost(int cpu) +{ + struct boost_groups *bg; + + bg = &per_cpu(cpu_boost_groups, cpu); + return bg->boost_max; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h new file mode 100644 index 00000000000000..561b5171a19b51 --- /dev/null +++ b/kernel/sched/tune.h @@ -0,0 +1,23 @@ + +#ifdef CONFIG_SCHED_TUNE + +#ifdef CONFIG_CGROUP_SCHEDTUNE + +int schedtune_cpu_boost(int cpu); + +void schedtune_enqueue_task(struct task_struct *p, int cpu); +void schedtune_dequeue_task(struct task_struct *p, int cpu); + +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +#else /* CONFIG_SCHED_TUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_SCHED_TUNE */ From 68e8942fc327d243232e390f90c50c4420518eb8 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:31:53 +0000 Subject: [PATCH 210/420] sched/fair: add boosted task utilization The task utilization signal, which is derived from PELT signals and properly scaled to be architecture and frequency invariant, is used by EAS as an estimation of the task requirements in terms of CPU bandwidth. When the energy aware scheduler is in use, this signal affects the CPU selection. Thus, a convenient way to bias that decision, which is also little intrusive, is to boost the task utilization signal each time it is required to support them. This patch introduces the new function: boosted_task_util(task) which returns a boosted value for the utilization of the specified task. The margin added to the original utilization is: 1. computed based on the "boosting strategy" in use 2. 
proportional to boost value defined either by the sysctl interface, when global boosting is in use, or the "taskgroup" value, when per-task boosting is enabled. The boosted signal is used by EAS a. transparently, via its integration into the task_fits() function b. explicitly, in the energy-aware wakeup path Change-Id: I4d801547278b0c34f85de7cc2103c1337c08029f Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++-- kernel/sched/tune.c | 14 ++++++++++++++ kernel/sched/tune.h | 1 + 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79f5355e532829..f1e9fecd1dfefb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4874,11 +4874,13 @@ static inline unsigned long task_util(struct task_struct *p) unsigned int capacity_margin = 1280; /* ~20% margin */ +static inline unsigned long boosted_task_util(struct task_struct *task); + static inline bool __task_fits(struct task_struct *p, int cpu, int util) { unsigned long capacity = capacity_of(cpu); - util += task_util(p); + util += boosted_task_util(p); return (capacity * 1024) > (util * capacity_margin); } @@ -4959,6 +4961,27 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } +static inline unsigned long +schedtune_task_margin(struct task_struct *task) +{ + unsigned int boost; + unsigned long util; + unsigned long margin; + +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return 0; + + util = task_util(task); + margin = schedtune_margin(util, boost); + + return margin; +} + #else /* CONFIG_SCHED_TUNE */ static inline unsigned int @@ -4967,6 +4990,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return 0; } +static inline unsigned int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ static inline unsigned long @@ -4978,6 +5007,15 @@ boosted_cpu_util(int cpu) return util + margin; } +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util(task); + unsigned long margin = schedtune_task_margin(task); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5212,7 +5250,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. 
*/ - int new_util = cpu_util(i) + task_util(p); + int new_util = cpu_util(i) + boosted_task_util(p); if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index ccc3540dcaf2b0..3253a8732ba575 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -240,6 +240,20 @@ int schedtune_cpu_boost(int cpu) return bg->boost_max; } +int schedtune_task_boost(struct task_struct *p) +{ + struct schedtune *st; + int task_boost; + + /* Get task boost value */ + rcu_read_lock(); + st = task_schedtune(p); + task_boost = st->boost; + rcu_read_unlock(); + + return task_boost; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 561b5171a19b51..d756ce7b06e08f 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -4,6 +4,7 @@ #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); +int schedtune_task_boost(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); From 7bbe0b833740309cb2287d3938910f7bf35cfe0e Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:35:13 +0000 Subject: [PATCH 211/420] sched/fair: keep track of energy/capacity variations The current EAS implementation does not allow "to boost" tasks performances, for example by running them at an higher OPP (or a more capable CPU), even if that could require a "reasonable" increase in energy consumption. To defined how much reasonable is an energy increase with respect to a required boost value, it is required to define and compute a trade-off between the expected energy and performance variations. However, the current EAS implementation considers only energy variations while completely disregard the impact on performance for the selection of a certain schedule candidate. This patch extends the eenv energy environment to keep track of both energy and performance deltas which are implied by the activation of a schedule candidate. The performance variation is estimated considering the different capacities of the CPUs in which the task could be scheduled. The idea is that while running on a CPU with higher capacity (e.g. higher operating point) the task could (potentially) complete faster and thus get better performance. 
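Concretely, for a single-CPU group the bookkeeping described above boils down to the following (illustrative only, not part of the patch; the capacity figures are arbitrary cap_states[] values):

#include <stdio.h>

struct cap_info {
        int before;     /* capacity of the src CPU before the task move */
        int after;      /* capacity of the dst CPU after the task move */
        int delta;      /* estimated performance variation of the move */
};

int main(void)
{
        /* e.g. a task moved from a CPU running at capacity 430 to one at 1024 */
        struct cap_info cap = { .before = 430, .after = 1024 };

        cap.delta = cap.after - cap.before;
        printf("cap.delta=%d\n", cap.delta);    /* +594: expected performance gain */
        return 0;
}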
Change-Id: Ia2b0a1659aea4c66bf87cba3fba6dbf03543b275 Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f1e9fecd1dfefb..de5a0a3710c6c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4532,6 +4532,16 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + struct { + int before; + int after; + int diff; + } nrg; + struct { + int before; + int after; + int delta; + } cap; }; /* @@ -4698,6 +4708,22 @@ static int sched_group_energy(struct energy_env *eenv) eenv->sg_cap = sg; cap_idx = find_new_capacity(eenv, sg->sge); + + if (sg->group_weight == 1) { + /* Remove capacity of src CPU (before task move) */ + if (eenv->util_delta == 0 && + cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { + eenv->cap.before = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta -= eenv->cap.before; + } + /* Add capacity of dst CPU (after task move) */ + if (eenv->util_delta != 0 && + cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { + eenv->cap.after = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta += eenv->cap.after; + } + } + idle_idx = group_idle_state(sg); group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) @@ -4746,6 +4772,8 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .nrg = { 0, 0, 0 }, + .cap = { 0, 0, 0 }, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -4767,13 +4795,21 @@ static int energy_diff(struct energy_env *eenv) return 0; /* Invalid result abort */ energy_before += eenv_before.energy; + /* Keep track of SRC cpu (before) capacity */ + eenv->cap.before = eenv_before.cap.before; + eenv->cap.delta = eenv_before.cap.delta; + if (sched_group_energy(eenv)) return 0; /* Invalid result abort */ energy_after += eenv->energy; } } while (sg = sg->next, sg != sd->groups); - return energy_after-energy_before; + eenv->nrg.before = energy_before; + eenv->nrg.after = energy_after; + eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + + return eenv->nrg.diff; } /* From 91da8a093246568cc3a1dc6ddfd7bdd4f565dcc1 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 12 Jan 2016 18:12:13 +0000 Subject: [PATCH 212/420] sched/tune: add support to compute normalized energy The current EAS implementation considers only energy variations, while it disregards completely the impact on performance for the selection of a certain schedule candidate. Moreover, it also makes its decision based on the "absolute" value of expected energy variations. In order to properly define a trade-off strategy between increased energy consumption and performances benefits it is required to compare energy variations with performance variations. Thus, both performance and energy metrics must be expressed in comparable units. While the performance variations are expressed in terms of capacity deltas, which are defined in the range [0..SCHED_LOAD_SCALE], the same scale is not used for energy variations. This patch introduces the function: schedtune_normalize_energy(energy_diff) which returns a normalized value in the same range of capacity variations, i.e. [0..SCHED_LOAD_SCALE]. A proper set of energy normalization constants are required to provide a fast division by a constant during the normalziation of the energy_diff. The value of these constants depends on the specific energy model and topology of a target device. 
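In practice the normalization is a scale to SCHED_LOAD_SCALE followed by a division by the system-wide energy span, as in this stand-alone sketch (illustrative only, not part of the patch; SCHED_LOAD_SHIFT is assumed to be 10 and the min/max power figures are arbitrary; the kernel derives them from the energy model at boot and uses reciprocal_divide() instead of a runtime division):

#include <stdio.h>
#include <stdlib.h>

#define SCHED_LOAD_SHIFT 10

static const long min_power = 400, max_power = 4000;   /* hypothetical EM bounds */

static int normalize_energy(int energy_diff)
{
        /* do the scaling on positive numbers to increase the range */
        unsigned long normalized = abs(energy_diff);

        normalized <<= SCHED_LOAD_SHIFT;                /* scale by energy magnitude */
        normalized /= (max_power - min_power);          /* normalize on the max delta */

        return energy_diff < 0 ? -(int)normalized : (int)normalized;
}

int main(void)
{
        /* an energy increase of 900 maps to 256 on the [0..1024] scale */
        printf("%d\n", normalize_energy(900));
        return 0;
}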
Thus, this patch provides also the required support for the computation at boot time of this set of variables. Change-Id: I7a740ebc05383c62de6079c0cf866f05f8ac5ef4 Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 321 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 7 + 2 files changed, 328 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 3253a8732ba575..f4fbbcd28373f4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,9 @@ #include #include +#include #include #include +#include #include #include @@ -9,6 +11,84 @@ unsigned int sysctl_sched_cfs_boost __read_mostly; +/* + * System energy normalization constants + */ +static struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +} schedtune_target_nrg; + +/* Performance Boost region (B) threshold params */ +static int perf_boost_idx; + +/* Performance Constraint region (C) threshold params */ +static int perf_constrain_idx; + +/** + * Performance-Energy (P-E) Space thresholds constants + */ +struct threshold_params { + int nrg_gain; + int cap_gain; +}; + +/* + * System specific P-E space thresholds constants + */ +static struct threshold_params +threshold_gains[] = { + { 0, 4 }, /* >= 0% */ + { 0, 4 }, /* >= 10% */ + { 1, 4 }, /* >= 20% */ + { 2, 4 }, /* >= 30% */ + { 3, 4 }, /* >= 40% */ + { 4, 3 }, /* >= 50% */ + { 4, 2 }, /* >= 60% */ + { 4, 1 }, /* >= 70% */ + { 4, 0 }, /* >= 80% */ + { 4, 0 } /* >= 90% */ +}; + +static int +__schedtune_accept_deltas(int nrg_delta, int cap_delta, + int perf_boost_idx, int perf_constrain_idx) +{ + int payoff = -INT_MAX; + + /* Performance Boost (B) region */ + if (nrg_delta > 0 && cap_delta > 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta < cap_gain / nrg_gain + * which is: + * nrg_delta * cap_gain > cap_delta * nrg_gain + */ + payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; + payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + return payoff; + } + + /* Performance Constraint (C) region */ + if (nrg_delta < 0 && cap_delta < 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta > cap_gain / nrg_gain + * which is: + * cap_delta * nrg_gain > nrg_delta * cap_gain + */ + payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + return payoff; + } + + /* Default: reject schedule candidate */ + return payoff; +} + #ifdef CONFIG_CGROUP_SCHEDTUNE /* @@ -26,6 +106,11 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; + /* Performance Boost (B) region threshold params */ + int perf_boost_idx; + + /* Performance Constraint (C) region threshold params */ + int perf_constrain_idx; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -55,8 +140,37 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, + .perf_boost_idx = 0, + .perf_constrain_idx = 0, }; +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + struct schedtune *ct; + int perf_boost_idx; + int perf_constrain_idx; + + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + /* Get task specific perf Boost/Constraints indexes */ + 
rcu_read_lock(); + ct = task_schedtune(task); + perf_boost_idx = ct->perf_boost_idx; + perf_constrain_idx = ct->perf_constrain_idx; + rcu_read_unlock(); + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + /* * Maximum number of boost groups to support * When per-task boosting is used we still allow only limited number of @@ -396,6 +510,24 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + #endif /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -408,6 +540,195 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* Performance Boost (B) region threshold params */ + perf_boost_idx = sysctl_sched_cfs_boost; + perf_boost_idx /= 10; + + /* Performance Constraint (C) region threshold params */ + perf_constrain_idx = 100 - sysctl_sched_cfs_boost; + perf_constrain_idx /= 10; + + return 0; +} + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +int +schedtune_normalize_energy(int energy_diff) +{ + u32 normalized_nrg; + int max_delta; + +#ifdef CONFIG_SCHED_DEBUG + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? 
-normalized_nrg : normalized_nrg; +} + +#ifdef CONFIG_SCHED_DEBUG +static void +schedtune_test_nrg(unsigned long delta_pwr) +{ + unsigned long test_delta_pwr; + unsigned long test_norm_pwr; + int idx; + + /* + * Check normalization constants using some constant system + * energy values + */ + pr_info("schedtune: verify normalization constants...\n"); + for (idx = 0; idx < 6; ++idx) { + test_delta_pwr = delta_pwr >> idx; + + /* Normalize on max energy for target platform */ + test_norm_pwr = reciprocal_divide( + test_delta_pwr << SCHED_LOAD_SHIFT, + schedtune_target_nrg.rdiv); + + pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n", + idx, test_delta_pwr, test_norm_pwr); + } +} +#else +#define schedtune_test_nrg(delta_pwr) +#endif + +/* + * Compute the min/max power consumption of a cluster and all its CPUs + */ +static void +schedtune_add_cluster_nrg( + struct sched_domain *sd, + struct sched_group *sg, + struct target_nrg *ste) +{ + struct sched_domain *sd2; + struct sched_group *sg2; + + struct cpumask *cluster_cpus; + char str[32]; + + unsigned long min_pwr; + unsigned long max_pwr; + int cpu; + + /* Get Cluster energy using EM data for the first CPU */ + cluster_cpus = sched_group_cpus(sg); + snprintf(str, 32, "CLUSTER[%*pbl]", + cpumask_pr_args(cluster_cpus)); + + min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power; + max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power; + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Keep track of this cluster's energy in the computation of the + * overall system energy + */ + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + /* Get CPU energy using EM data for each CPU in the group */ + for_each_cpu(cpu, cluster_cpus) { + /* Get a SD view for the specific CPU */ + for_each_domain(cpu, sd2) { + /* Get the CPU group */ + sg2 = sd2->groups; + min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power; + max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power; + + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + snprintf(str, 32, "CPU[%d]", cpu); + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Assume we have EM data only at the CPU and + * the upper CLUSTER level + */ + BUG_ON(!cpumask_equal( + sched_group_cpus(sg), + sched_group_cpus(sd2->parent->groups) + )); + break; + } + } +} + +/* + * Initialize the constants required to compute normalized energy. + * The values of these constants depends on the EM data for the specific + * target system and topology. + * Thus, this function is expected to be called by the code + * that bind the EM to the topology information. + */ +static int +schedtune_init_late(void) +{ + struct target_nrg *ste = &schedtune_target_nrg; + unsigned long delta_pwr = 0; + struct sched_domain *sd; + struct sched_group *sg; + + pr_info("schedtune: init normalization constants...\n"); + ste->max_power = 0; + ste->min_power = 0; + + rcu_read_lock(); + + /* + * When EAS is in use, we always have a pointer to the highest SD + * which provides EM data. 
+ */ + sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); + if (!sd) { + pr_info("schedtune: no energy model data\n"); + goto nodata; + } + + sg = sd->groups; + do { + schedtune_add_cluster_nrg(sd, sg, ste); + } while (sg = sg->next, sg != sd->groups); + + rcu_read_unlock(); + + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + "SYSTEM", ste->min_power, ste->max_power); + + /* Compute normalization constants */ + delta_pwr = ste->max_power - ste->min_power; + ste->rdiv = reciprocal_value(delta_pwr); + pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n", + ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); + + schedtune_test_nrg(delta_pwr); return 0; + +nodata: + rcu_read_unlock(); + return -EINVAL; } +late_initcall(schedtune_init_late); diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index d756ce7b06e08f..da1f7b288aa09d 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -9,6 +9,10 @@ int schedtune_task_boost(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); +int schedtune_normalize_energy(int energy); +int schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task); + #else /* CONFIG_CGROUP_SCHEDTUNE */ #define schedtune_enqueue_task(task, cpu) do { } while (0) @@ -21,4 +25,7 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) +#define schedtune_normalize_energy(energy) energy +#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta + #endif /* CONFIG_SCHED_TUNE */ From 0a095b9b270185d8e9717d2228cf7873771c9fe4 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 15 Jan 2016 15:48:03 +0000 Subject: [PATCH 213/420] sched/fair: filter energy_diff() based on energy_payoff value Once the SchedTune support is enabled and the CPU bandwidth demand of a task is boosted, we could expect increased energy consumptions which are balanced by corresponding increases of tasks performance. However, the current implementation of the energy_diff() function accepts all and _only_ the schedule candidates which results into a reduced expected system energy, which works against the boosting strategy. This patch links the energy_diff() function with the "energy payoff" engine provided by SchedTune. The energy variation computed by the energy_diff() function is now filtered using the SchedTune support to evaluated the energy payoff for a boosted task. With that patch, the energy_diff() function is going to reported as "acceptable schedule candidate" only the schedule candidate which corresponds to a positive energy_payoff. 
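The acceptance test itself reduces to the following stand-alone model (illustrative only, not part of the patch; a single nrg_gain/cap_gain pair is used here for both regions, while the kernel picks separate perf_boost_idx/perf_constrain_idx entries from threshold_gains[], and the delta values below are arbitrary):

#include <stdio.h>
#include <limits.h>

static const int nrg_gain = 2, cap_gain = 4;    /* e.g. the ">= 30%" thresholds */

static int payoff(int nrg_delta, int cap_delta)
{
        if (nrg_delta < 0 && cap_delta > 0)     /* Optimal (O): less energy, more capacity */
                return INT_MAX;
        if (nrg_delta > 0 && cap_delta < 0)     /* Suboptimal (S): more energy, less capacity */
                return -INT_MAX;
        if (nrg_delta > 0 && cap_delta > 0)     /* Performance Boost (B) region */
                return nrg_delta * cap_gain - cap_delta * nrg_gain;
        if (nrg_delta < 0 && cap_delta < 0)     /* Performance Constraint (C) region */
                return cap_delta * nrg_gain - nrg_delta * cap_gain;
        return -INT_MAX;                        /* default: reject the candidate */
}

int main(void)
{
        /* +300 normalized energy for +400 capacity: positive payoff, candidate accepted */
        printf("payoff=%d\n", payoff(300, 400));
        return 0;
}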
Change-Id: Ib0a5c5f37cfd6dd4d244cac633d3a36b99bae0c6 Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index de5a0a3710c6c8..b7f51c48146cd0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4532,9 +4532,12 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + int payoff; + struct task_struct *task; struct { int before; int after; + int delta; int diff; } nrg; struct { @@ -4755,6 +4758,44 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +#ifdef CONFIG_SCHED_TUNE +static int energy_diff_evaluate(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff_evaluate(eenv) eenv->nrg.diff +#endif + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilisation amount, source, and @@ -4772,7 +4813,7 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, - .nrg = { 0, 0, 0 }, + .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, }; @@ -4808,8 +4849,9 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.before = energy_before; eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + eenv->payoff = 0; - return eenv->nrg.diff; + return energy_diff_evaluate(eenv); } /* @@ -5307,6 +5349,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) .util_delta = task_util(p), .src_cpu = task_cpu(p), .dst_cpu = target_cpu, + .task = p, }; /* Not enough spare capacity on previous cpu */ From 94b81d0308749685be95524b502b3fd5714180b9 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 15 Jan 2016 19:16:14 +0000 Subject: [PATCH 214/420] FIXUP: sched/tune: properly account of exiting tasks (only for kernel < v4.4) Once a task is going to end the CGroup::exit() callback is required to properly update the counting of tasks active in the original boostgroup. 
NOTE: This callback is NOT required after the API changes introduced in kernels >=v4.4 by this patch: 2e91fa7f6 cgroup: keep zombies associated with their original cgroups Change-Id: I4dc6b77a9621003ac2269f30ea44c8b024fbc249 Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f4fbbcd28373f4..0f3f99d2045d3c 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -503,9 +503,21 @@ schedtune_css_free(struct cgroup_subsys_state *css) kfree(st); } +static void +schedtune_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *tsk) +{ + struct schedtune *old_st = css_st(old_css); + int cpu = task_cpu(tsk); + + schedtune_tasks_update(tsk, cpu, old_st->idx, -1); +} + struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, + .exit = schedtune_exit, .legacy_cftypes = files, .early_init = 1, }; From 4dca8707d8c8575b5d8912f396767554cf3062e3 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:06:24 +0000 Subject: [PATCH 215/420] DEBUG: sched: add tracepoint for cpu/freq scale invariance Change-Id: Ia3138469039c74bbb34486135da9f1ec033842c2 Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 24 ++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 4e82758f43b2af..173d8d460998a0 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -599,6 +599,30 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); + +TRACE_EVENT(sched_contrib_scale_f, + + TP_PROTO(int cpu, unsigned long freq_scale_factor, + unsigned long cpu_scale_factor), + + TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned long, freq_scale_factor) + __field(unsigned long, cpu_scale_factor) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->freq_scale_factor = freq_scale_factor; + __entry->cpu_scale_factor = cpu_scale_factor; + ), + + TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu", + __entry->cpu, __entry->freq_scale_factor, + __entry->cpu_scale_factor) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b7f51c48146cd0..318d0507ef88a4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2349,6 +2349,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu); /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; From f8d8705009327a23adf749c5a03f1d36966a9363 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:27 +0000 Subject: [PATCH 216/420] DEBUG: sched: add tracepoint for task load/util signals Change-Id: Ia40312601bc15570de3dd84bd72dc9c6000ee19c Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 43 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 3 +++ 2 files changed, 46 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 173d8d460998a0..fdc5a6d0427d31 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -623,6 +623,49 @@ TRACE_EVENT(sched_contrib_scale_f, 
__entry->cpu, __entry->freq_scale_factor, __entry->cpu_scale_factor) ); + +/* + * Tracepoint for accounting sched averages for tasks. + */ +TRACE_EVENT(sched_load_avg_task, + + TP_PROTO(struct task_struct *tsk, struct sched_avg *avg), + + TP_ARGS(tsk, avg), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + __field( u64, load_sum ) + __field( u32, util_sum ) + __field( u32, period_contrib ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = task_cpu(tsk); + __entry->load_avg = avg->load_avg; + __entry->util_avg = avg->util_avg; + __entry->load_sum = avg->load_sum; + __entry->util_sum = avg->util_sum; + __entry->period_contrib = avg->period_contrib; + ), + + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu" + " util_sum=%u period_contrib=%u", + __entry->comm, + __entry->pid, + __entry->cpu, + __entry->load_avg, + __entry->util_avg, + (u64)__entry->load_sum, + (u32)__entry->util_sum, + (u32)__entry->period_contrib) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 318d0507ef88a4..cd4256ab383822 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2545,6 +2545,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) + trace_sched_load_avg_task(task_of(se), &se->avg); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From 0a6854564cdcbc6ddf084a24efb95f7dd2d57fe2 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:48 +0000 Subject: [PATCH 217/420] DEBUG: sched: add tracepoint for CPU load/util signals Change-Id: Ibe7c3f8d17f14e9466df215b10f33b065520b7b4 Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 25 +++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 26 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index fdc5a6d0427d31..decde31af75370 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -666,6 +666,31 @@ TRACE_EVENT(sched_load_avg_task, (u32)__entry->util_sum, (u32)__entry->period_contrib) ); + +/* + * Tracepoint for accounting sched averages for cpus. 
+ */ +TRACE_EVENT(sched_load_avg_cpu, + + TP_PROTO(int cpu, struct cfs_rq *cfs_rq), + + TP_ARGS(cpu, cfs_rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->load_avg = cfs_rq->avg.load_avg; + __entry->util_avg = cfs_rq->avg.util_avg; + ), + + TP_printk("cpu=%d load_avg=%lu util_avg=%lu", + __entry->cpu, __entry->load_avg, __entry->util_avg) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cd4256ab383822..37ec158e5f3c79 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2548,6 +2548,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); + trace_sched_load_avg_cpu(cpu, cfs_rq); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From cadd6e99b407a0025e4e9d093622e895817330ad Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 30 Apr 2015 17:35:23 +0100 Subject: [PATCH 218/420] DEBUG: sched,cpufreq: add cpu_capacity change tracepoint This is useful when we want to compare cpu utilization and cpu curr capacity side by side. Change-Id: Icd0930d11068fcb7d2b6a9a48e7ed974904e1081 Signed-off-by: Juri Lelli --- drivers/cpufreq/cpufreq.c | 4 ++++ include/linux/sched.h | 2 ++ include/trace/events/power.h | 8 ++++++++ kernel/sched/fair.c | 11 +++++++++++ kernel/sched/sched.h | 11 ----------- 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 1e321ca24f252b..19c816bb9bdef6 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include /** @@ -404,6 +405,7 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { + int cpu; /* * Catch double invocations of _begin() which lead to self-deadlock. 
@@ -432,6 +434,8 @@ void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); + for_each_cpu(cpu, policy->cpus) + trace_cpu_capacity(capacity_curr_of(cpu), cpu); cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 69e8ba15b6d8d7..35268290a7745c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -938,6 +938,8 @@ struct sched_group_energy { struct capacity_state *cap_states; /* ptr to capacity state array */ }; +unsigned long capacity_curr_of(int cpu); + struct sched_group; struct sched_domain { diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 33babbb119e118..28ad66f6deedcf 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -136,7 +136,15 @@ TRACE_EVENT(cpu_frequency_limits, (unsigned long)__entry->cpu_id) ); +DEFINE_EVENT(cpu, cpu_capacity, + + TP_PROTO(unsigned int capacity, unsigned int cpu_id), + + TP_ARGS(capacity, cpu_id) +); + TRACE_EVENT(device_pm_callback_start, + TP_PROTO(struct device *dev, const char *pm_ops, int event), TP_ARGS(dev, pm_ops, event), diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 37ec158e5f3c79..aaa522c2385a40 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4524,6 +4524,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 413d2c04be57a6..a69eb5a1324731 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1497,17 +1497,6 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static inline unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHED From d4df36b9cb880b9795feca6cd893c7220a10d27e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:25:50 +0000 Subject: [PATCH 219/420] DEBUG: sched: add energy procfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes the energy data available via procfs. The related files are placed as sub-directory named 'energy' inside the /proc/sys/kernel/sched_domain/cpuX/domainY/groupZ directory for those cpu/domain/group tuples which have energy information. The following example depicts the contents of /proc/sys/kernel/sched_domain/cpu0/domain0/group[01] for a system which has energy information attached to domain level 0. 
├── cpu0 │ ├── domain0 │ │ ├── busy_factor │ │ ├── busy_idx │ │ ├── cache_nice_tries │ │ ├── flags │ │ ├── forkexec_idx │ │ ├── group0 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── group1 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── idle_idx │ │ ├── imbalance_pct │ │ ├── max_interval │ │ ├── max_newidle_lb_cost │ │ ├── min_interval │ │ ├── name │ │ ├── newidle_idx │ │ └── wake_idx │ └── domain1 │ ├── busy_factor │ ├── busy_idx │ ├── cache_nice_tries │ ├── flags │ ├── forkexec_idx │ ├── idle_idx │ ├── imbalance_pct │ ├── max_interval │ ├── max_newidle_lb_cost │ ├── min_interval │ ├── name │ ├── newidle_idx │ └── wake_idx The files 'nr_idle_states' and 'nr_cap_states' contain a scalar value whereas 'idle_states' and 'cap_states' contain a vector of power consumption at this idle state respectively (compute capacity, power consumption) at this capacity state. Change-Id: I7457d4073b4797336a30711407220bd2eb111396 Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d307a0e898df0..97a6d57519f973 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5173,10 +5173,61 @@ set_table_entry(struct ctl_table *entry, } } +static struct ctl_table * +sd_alloc_ctl_energy_table(struct sched_group_energy *sge) +{ + struct ctl_table *table = sd_alloc_ctl_entry(5); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power, + sge->nr_idle_states*sizeof(struct idle_state), 0644, + proc_doulongvec_minmax, false); + set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap, + sge->nr_cap_states*sizeof(struct capacity_state), 0644, + proc_doulongvec_minmax, false); + + return table; +} + +static struct ctl_table * +sd_alloc_ctl_group_table(struct sched_group *sg) +{ + struct ctl_table *table = sd_alloc_ctl_entry(2); + + if (table == NULL) + return NULL; + + table->procname = kstrdup("energy", GFP_KERNEL); + table->mode = 0555; + table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge); + + return table; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table; + unsigned int nr_entries = 14; + + int i = 0; + struct sched_group *sg = sd->groups; + + if (sg->sge) { + int nr_sgs = 0; + + do {} while (nr_sgs++, sg = sg->next, sg != sd->groups); + + nr_entries += nr_sgs; + } + + table = sd_alloc_ctl_entry(nr_entries); if (table == NULL) return NULL; @@ -5209,7 +5260,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ + sg = sd->groups; + if (sg->sge) { + char buf[32]; + struct ctl_table *entry = &table[13]; + + do { + snprintf(buf, 32, "group%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_group_table(sg); + } while (entry++, i++, sg = sg->next, sg != 
sd->groups); + } + /* &table[nr_entries-1] is terminator */ return table; } From 0c34a39445ef52bcf07d79d62ea569d10e5bc958 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 11:16:57 +0000 Subject: [PATCH 220/420] DEBUG: sched/tune: add tracepoint for SchedTune configuration update Change-Id: I79bf835a1c109d4e1d7c71c2a0e86e2a21c0874b Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 34 ++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 8 ++++++++ 2 files changed, 42 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index decde31af75370..59ea35adb77df0 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -691,6 +691,40 @@ TRACE_EVENT(sched_load_avg_cpu, TP_printk("cpu=%d load_avg=%lu util_avg=%lu", __entry->cpu, __entry->load_avg, __entry->util_avg) ); + +/* + * Tracepoint for sched_tune_config settings + */ +TRACE_EVENT(sched_tune_config, + + TP_PROTO(int boost, int pb_nrg_gain, int pb_cap_gain, int pc_nrg_gain, int pc_cap_gain), + + TP_ARGS(boost, pb_nrg_gain, pb_cap_gain, pc_nrg_gain, pc_cap_gain), + + TP_STRUCT__entry( + __field( int, boost ) + __field( int, pb_nrg_gain ) + __field( int, pb_cap_gain ) + __field( int, pc_nrg_gain ) + __field( int, pc_cap_gain ) + ), + + TP_fast_assign( + __entry->boost = boost; + __entry->pb_nrg_gain = pb_nrg_gain; + __entry->pb_cap_gain = pb_cap_gain; + __entry->pc_nrg_gain = pc_nrg_gain; + __entry->pc_cap_gain = pc_cap_gain; + ), + + TP_printk("boost=%d " + "pb_nrg_gain=%d pb_cap_gain=%d " + "pc_nrg_gain=%d pc_cap_gain=%d", + __entry->boost, + __entry->pb_nrg_gain, __entry->pb_cap_gain, + __entry->pc_nrg_gain, __entry->pc_cap_gain) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 0f3f99d2045d3c..0b77684d9e3af2 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -7,6 +7,8 @@ #include #include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; @@ -392,6 +394,12 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); + trace_sched_tune_config(st->boost, + threshold_gains[st->perf_boost_idx].nrg_gain, + threshold_gains[st->perf_boost_idx].cap_gain, + threshold_gains[st->perf_constrain_idx].nrg_gain, + threshold_gains[st->perf_constrain_idx].cap_gain); + return 0; } From dbd5f9b65ed371ee3e9cd67a2899b36e0e6f4f24 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:51:07 +0100 Subject: [PATCH 221/420] DEBUG: sched/tune: add tracepoint for CPU boost signal Change-Id: Ib4f9391c105d439acdb75bfc5b4c9506ad7d2956 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 29 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 59ea35adb77df0..42aff33097d0da 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -725,6 +725,33 @@ TRACE_EVENT(sched_tune_config, __entry->pc_nrg_gain, __entry->pc_cap_gain) ); +/* + * Tracepoint for accounting CPU boosted utilization + */ +TRACE_EVENT(sched_boost_cpu, + + TP_PROTO(int cpu, unsigned long util, unsigned long margin), + + TP_ARGS(cpu, util, margin), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->util = util; + __entry->margin = 
margin; + ), + + TP_printk("cpu=%d util=%lu margin=%lu", + __entry->cpu, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aaa522c2385a40..862caf36a19eaa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5098,6 +5098,8 @@ boosted_cpu_util(int cpu) unsigned long util = cpu_util(cpu); unsigned long margin = schedtune_cpu_margin(util, cpu); + trace_sched_boost_cpu(cpu, util, margin); + return util + margin; } From a852ecc36c776e79326f7a02b5a80be786adf827 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 11:34:25 +0000 Subject: [PATCH 222/420] DEBUG: sched/tune: add tracepoint for schedtune_tasks_update() values Change-Id: I66dd659b519da093d34ceb92abed49e885afa2fd Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 37 ++++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 4 ++++ 2 files changed, 41 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 42aff33097d0da..57d48b5335a641 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -752,6 +752,43 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_tasks_update, + + TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, + unsigned int boost, unsigned int max_boost), + + TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( int, tasks ) + __field( int, idx ) + __field( unsigned int, boost ) + __field( unsigned int, max_boost ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = cpu; + __entry->tasks = tasks; + __entry->idx = idx; + __entry->boost = boost; + __entry->max_boost = max_boost; + ), + + TP_printk("pid=%d comm=%s " + "cpu=%d tasks=%d idx=%d boost=%u max_boost=%u", + __entry->pid, __entry->comm, + __entry->cpu, __entry->tasks, __entry->idx, + __entry->boost, __entry->max_boost) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 0b77684d9e3af2..48938553c306f2 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -293,6 +293,10 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) tasks = bg->group[idx].tasks; if (tasks == 1 || tasks == 0) schedtune_cpu_update(cpu); + + trace_sched_tune_tasks_update(p, cpu, tasks, idx, + bg->group[idx].boost, bg->boost_max); + } /* From 021fd9f2c1b9bcf5cf0276fae9348908799e17de Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 11:38:30 +0000 Subject: [PATCH 223/420] DEBUG: sched/tune: add tracepoint on boostgroup updates Change-Id: I7a1bb15bd17111885e2db3bdfced8a3d4a9410e5 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 25 +++++++++++++++++++++++++ kernel/sched/tune.c | 8 +++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 57d48b5335a641..8ce0029a72661c 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -789,6 +789,31 @@ TRACE_EVENT(sched_tune_tasks_update, __entry->boost, __entry->max_boost) ); +/* + * Tracepoint for schedtune_boostgroup_update + */ +TRACE_EVENT(sched_tune_boostgroup_update, + + TP_PROTO(int cpu, int variation, int 
max_boost), + + TP_ARGS(cpu, variation, max_boost), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( int, variation ) + __field( int, max_boost ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->variation = variation; + __entry->max_boost = max_boost; + ), + + TP_printk("cpu=%d variation=%d max_boost=%d", + __entry->cpu, __entry->variation, __entry->max_boost) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 48938553c306f2..7d00f6f31de257 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -264,12 +264,18 @@ schedtune_boostgroup_update(int idx, int boost) /* Check if this update increase current max */ if (boost > cur_boost_max && bg->group[idx].tasks) { bg->boost_max = boost; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; } /* Check if this update has decreased current max */ - if (cur_boost_max == old_boost && old_boost > boost) + if (cur_boost_max == old_boost && old_boost > boost) { schedtune_cpu_update(cpu); + trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); + continue; + } + + trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max); } return 0; From 79f7d6331a61fd18fec46363a3cdd892d2fb8337 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:43:37 +0000 Subject: [PATCH 224/420] DEBUG: sched/tune: add tracepoint for task boost signal Change-Id: I545d3bf5569fc41c0fa70f51dff9a19c11d532ee Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 30 ++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 32 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8ce0029a72661c..8247cdaeef0bae 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -814,6 +814,36 @@ TRACE_EVENT(sched_tune_boostgroup_update, __entry->cpu, __entry->variation, __entry->max_boost) ); +/* + * Tracepoint for accounting task boosted utilization + */ +TRACE_EVENT(sched_boost_task, + + TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + + TP_ARGS(tsk, util, margin), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("comm=%s pid=%d util=%lu margin=%lu", + __entry->comm, __entry->pid, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 862caf36a19eaa..3c7818eb1b968e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5109,6 +5109,8 @@ boosted_task_util(struct task_struct *task) unsigned long util = task_util(task); unsigned long margin = schedtune_task_margin(task); + trace_sched_boost_task(task, util, margin); + return util + margin; } From 6aab3e4a9266fb661638e37ad4745573aff6de5c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:47:21 +0000 Subject: [PATCH 225/420] DEBUG: sched/tune: add tracepoint for energy_diff() values Change-Id: Id8fafbd85f6d81248f322e073ee790a7ceec0bf7 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 57 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h 
b/include/trace/events/sched.h index 8247cdaeef0bae..c618c501a1a285 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -844,6 +844,63 @@ TRACE_EVENT(sched_boost_task, __entry->margin) ); +/* + * Tracepoint for accounting sched group energy + */ +TRACE_EVENT(sched_energy_diff, + + TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta, + int nrgb, int nrga, int nrgd, int capb, int capa, int capd, + int nrgn, int nrgp), + + TP_ARGS(tsk, scpu, dcpu, udelta, + nrgb, nrga, nrgd, capb, capa, capd, + nrgn, nrgp), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, scpu ) + __field( int, dcpu ) + __field( int, udelta ) + __field( int, nrgb ) + __field( int, nrga ) + __field( int, nrgd ) + __field( int, capb ) + __field( int, capa ) + __field( int, capd ) + __field( int, nrgn ) + __field( int, nrgp ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->scpu = scpu; + __entry->dcpu = dcpu; + __entry->udelta = udelta; + __entry->nrgb = nrgb; + __entry->nrga = nrga; + __entry->nrgd = nrgd; + __entry->capb = capb; + __entry->capa = capa; + __entry->capd = capd; + __entry->nrgn = nrgn; + __entry->nrgp = nrgp; + ), + + TP_printk("pid=%d comm=%s " + "src_cpu=%d dst_cpu=%d usage_delta=%d " + "nrg_before=%d nrg_after=%d nrg_diff=%d " + "cap_before=%d cap_after=%d cap_delta=%d " + "nrg_delta=%d nrg_payoff=%d", + __entry->pid, __entry->comm, + __entry->scpu, __entry->dcpu, __entry->udelta, + __entry->nrgb, __entry->nrga, __entry->nrgd, + __entry->capb, __entry->capa, __entry->capd, + __entry->nrgn, __entry->nrgp) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c7818eb1b968e..110de4b244c8fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4824,6 +4824,7 @@ static int energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int result; struct energy_env eenv_before = { .util_delta = 0, @@ -4867,7 +4868,15 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - return energy_diff_evaluate(eenv); + result = energy_diff_evaluate(eenv); + + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + + return result; } /* From 6aff3a0945ef26c926c89375908e660492f528c6 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 20 Jan 2016 14:06:05 +0000 Subject: [PATCH 226/420] DEBUG: sched/tune: add tracepoint on P-E space filtering Change-Id: I31dfed67c0486713b88efb75df767329f2802e06 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 35 +++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c618c501a1a285..7c783d2328d346 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -901,6 +901,41 @@ TRACE_EVENT(sched_energy_diff, __entry->nrgn, __entry->nrgp) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_filter, + + TP_PROTO(int nrg_delta, int cap_delta, + int nrg_gain, int cap_gain, + int payoff, int region), + + 
TP_ARGS(nrg_delta, cap_delta, nrg_gain, cap_gain, payoff, region), + + TP_STRUCT__entry( + __field( int, nrg_delta ) + __field( int, cap_delta ) + __field( int, nrg_gain ) + __field( int, cap_gain ) + __field( int, payoff ) + __field( int, region ) + ), + + TP_fast_assign( + __entry->nrg_delta = nrg_delta; + __entry->cap_delta = cap_delta; + __entry->nrg_gain = nrg_gain; + __entry->cap_gain = cap_gain; + __entry->payoff = payoff; + __entry->region = region; + ), + + TP_printk("nrg_delta=%d cap_delta=%d nrg_gain=%d cap_gain=%d payoff=%d region=%d", + __entry->nrg_delta, __entry->cap_delta, + __entry->nrg_gain, __entry->cap_gain, + __entry->payoff, __entry->region) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 7d00f6f31de257..6edfd44de23d67 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -70,6 +70,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, */ payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + + trace_sched_tune_filter( + nrg_delta, cap_delta, + threshold_gains[perf_boost_idx].nrg_gain, + threshold_gains[perf_boost_idx].cap_gain, + payoff, 8); + return payoff; } @@ -84,6 +91,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, */ payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + + trace_sched_tune_filter( + nrg_delta, cap_delta, + threshold_gains[perf_constrain_idx].nrg_gain, + threshold_gains[perf_constrain_idx].cap_gain, + payoff, 6); + return payoff; } @@ -155,12 +169,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_constrain_idx; /* Optimal (O) region */ - if (nrg_delta < 0 && cap_delta > 0) + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); return INT_MAX; + } /* Suboptimal (S) region */ - if (nrg_delta > 0 && cap_delta < 0) + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); return -INT_MAX; + } /* Get task specific perf Boost/Constraints indexes */ rcu_read_lock(); @@ -547,12 +565,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, struct task_struct *task) { /* Optimal (O) region */ - if (nrg_delta < 0 && cap_delta > 0) + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); return INT_MAX; + } /* Suboptimal (S) region */ - if (nrg_delta > 0 && cap_delta < 0) + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); return -INT_MAX; + } return __schedtune_accept_deltas(nrg_delta, cap_delta, perf_boost_idx, perf_constrain_idx); From 688c13aea84e0f726d1ec3f107b53251182dfd80 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 22 Jul 2016 11:35:59 +0100 Subject: [PATCH 227/420] FIXUP: sched: fix build for non-SMP target Currently the build for a single-core (e.g. user-mode) Linux is broken and this configuration is required (at least) to run some network tests. The main issues for the current code support on single-core systems are: 1. {se,rq}::sched_avg is not available nor maintained for !SMP systems This means that load and utilisation signals are NOT available in single core systems. All the EAS code depends on these signals. 2. sched_group_energy is also SMP dependant. 
Again this means that all the EAS setup and preparation code (energy model initialization) has to be properly guarded/disabled for !SMP systems. 3. SchedFreq depends on the utilization signal, which is not available on !SMP systems. 4. SchedTune is useless on unicore systems if SchedFreq is not available. 5. WALT machinery is not required on single-core systems. This patch addresses all these issues by enforcing some constraints for single-core systems: a) WALT, SchedFreq and SchedTune are now dependent on SMP b) The default governor for !SMP systems is INTERACTIVE c) The energy model initialisation/build functions are now built only for SMP systems d) Other minor code re-arrangements and CONFIG_SMP guarding to enable single core builds. Signed-off-by: Patrick Bellasi --- drivers/cpufreq/Kconfig | 1 + include/linux/sched_energy.h | 8 ++++++++ include/trace/events/sched.h | 4 ++++ init/Kconfig | 1 + kernel/sched/Makefile | 4 ++-- kernel/sched/fair.c | 33 +++++++++++++++++++++++++++++---- kernel/sched/sched.h | 3 +-- 7 files changed, 46 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 9ffc79f14e76af..33c9b600e86091 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -220,6 +220,7 @@ config CPU_FREQ_GOV_CONSERVATIVE config CPU_FREQ_GOV_SCHED bool "'sched' cpufreq governor" depends on CPU_FREQ + depends on SMP select CPU_FREQ_GOV_COMMON help 'sched' - this governor scales cpu frequency from the diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h index a3f1627ac609e0..1daf3e1f98a75c 100644 --- a/include/linux/sched_energy.h +++ b/include/linux/sched_energy.h @@ -29,8 +29,16 @@ #define for_each_possible_sd_level(level) \ for (level = 0; level < NR_SD_LEVELS; level++) +#ifdef CONFIG_SMP + extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; void init_sched_energy_costs(void); +#else + +#define init_sched_energy_costs() do { } while (0) + +#endif /* CONFIG_SMP */ + #endif diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7c783d2328d346..795f137a97787c 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -624,6 +624,8 @@ TRACE_EVENT(sched_contrib_scale_f, __entry->cpu_scale_factor) ); +#ifdef CONFIG_SMP + /* * Tracepoint for accounting sched averages for tasks. */ @@ -936,6 +938,8 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +#endif /* CONFIG_SMP */ + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/init/Kconfig b/init/Kconfig index 7c9e6210c3883b..a362ebfe4135fd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1249,6 +1249,7 @@ config SCHED_AUTOGROUP config SCHED_TUNE bool "Boosting for CFS tasks (EXPERIMENTAL)" + depends on SMP help This option enables the system-wide support for task boosting.
When this support is enabled a new sysctl interface is exposed to diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a17c2a58912e6d..dbf2b26e1c2650 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,9 +12,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 110de4b244c8fc..9773a8501c571e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3981,8 +3981,14 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static bool cpu_overutilized(int cpu); static inline unsigned long boosted_cpu_util(int cpu); +#else +#define boosted_cpu_util(cpu) cpu_util(cpu) +#endif +#ifdef CONFIG_SMP static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -3995,8 +4001,7 @@ static void update_capacity_of(int cpu) req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } - -static bool cpu_overutilized(int cpu); +#endif /* * The enqueue_task method is called before nr_running is @@ -4008,8 +4013,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; int task_wakeup = flags & ENQUEUE_WAKEUP; +#endif for_each_sched_entity(se) { if (se->on_rq) @@ -4041,8 +4048,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) add_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4059,6 +4070,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -4116,8 +4129,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) sub_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { schedtune_dequeue_task(p, cpu_of(rq)); /* @@ -4135,6 +4152,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) set_cfs_cpu_capacity(cpu_of(rq), false, 0); } } + +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -5520,6 +5540,8 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } +#else +#define task_fits_max(p, cpu) true #endif /* CONFIG_SMP */ static unsigned long @@ -8565,10 +8587,13 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (numabalancing_enabled) task_tick_numa(rq, curr); +#ifdef CONFIG_SMP if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; rq->misfit_task = !task_fits_max(curr, rq->cpu); +#endif + } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a69eb5a1324731..5388278f048ff3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1218,6 +1218,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP +extern void 
init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); @@ -1299,8 +1300,6 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); - static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; From 21218cc439baa267079935b371f414fe046018a4 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Mon, 1 Aug 2016 11:34:05 +0100 Subject: [PATCH 228/420] sched/cpufreq_sched: Consolidated update Contains: sched/cpufreq_sched: use shorter throttle for raising OPP Avoid cases where a brief drop in load causes a change to a low OPP for the full throttle period. Use a shorter throttle period for raising OPP than for lowering OPP. sched-freq: Fix handling of max/min frequency This reverts commit 9726142608f5b3bf5df4280243c9d324e692a510. Change-Id: Ia78095354f7ad9492f00deb509a2b45112361eda sched/cpufreq: Increasing throttle_down_nsec to 50ms Change-Id: I2d8969cf2a64fa719b9dd86f43f9dd14b1ff84fe sched-freq: make throttle times tunable Change-Id: I127879645367425b273441d7f0306bb15d5633cb Signed-off-by: Srinath Sridharan Signed-off-by: Todd Kjos Signed-off-by: Juri Lelli --- drivers/cpufreq/Kconfig | 2 +- kernel/sched/cpufreq_sched.c | 175 +++++++++++++++++++++++++++++++---- 2 files changed, 160 insertions(+), 17 deletions(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 33c9b600e86091..a9fdec8597f747 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -114,7 +114,7 @@ config CPU_FREQ_DEFAULT_GOV_INTERACTIVE config CPU_FREQ_DEFAULT_GOV_SCHED bool "sched" - select CPU_FREQ_GOV_SCHED + select CPU_FREQ_GOV_INTERACTIVE help Use the CPUfreq governor 'sched' as default. This scales cpu frequency using CPU utilization estimates from the diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index c72537c0c50019..e5ca1e3fba8e74 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -19,7 +19,8 @@ #include "sched.h" -#define THROTTLE_NSEC 50000000 /* 50ms default */ +#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */ +#define THROTTLE_UP_NSEC 500000 /* 500us default */ struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; static bool __read_mostly cpufreq_driver_slow; @@ -33,8 +34,10 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); /** * gov_data - per-policy data internal to the governor - * @throttle: next throttling period expiry. Derived from throttle_nsec - * @throttle_nsec: throttle period length in nanoseconds + * @up_throttle: next throttling period expiry if increasing OPP + * @down_throttle: next throttling period expiry if decreasing OPP + * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP + * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP * @task: worker thread for dvfs transition that may block/sleep * @irq_work: callback used to wake up worker thread * @requested_freq: last frequency requested by the sched governor @@ -48,11 +51,14 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); * call down_write(policy->rwsem). 
*/ struct gov_data { - ktime_t throttle; - unsigned int throttle_nsec; + ktime_t up_throttle; + ktime_t down_throttle; + unsigned int up_throttle_nsec; + unsigned int down_throttle_nsec; struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; + int max; }; static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, @@ -66,25 +72,29 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); + gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); up_write(&policy->rwsem); } -static bool finish_last_request(struct gov_data *gd) +static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq) { ktime_t now = ktime_get(); - if (ktime_after(now, gd->throttle)) + ktime_t throttle = gd->requested_freq < cur_freq ? + gd->down_throttle : gd->up_throttle; + + if (ktime_after(now, throttle)) return false; while (1) { - int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + int usec_left = ktime_to_ns(ktime_sub(throttle, now)); usec_left /= NSEC_PER_USEC; trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); - if (ktime_after(now, gd->throttle)) + if (ktime_after(now, throttle)) return true; } } @@ -130,7 +140,7 @@ static int cpufreq_sched_thread(void *data) * if the frequency thread sleeps while waiting to be * unthrottled, start over to check for a newer request */ - if (finish_last_request(gd)) + if (finish_last_request(gd, policy->cur)) continue; last_request = new_request; cpufreq_sched_try_driver_target(policy, new_request); @@ -185,16 +195,21 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT; if (cpufreq_frequency_table_target(policy, policy->freq_table, freq_new, CPUFREQ_RELATION_L, &index_new)) goto out; freq_new = policy->freq_table[index_new].frequency; + if (freq_new > policy->max) + freq_new = policy->max; + + if (freq_new < policy->min) + freq_new = policy->min; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, gd->requested_freq); - if (freq_new == gd->requested_freq) goto out; @@ -248,10 +263,17 @@ static inline void clear_sched_freq(void) static_key_slow_dec(&__sched_freq); } +static struct attribute_group sched_attr_group_gov_pol; +static struct attribute_group *get_sysfs_attr(void) +{ + return &sched_attr_group_gov_pol; +} + static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) { struct gov_data *gd; int cpu; + int rc; for_each_cpu(cpu, policy->cpus) memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, @@ -261,13 +283,22 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (!gd) return -ENOMEM; - gd->throttle_nsec = policy->cpuinfo.transition_latency ? + gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? 
policy->cpuinfo.transition_latency : - THROTTLE_NSEC; + THROTTLE_UP_NSEC; + gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; pr_debug("%s: throttle threshold = %u [ns]\n", - __func__, gd->throttle_nsec); + __func__, gd->up_throttle_nsec); policy->governor_data = gd; + gd->max = policy->max; + + rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + if (rc) { + pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); + goto err; + } + if (cpufreq_driver_is_slow()) { cpufreq_driver_slow = true; gd->task = kthread_create(cpufreq_sched_thread, policy, @@ -304,6 +335,8 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } + sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + policy->governor_data = NULL; kfree(gd); @@ -320,6 +353,32 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy) return 0; } +static void cpufreq_sched_limits(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + + pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", + policy->cpu, policy->min, policy->max, + policy->cur); + + if (!down_write_trylock(&policy->rwsem)) + return; + /* + * Need to keep track of highest max frequency for + * capacity calculations + */ + gd = policy->governor_data; + if (gd->max < policy->max) + gd->max = policy->max; + + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + + up_write(&policy->rwsem); +} + static int cpufreq_sched_stop(struct cpufreq_policy *policy) { int cpu; @@ -343,11 +402,95 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy, case CPUFREQ_GOV_STOP: return cpufreq_sched_stop(policy); case CPUFREQ_GOV_LIMITS: + cpufreq_sched_limits(policy); break; } return 0; } +/* Tunables */ +static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->up_throttle_nsec); +} + +static ssize_t store_up_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->up_throttle_nsec = val; + return count; +} + +static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->down_throttle_nsec); +} + +static ssize_t store_down_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->down_throttle_nsec = val; + return count; +} + +/* + * Create show/store routines + * - sys: One governor instance for complete SYSTEM + * - pol: One governor instance per struct cpufreq_policy + */ +#define show_gov_pol_sys(file_name) \ +static ssize_t show_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, char *buf) \ +{ \ + return show_##file_name(policy->governor_data, buf); \ +} + +#define store_gov_pol_sys(file_name) \ +static ssize_t store_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, const char *buf, size_t count) \ +{ \ + return store_##file_name(policy->governor_data, buf, count); \ +} + +#define gov_pol_attr_rw(_name) \ + static struct freq_attr _name##_gov_pol = \ + __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) + +#define show_store_gov_pol_sys(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name) +#define tunable_handlers(file_name) \ + 
show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name); \ + gov_pol_attr_rw(file_name) + +tunable_handlers(down_throttle_nsec); +tunable_handlers(up_throttle_nsec); + +/* Per policy governor instance */ +static struct attribute *sched_attributes_gov_pol[] = { + &up_throttle_nsec_gov_pol.attr, + &down_throttle_nsec_gov_pol.attr, + NULL, +}; + +static struct attribute_group sched_attr_group_gov_pol = { + .attrs = sched_attributes_gov_pol, + .name = "sched", +}; + #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED static #endif From 2cfa35b2a3b3d8a6493a8b6873e35f22878fc311 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 14 Jul 2016 09:57:29 +0100 Subject: [PATCH 229/420] sched: EAS: take cstate into account when selecting idle core Introduce a new sysctl for this option, 'sched_cstate_aware'. When this is enabled, select_idle_sibling in CFS is modified to choose the idle CPU in the sibling group which has the lowest idle state index - idle state indexes are assumed to increase as sleep depth and hence wakeup latency increase. In this way, we attempt to minimise wakeup latency when an idle CPU is required. Signed-off-by: Srinath Sridharan Includes: sched: EAS: fix select_idle_sibling when sysctl_sched_cstate_aware is enabled, best_idle cpu will not be chosen in the original flow because it will goto done directly Bug: 30107557 Change-Id: Ie09c2e3960cafbb976f8d472747faefab3b4d6ac Signed-off-by: martin_liu --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 55 +++++++++++++++++++++++++++--------- kernel/sysctl.c | 7 +++++ 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 9cad78e74e2c04..2bb557ba883ab8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9773a8501c571e..b1f1dcc1d07573 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,6 +51,7 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_cstate_aware = 1; /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5294,15 +5295,20 @@ static int select_idle_sibling(struct task_struct *p, int target) struct sched_domain *sd; struct sched_group *sg; int i = task_cpu(p); + int best_idle = -1; + int best_idle_cstate = -1; + int best_idle_capacity = INT_MAX; - if (idle_cpu(target)) - return target; + if (!sysctl_sched_cstate_aware) { + if (idle_cpu(target)) + return target; - /* - * If the prevous cpu is cache affine and idle, don't be stupid. - */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + /* + * If the prevous cpu is cache affine and idle, don't be stupid. + */ + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; + } /* * Otherwise, iterate the domains and find an elegible idle cpu. 
@@ -5315,18 +5321,41 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + if (sysctl_sched_cstate_aware) { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + unsigned long new_usage = boosted_task_util(p); + unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) + goto next; + + if (i == target && new_usage <= capacity_curr_of(target)) + return target; + + if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { + best_idle = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + } else { + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } - target = cpumask_first_and(sched_group_cpus(sg), + target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); - goto done; + goto done; + } next: sg = sg->next; } while (sg != sd->groups); } + if (best_idle > 0) + target = best_idle; + done: return target; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5630f4282f9552..56c3dadd4b9097 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -305,6 +305,13 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_cstate_aware", + .data = &sysctl_sched_cstate_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity, From 422453109816416a64870d080db5f67b679155e0 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 29 Jul 2016 14:04:11 +0100 Subject: [PATCH 230/420] sched/fair: add tunable to force selection at cpu granularity EAS assumes that clusters with smaller capacity cores are more energy-efficient. This may not be true on non-big-little devices, so EAS can make incorrect cluster selections when finding a CPU to wake. The "sched_is_big_little" hint can be used to cause a cpu-based selection instead of cluster-based selection. This change incorporates the addition of the sync hint enable patch EAS did not honour synchronous wakeup hints, a new sysctl is created to ask EAS to use this information when selecting a CPU. The control is called "sched_sync_hint_enable". 
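For orientation, a rough sketch of how the two knobs steer the wake-up path follows (simplified pseudo-code, not the exact implementation; see the fair.c hunks below):

    /*
     * energy_aware_wake_cpu(p, prev_cpu, sync):
     *
     *   if (sysctl_sched_sync_hint_enable && sync)
     *           return smp_processor_id();    /* honour the sync hint */
     *
     *   if (sysctl_sched_is_big_little)
     *           walk the sched groups and prefer the smallest-capacity
     *           cluster that still fits the task (classic EAS walk);
     *   else
     *           find_best_target(p): pick an individual cpu, favouring
     *           busy cpus with spare capacity at the current OPP, then
     *           idle cpus, then cpus that would need a higher OPP.
     */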
Also contains: EAS: sched/fair: for SMP bias toward idle core with capacity For SMP devices, on wakeup bias towards idle cores that have capacity vs busy devices that need a higher OPP eas: favor idle cpus for boosted tasks BUG: 29533997 BUG: 29512132 Change-Id: I0cc9a1b1b88fb52916f18bf2d25715bdc3634f9c Signed-off-by: Juri Lelli Signed-off-by: Srinath Sridharan eas/sched/fair: Favoring busy cpus with low OPPs BUG: 29533997 BUG: 29512132 Change-Id: I9305b3239698d64278db715a2e277ea0bb4ece79 Signed-off-by: Juri Lelli --- include/linux/sched/sysctl.h | 2 + kernel/sched/fair.c | 189 +++++++++++++++++++++++++++-------- kernel/sysctl.c | 14 +++ 3 files changed, 166 insertions(+), 39 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 2bb557ba883ab8..53068acb7378ac 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,8 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_is_big_little; +extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1f1dcc1d07573..dd03f85d1bb09d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,7 +51,10 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_is_big_little = 0; +unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; + /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5360,7 +5363,97 @@ static int select_idle_sibling(struct task_struct *p, int target) return target; } -static int energy_aware_wake_cpu(struct task_struct *p, int target) +static inline int find_best_target(struct task_struct *p) +{ + int i, boosted; + int target_cpu = -1; + int target_capacity = 0; + int backup_capacity = 0; + int idle_cpu = -1; + int best_idle_cstate = INT_MAX; + int backup_cpu = -1; + unsigned long task_util_boosted, new_util; + + /* + * Favor 1) busy cpu with most capacity at current OPP + * 2) idle_cpu with capacity at current OPP + * 3) busy cpu with capacity at higher OPP + */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boosted = schedtune_task_boost(p); +#else + boosted = 0; +#endif + task_util_boosted = boosted_task_util(p); + for_each_cpu(i, tsk_cpus_allowed(p)) { + int cur_capacity = capacity_curr_of(i); + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util_boosted; + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + + if (new_util > capacity_orig_of(i)) + continue; + + /* + * For boosted tasks we favor idle cpus unconditionally to + * improve latency. 
+ */ + if (idle_idx >= 0 && boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + continue; + } + + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + if (target_capacity == 0 || + target_capacity > cur_capacity) { + /* busy CPU with most capacity at current OPP */ + target_cpu = i; + target_capacity = cur_capacity; + } + } else if (!boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + } + } else if (backup_capacity == 0 || + backup_capacity > cur_capacity) { + /* first busy CPU with capacity at higher OPP */ + backup_capacity = cur_capacity; + backup_cpu = i; + } + } + + if (!boosted && target_cpu < 0) { + target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; + } + + if (boosted && idle_cpu >= 0) + target_cpu = idle_cpu; + return target_cpu; +} + +static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) { struct sched_domain *sd; struct sched_group *sg, *sg_target; @@ -5368,6 +5461,14 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) int target_cpu = task_cpu(p); int i; + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + cpumask_t search_cpus; + cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); + if (cpumask_test_cpu(cpu, &search_cpus)) + return cpu; + } + sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); if (!sd) @@ -5376,50 +5477,60 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) sg = sd->groups; sg_target = sg; - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); + if (sysctl_sched_is_big_little) { /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. + * Find group with sufficient capacity. We only get here if no cpu is + * overutilized. We may end up overutilizing a cpu by adding the task, + * but that should not be any worse than select_idle_sibling(). + * load_balance() should sort it out later as we get above the tipping + * point. */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); + do { + /* Assuming all cpus are the same in group */ + int max_cap_cpu = group_first_cpu(sg); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - int new_util = cpu_util(i) + boosted_task_util(p); + /* + * Assume smaller max capacity means more energy-efficient. + * Ideally we should query the energy model for the right + * answer but it easily ends up in an exhaustive search. 
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); - if (new_util > capacity_orig_of(i)) - continue; + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + boosted_task_util(p); - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; - } + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + } else { + /* + * Find a cpu with sufficient capacity + */ + int tmp_target = find_best_target(p); + if (tmp_target >= 0) + target_cpu = tmp_target; } if (target_cpu != task_cpu(p)) { @@ -5499,7 +5610,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (!sd) { if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu); + new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 56c3dadd4b9097..a12d54436942cc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -305,6 +305,20 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_is_big_little", + .data = &sysctl_sched_is_big_little, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_sync_hint_enable", + .data = &sysctl_sched_sync_hint_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, From e139cedea92707052fdb95e6f27e64be7022a508 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 11 Mar 2016 16:44:16 -0800 Subject: [PATCH 231/420] sched/fair: add tunable to set initial task load The choice of initial task load upon fork has a large influence on CPU and OPP selection when scheduler-driven DVFS is in use. Make this tuneable by adding a new sysctl "sched_initial_task_util". If the sched governor is not used, the default remains at SCHED_LOAD_SCALE Otherwise, the value from the sysctl is used. This defaults to 0. 
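For reference, a minimal user-space sketch of how the new knob could be set at run time (the path follows the kern_table entry added below; the value 128 is only an example, and root privileges are required):

    /*
     * Illustrative only: adjust the new tunable from user space.  The value
     * is on the same 1024-based scale as util_avg and only takes effect
     * when the scheduler-driven ("sched") cpufreq governor is active.
     */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/sched_initial_task_util", "w");

            if (!f)
                    return 1;
            fprintf(f, "128\n");  /* start newly forked tasks at ~12% of a CPU */
            return fclose(f) ? 1 : 0;
    }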
Signed-off-by: "Todd Kjos " --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 5 ++++- kernel/sysctl.c | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 53068acb7378ac..4321305cb84c34 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,6 +41,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; +extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd03f85d1bb09d..a70c179bb465cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -53,6 +53,7 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; +unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; /* @@ -686,7 +687,9 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_avg = sched_freq() ? + sysctl_sched_initial_task_util : + scale_load_down(SCHED_LOAD_SCALE); sa->util_sum = sa->util_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a12d54436942cc..84be91491c0c7e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -319,6 +319,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_initial_task_util", + .data = &sysctl_sched_initial_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, From 809222a0e5d30554f657ad5da5bd63913aed3e6b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 16:39:27 +0100 Subject: [PATCH 232/420] FIX: sched/tune: update usage of boosted task utilisation on CPU selection A boosted task needs to be scheduled on a CPU which can grant a minimum capacity which is higher than its utilization. However, a task can be allocated on a CPU which already provides an utilization which is higher than the task boosted utilization itself. Moreover, with the previous approach a task 100% boosted is not fitting any CPU. This patch makes use of the boosted task utilization just as a threashold which defines the minimum capacity should be available on a CPU to host that task. 
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a70c179bb465cc..f8e5668827643e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5462,6 +5462,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) struct sched_group *sg, *sg_target; int target_max_cap = INT_MAX; int target_cpu = task_cpu(p); + unsigned long task_util_boosted, new_util; int i; if (sysctl_sched_sync_hint_enable && sync) { @@ -5505,6 +5506,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) } } while (sg = sg->next, sg != sd->groups); + task_util_boosted = boosted_task_util(p); /* Find cpu with sufficient capacity */ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { /* @@ -5512,8 +5514,13 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - int new_util = cpu_util(i) + boosted_task_util(p); + new_util = cpu_util(i) + task_util_boosted; + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ if (new_util > capacity_orig_of(i)) continue; From 76950b093e1b987cdd9f0d27cf079aad93474f77 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:45:57 +0100 Subject: [PATCH 233/420] FIX: sched/tune: move schedtune_nornalize_energy into fair.c The energy normalization function is required to get the proper values for the P-E space filtering function to work. That normalization is part of the hot wakeup path and currently implemented with a function call. Moving the normalization function into fair.c allows the compiler to further optimize that code by reducing overheads in the wakeup hot path. Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 121 ++++++++++++++++++++++++++++---------------- kernel/sched/tune.c | 42 +-------------- kernel/sched/tune.h | 13 ++++- 3 files changed, 91 insertions(+), 85 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f8e5668827643e..f24b24bdb0d19f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4801,44 +4801,6 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } -#ifdef CONFIG_SCHED_TUNE -static int energy_diff_evaluate(struct energy_env *eenv) -{ - unsigned int boost; - int nrg_delta; - - /* Return energy diff when boost margin is 0 */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(eenv->task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif - if (boost == 0) - return eenv->nrg.diff; - - /* Compute normalized energy diff */ - nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); - eenv->nrg.delta = nrg_delta; - - eenv->payoff = schedtune_accept_deltas( - eenv->nrg.delta, - eenv->cap.delta, - eenv->task); - - /* - * When SchedTune is enabled, the energy_diff() function will return - * the computed energy payoff value. 
Since the energy_diff() return - * value is expected to be negative by its callers, this evaluation - * function return a negative value each time the evaluation return a - * positive payoff, which is the condition for the acceptance of - * a scheduling decision - */ - return -eenv->payoff; -} -#else /* CONFIG_SCHED_TUNE */ -#define energy_diff_evaluate(eenv) eenv->nrg.diff -#endif - /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilisation amount, source, and @@ -4846,12 +4808,11 @@ static int energy_diff_evaluate(struct energy_env *eenv) * utilization is removed from or added to the system (e.g. task wake-up). If * both are specified, the utilization is migrated. */ -static int energy_diff(struct energy_env *eenv) +static inline int __energy_diff(struct energy_env *eenv) { struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; - int result; struct energy_env eenv_before = { .util_delta = 0, @@ -4895,16 +4856,90 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - result = energy_diff_evaluate(eenv); - trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); - return result; + return eenv->nrg.diff; +} + +#ifdef CONFIG_SCHED_TUNE + +struct target_nrg schedtune_target_nrg; + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +static inline int +normalize_energy(int energy_diff) +{ + u32 normalized_nrg; +#ifdef CONFIG_SCHED_DEBUG + int max_delta; + + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; +} + +static inline int +energy_diff(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Conpute "absolute" energy diff */ + __energy_diff(eenv); + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. 
Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; } +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff(eenv) __energy_diff(eenv) +#endif /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 6edfd44de23d67..d5a003959f8d02 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -3,24 +3,17 @@ #include #include #include -#include #include #include #include #include "sched.h" +#include "tune.h" unsigned int sysctl_sched_cfs_boost __read_mostly; -/* - * System energy normalization constants - */ -static struct target_nrg { - unsigned long min_power; - unsigned long max_power; - struct reciprocal_value rdiv; -} schedtune_target_nrg; +extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ static int perf_boost_idx; @@ -603,37 +596,6 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, return 0; } -/* - * System energy normalization - * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], - * corresponding to the specified energy variation. - */ -int -schedtune_normalize_energy(int energy_diff) -{ - u32 normalized_nrg; - int max_delta; - -#ifdef CONFIG_SCHED_DEBUG - /* Check for boundaries */ - max_delta = schedtune_target_nrg.max_power; - max_delta -= schedtune_target_nrg.min_power; - WARN_ON(abs(energy_diff) >= max_delta); -#endif - - /* Do scaling using positive numbers to increase the range */ - normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; - - /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_LOAD_SHIFT; - - /* Normalize on max energy for target platform */ - normalized_nrg = reciprocal_divide( - normalized_nrg, schedtune_target_nrg.rdiv); - - return (energy_diff < 0) ? 
-normalized_nrg : normalized_nrg; -} - #ifdef CONFIG_SCHED_DEBUG static void schedtune_test_nrg(unsigned long delta_pwr) diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index da1f7b288aa09d..993c7d1a2888b7 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -1,6 +1,17 @@ #ifdef CONFIG_SCHED_TUNE +#include + +/* + * System energy normalization constants + */ +struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +}; + #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); @@ -9,7 +20,6 @@ int schedtune_task_boost(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); -int schedtune_normalize_energy(int energy); int schedtune_accept_deltas(int nrg_delta, int cap_delta, struct task_struct *task); @@ -25,7 +35,6 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) -#define schedtune_normalize_energy(energy) energy #define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta #endif /* CONFIG_SCHED_TUNE */ From 9b24a3ded592a437189b8f2bf678413264211874 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 28 Jul 2016 17:28:55 +0100 Subject: [PATCH 234/420] sched/tune: Add support for negative boost values Change-Id: I164ee04ba98c3a776605f18cb65ee61b3e917939 Contains also: eas/stune: schedtune cpu boost_max must be non-negative. This is to avoid under-accounting cpu capacity which may cause task stacking and frequency spikes. Change-Id: Ie1c1cbd52a6edb77b4c15a830030aa748dff6f29 --- include/trace/events/sched.h | 20 +++++++++---------- kernel/sched/fair.c | 37 ++++++++++++++++++++---------------- kernel/sched/tune.c | 25 +++++++++++++++--------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 795f137a97787c..32668b096578c5 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -732,14 +732,14 @@ TRACE_EVENT(sched_tune_config, */ TRACE_EVENT(sched_boost_cpu, - TP_PROTO(int cpu, unsigned long util, unsigned long margin), + TP_PROTO(int cpu, unsigned long util, long margin), TP_ARGS(cpu, util, margin), TP_STRUCT__entry( __field( int, cpu ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field(long, margin ) ), TP_fast_assign( @@ -748,7 +748,7 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin = margin; ), - TP_printk("cpu=%d util=%lu margin=%lu", + TP_printk("cpu=%d util=%lu margin=%ld", __entry->cpu, __entry->util, __entry->margin) @@ -760,7 +760,7 @@ TRACE_EVENT(sched_boost_cpu, TRACE_EVENT(sched_tune_tasks_update, TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, - unsigned int boost, unsigned int max_boost), + int boost, int max_boost), TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), @@ -770,8 +770,8 @@ TRACE_EVENT(sched_tune_tasks_update, __field( int, cpu ) __field( int, tasks ) __field( int, idx ) - __field( unsigned int, boost ) - __field( unsigned int, max_boost ) + __field( int, boost ) + __field( int, max_boost ) ), TP_fast_assign( @@ -785,7 +785,7 @@ TRACE_EVENT(sched_tune_tasks_update, ), TP_printk("pid=%d comm=%s " - "cpu=%d tasks=%d idx=%d boost=%u max_boost=%u", + "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d", __entry->pid, __entry->comm, __entry->cpu, __entry->tasks, __entry->idx, __entry->boost, __entry->max_boost) @@ -821,7 +821,7 @@ 
TRACE_EVENT(sched_tune_boostgroup_update, */ TRACE_EVENT(sched_boost_task, - TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + TP_PROTO(struct task_struct *tsk, unsigned long util, long margin), TP_ARGS(tsk, util, margin), @@ -829,7 +829,7 @@ TRACE_EVENT(sched_boost_task, __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field( long, margin ) ), @@ -840,7 +840,7 @@ TRACE_EVENT(sched_boost_task, __entry->margin = margin; ), - TP_printk("comm=%s pid=%d util=%lu margin=%lu", + TP_printk("comm=%s pid=%d util=%lu margin=%ld", __entry->comm, __entry->pid, __entry->util, __entry->margin) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f24b24bdb0d19f..78c8c74a1074d9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5076,22 +5076,25 @@ static bool cpu_overutilized(int cpu) #ifdef CONFIG_SCHED_TUNE -static unsigned long -schedtune_margin(unsigned long signal, unsigned long boost) +static long +schedtune_margin(unsigned long signal, long boost) { - unsigned long long margin = 0; + long long margin = 0; /* * Signal proportional compensation (SPC) * * The Boost (B) value is used to compute a Margin (M) which is * proportional to the complement of the original Signal (S): - * M = B * (SCHED_LOAD_SCALE - S) + * M = B * (SCHED_LOAD_SCALE - S), if B is positive + * M = B * S, if B is negative * The obtained M could be used by the caller to "boost" S. */ - margin = SCHED_LOAD_SCALE - signal; - margin *= boost; - + if (boost >= 0) { + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + } else + margin = -signal * boost; /* * Fast integer division by constant: * Constant : (C) = 100 @@ -5107,13 +5110,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) margin *= 1311; margin >>= 17; + if (boost < 0) + margin *= -1; return margin; } -static inline unsigned int +static inline int schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost; + int boost; #ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_cpu_boost(cpu); @@ -5126,12 +5131,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } -static inline unsigned long +static inline long schedtune_task_margin(struct task_struct *task) { - unsigned int boost; + int boost; unsigned long util; - unsigned long margin; + long margin; #ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_task_boost(task); @@ -5149,13 +5154,13 @@ schedtune_task_margin(struct task_struct *task) #else /* CONFIG_SCHED_TUNE */ -static inline unsigned int +static inline int schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } -static inline unsigned int +static inline int schedtune_task_margin(struct task_struct *task) { return 0; @@ -5167,7 +5172,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util, cpu); + long margin = schedtune_cpu_margin(util, cpu); trace_sched_boost_cpu(cpu, util, margin); @@ -5178,7 +5183,7 @@ static inline unsigned long boosted_task_util(struct task_struct *task) { unsigned long util = task_util(task); - unsigned long margin = schedtune_task_margin(task); + long margin = schedtune_task_margin(task); trace_sched_boost_task(task, util, margin); diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d5a003959f8d02..8ad4c43988c6ee 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -213,10 +213,11 @@ static struct schedtune 
*allocated_group[BOOSTGROUPS_COUNT] = { */ struct boost_groups { /* Maximum boost value for all RUNNABLE tasks on a CPU */ - unsigned boost_max; + bool idle; + int boost_max; struct { /* The boost for tasks on that boost group */ - unsigned boost; + int boost; /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; @@ -229,7 +230,7 @@ static void schedtune_cpu_update(int cpu) { struct boost_groups *bg; - unsigned boost_max; + int boost_max; int idx; bg = &per_cpu(cpu_boost_groups, cpu); @@ -243,9 +244,13 @@ schedtune_cpu_update(int cpu) */ if (bg->group[idx].tasks == 0) continue; + boost_max = max(boost_max, bg->group[idx].boost); } - + /* Ensures boost_max is non-negative when all cgroup boost values + * are neagtive. Avoids under-accounting of cpu capacity which may cause + * task stacking and frequency spikes.*/ + boost_max = max(boost_max, 0); bg->boost_max = boost_max; } @@ -391,7 +396,7 @@ int schedtune_task_boost(struct task_struct *p) return task_boost; } -static u64 +static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct schedtune *st = css_st(css); @@ -401,11 +406,13 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft) static int boost_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 boost) + s64 boost) { struct schedtune *st = css_st(css); + unsigned threshold_idx; + int boost_pct; - if (boost < 0 || boost > 100) + if (boost < -100 || boost > 100) return -EINVAL; st->boost = boost; @@ -427,8 +434,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, static struct cftype files[] = { { .name = "boost", - .read_u64 = boost_read, - .write_u64 = boost_write, + .read_s64 = boost_read, + .write_s64 = boost_write, }, { } /* terminate */ }; From 64b9bac135bbb8a645300b497909b3489b3c9a08 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 17:38:25 +0100 Subject: [PATCH 235/420] FIXUP: sched/tune: fix payoff calculation for boost region The definition of the acceptance regions as well as the translation of these regions into a payoff value was both wrong which turned out in: a) a wrong definition of payoff for the performance boost region b) a correct "by chance" definition of the payoff for the performance constraint region (i.e. two sign errors together fixing the formula) This patch provides a better description of the cut regions as well as a fixed version of the payoff computations, which are now reduced to a single formula usable for both cases. 
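As a worked example of the single payoff formula described above (for illustration only; the gain values below are made up and do not come from the thresholds table):

    /* payoff.c - toy evaluation of the unified payoff computation */
    #include <stdio.h>

    /* Positive payoff means the schedule candidate is accepted; the same
     * expression covers both the boost (B) and constraint (C) regions. */
    static int payoff(int nrg_delta, int cap_delta, int nrg_gain, int cap_gain)
    {
        return cap_delta * nrg_gain - nrg_delta * cap_gain;
    }

    int main(void)
    {
        /* B region: pay some energy (+10) for a capacity gain (+40). */
        printf("boost:      %d\n", payoff(10, 40, 5, 1));   /* 190 -> accept */
        /* C region: give up capacity (-40) to save energy (-10). */
        printf("constraint: %d\n", payoff(-10, -40, 1, 5)); /*  10 -> accept */
        return 0;
    }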
Reported-by: Leo Yan Reviewed-by: Leo Yan Signed-off-by: Leo Yan Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 77 +++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 8ad4c43988c6ee..e483cf33ac6a9f 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -51,50 +51,51 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_boost_idx, int perf_constrain_idx) { int payoff = -INT_MAX; + int gain_idx = -1; /* Performance Boost (B) region */ - if (nrg_delta > 0 && cap_delta > 0) { - /* - * Evaluate "Performance Boost" vs "Energy Increase" - * payoff criteria: - * cap_delta / nrg_delta < cap_gain / nrg_gain - * which is: - * nrg_delta * cap_gain > cap_delta * nrg_gain - */ - payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; - payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; - - trace_sched_tune_filter( - nrg_delta, cap_delta, - threshold_gains[perf_boost_idx].nrg_gain, - threshold_gains[perf_boost_idx].cap_gain, - payoff, 8); - - return payoff; - } - + if (nrg_delta >= 0 && cap_delta > 0) + gain_idx = perf_boost_idx; /* Performance Constraint (C) region */ - if (nrg_delta < 0 && cap_delta < 0) { - /* - * Evaluate "Performance Boost" vs "Energy Increase" - * payoff criteria: - * cap_delta / nrg_delta > cap_gain / nrg_gain - * which is: - * cap_delta * nrg_gain > nrg_delta * cap_gain - */ - payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; - payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; - - trace_sched_tune_filter( - nrg_delta, cap_delta, - threshold_gains[perf_constrain_idx].nrg_gain, - threshold_gains[perf_constrain_idx].cap_gain, - payoff, 6); + else if (nrg_delta < 0 && cap_delta <= 0) + gain_idx = perf_constrain_idx; + /* Default: reject schedule candidate */ + if (gain_idx == -1) return payoff; - } - /* Default: reject schedule candidate */ + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * + * - Performance Boost (B) region + * + * Condition: nrg_delta > 0 && cap_delta > 0 + * Payoff criteria: + * cap_gain / nrg_gain < cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since both nrg_gain and nrg_delta are positive, the + * inequality does not change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * - Performance Constraint (C) region + * + * Condition: nrg_delta < 0 && cap_delta < 0 + * payoff criteria: + * cap_gain / nrg_gain > cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since nrg_gain > 0 while nrg_delta < 0, the + * inequality change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * This means that, in case of same positive defined {cap,nrg}_gain + * for both the B and C regions, we can use the same payoff formula + * where a positive value represents the accept condition. + */ + payoff = cap_delta * threshold_gains[gain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain; + return payoff; } From b12d081480a16c47025d969fc79463a2fd481eda Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 17:40:49 +0100 Subject: [PATCH 236/420] FIXUP: sched/tune: fix compilation error on !CONFIG_CGROUP_SCHEDTUNE When SchedTune is configured without CGroups support, the schedtune_accept_deltas() function is not declared. 
Since schedule candidate filtering is required also when global boosting mode is in use, this patch move the declaration of that function to make it dependant just on CONFIG_SCHED_TUNE. Signed-off-by: Patrick Bellasi --- kernel/sched/tune.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 993c7d1a2888b7..8dedbba6b2669b 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -20,9 +20,6 @@ int schedtune_task_boost(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); -int schedtune_accept_deltas(int nrg_delta, int cap_delta, - struct task_struct *task); - #else /* CONFIG_CGROUP_SCHEDTUNE */ #define schedtune_enqueue_task(task, cpu) do { } while (0) @@ -30,6 +27,9 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #endif /* CONFIG_CGROUP_SCHEDTUNE */ +int schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task); + #else /* CONFIG_SCHED_TUNE */ #define schedtune_enqueue_task(task, cpu) do { } while (0) From b36b6784a92b2679e9d6057a90d64da92afd01ad Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 17:42:36 +0100 Subject: [PATCH 237/420] sched/{fair,tune}: simplify fair.c code The usage of conditional compiled code is discouraged in fair.c. This patch clean up a bit fair.c by moving schedtune_{cpu.task}_boost definitions into tune.h. Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 21 +++------------------ kernel/sched/tune.h | 6 ++++++ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78c8c74a1074d9..a1a6dc3ca35cea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4903,18 +4903,13 @@ normalize_energy(int energy_diff) static inline int energy_diff(struct energy_env *eenv) { - unsigned int boost; + int boost = schedtune_task_boost(eenv->task); int nrg_delta; /* Conpute "absolute" energy diff */ __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(eenv->task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return eenv->nrg.diff; @@ -5118,13 +5113,8 @@ schedtune_margin(unsigned long signal, long boost) static inline int schedtune_cpu_margin(unsigned long util, int cpu) { - int boost; + int boost = schedtune_cpu_boost(cpu); -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_cpu_boost(cpu); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return 0; @@ -5134,15 +5124,10 @@ schedtune_cpu_margin(unsigned long util, int cpu) static inline long schedtune_task_margin(struct task_struct *task) { - int boost; + int boost = schedtune_task_boost(task); unsigned long util; long margin; -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return 0; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 8dedbba6b2669b..7499b7e7ae6951 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -22,6 +22,9 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #else /* CONFIG_CGROUP_SCHEDTUNE */ +#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() +#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost() + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) @@ -32,6 +35,9 @@ int 
schedtune_accept_deltas(int nrg_delta, int cap_delta, #else /* CONFIG_SCHED_TUNE */ +#define schedtune_cpu_boost(cpu) 0 +#define schedtune_task_boost(tsk) 0 + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) From f9b2124087197f4b8c3749035cb4d223c7c39007 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:19:41 +0100 Subject: [PATCH 238/420] sched/tune: use a single initialisation function With the introduction of initialization function required to compute the energy normalization constants from DTB at boot time, we have now a late_initcall which is already used by SchedTune. This patch consolidate within that function the other initialization bits which was previously deferred to the first CGroup creation. Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 50 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index e483cf33ac6a9f..51d6a8e8189994 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -410,8 +410,6 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 boost) { struct schedtune *st = css_st(css); - unsigned threshold_idx; - int boost_pct; if (boost < -100 || boost > 100) return -EINVAL; @@ -460,33 +458,14 @@ schedtune_boostgroup_init(struct schedtune *st) return 0; } -static int -schedtune_init(void) -{ - struct boost_groups *bg; - int cpu; - - /* Initialize the per CPU boost groups */ - for_each_possible_cpu(cpu) { - bg = &per_cpu(cpu_boost_groups, cpu); - memset(bg, 0, sizeof(struct boost_groups)); - } - - pr_info(" schedtune configured to support %d boost groups\n", - BOOSTGROUPS_COUNT); - return 0; -} - static struct cgroup_subsys_state * schedtune_css_alloc(struct cgroup_subsys_state *parent_css) { struct schedtune *st; int idx; - if (!parent_css) { - schedtune_init(); + if (!parent_css) return &root_schedtune.css; - } /* Allow only single level hierachies */ if (parent_css != &root_schedtune.css) { @@ -559,6 +538,22 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +static inline void +schedtune_init_cgroups(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info("schedtune: configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); +} + #else /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -706,7 +701,7 @@ schedtune_add_cluster_nrg( * that bind the EM to the topology information. 
*/ static int -schedtune_init_late(void) +schedtune_init(void) { struct target_nrg *ste = &schedtune_target_nrg; unsigned long delta_pwr = 0; @@ -746,11 +741,18 @@ schedtune_init_late(void) ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); schedtune_test_nrg(delta_pwr); + +#ifdef CONFIG_CGROUP_SCHEDTUNE + schedtune_init_cgroups(); +#else + pr_info("schedtune: configured to support global boosting only\n"); +#endif + return 0; nodata: rcu_read_unlock(); return -EINVAL; } -late_initcall(schedtune_init_late); +late_initcall(schedtune_init); From e7ce26f0c8064d970d49acc1dfb8c24504300008 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 18:44:40 +0100 Subject: [PATCH 239/420] FIXUP: sched/tune: fix accounting for runnable tasks Contains: sched/tune: fix accounting for runnable tasks (1/5) The accounting for tasks into boost groups of different CPUs is currently broken mainly because: a) we do not properly track the change of boost group of a RUNNABLE task b) there are race conditions between migration code and accounting code This patch provides a fixes to ensure enqueue/dequeue accounting also for throttled tasks. Without this patch is can happen that a task is enqueued into a throttled RQ thus not being accounted for the boosting of the corresponding RQ. We could argue that a throttled task should not boost a CPU, however: a) properly implementing CPU boosting considering throttled tasks will increase a lot the complexity of the solution b) it's not easy to quantify the benefits introduced by such a more complex solution Since task throttling requires the usage of the CFS bandwidth controller, which is not widely used on mobile systems (at least not by Android kernels so far), for the time being we go for the simple solution and boost also for throttled RQs. sched/tune: fix accounting for runnable tasks (2/5) This patch provides the code required to enforce proper locking. A per boost group spinlock has been added to grant atomic accounting of tasks as well as to serialise enqueue/dequeue operations, triggered by tasks migrations, with cgroups's attach/detach operations. sched/tune: fix accounting for runnable tasks (3/5) This patch adds cgroups {allow,can,cancel}_attach callbacks. Since a task can be migrated between boost groups while it's running, the CGroups's attach callbacks have been added to properly migrate boost contributions of RUNNABLE tasks. The RQ's lock is used to serialise enqueue/dequeue operations, triggered by tasks migrations, with cgroups's attach/detach operations. While the SchedTune's CPU lock is used to grant atrocity of the accounting within the CPU. NOTE: the current implementation does not allows a concurrent CPU migration and CGroups change. sched/tune: fix accounting for runnable tasks (4/5) This fixes accounting for exiting tasks by adding a dedicated call early in the do_exit() syscall, which disables SchedTune accounting as soon as a task is flagged PF_EXITING. This flag is set before the multiple dequeue/enqueue dance triggered by cgroup_exit() which is useful only to inject useless tasks movements thus increasing possibilities for race conditions with the migration code. The schedtune_exit_task() call does the last dequeue of a task from its current boost group. This is a solution more aligned with what happens in mainline kernels (>v4.4) where the exit_cgroup does not move anymore a dying task to the root control group. 
sched/tune: fix accounting for runnable tasks (5/5) To avoid accounting issues at startup, this patch disable the SchedTune accounting until the required data structures have been properly initialized. Signed-off-by: Patrick Bellasi --- kernel/exit.c | 5 ++ kernel/sched/core.c | 12 +++ kernel/sched/fair.c | 11 ++- kernel/sched/sched.h | 3 + kernel/sched/tune.c | 179 ++++++++++++++++++++++++++++++++++++++----- kernel/sched/tune.h | 6 ++ 6 files changed, 194 insertions(+), 22 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 5d30019ff953cf..582e70f35eeca3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,8 @@ #include #include +#include "sched/tune.h" + #include #include #include @@ -713,6 +715,9 @@ void do_exit(long code) } exit_signals(tsk); /* sets PF_EXITING */ + + schedtune_exit_task(tsk); + /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97a6d57519f973..6fd35aad83f63b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -349,6 +349,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +struct rq * +lock_rq_of(struct task_struct *p, unsigned long *flags) +{ + return task_rq_lock(p, flags); +} + static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { @@ -364,6 +370,12 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } +void +unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags) +{ + task_rq_unlock(rq, p, flags); +} + /* * this_rq_lock - lock this runqueue and disable interrupts. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a1a6dc3ca35cea..c5fd19e4c30390 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4065,8 +4065,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; - schedtune_enqueue_task(p, cpu_of(rq)); - /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4077,6 +4075,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } + + /* Update SchedTune accouting */ + schedtune_enqueue_task(p, cpu_of(rq)); + #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -4142,7 +4144,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { - schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -4160,6 +4161,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } } + /* Update SchedTune accouting */ + schedtune_dequeue_task(p, cpu_of(rq)); + #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -5430,7 +5434,6 @@ static inline int find_best_target(struct task_struct *p) * The target CPU can be already at a capacity level higher * than the one required to boost the task. 
*/ - if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5388278f048ff3..9d16a5e9a5f654 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1561,6 +1561,9 @@ static inline void sched_avg_update(struct rq *rq) { } extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); +extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags); +extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags); + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 51d6a8e8189994..07fca38066c5b6 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -11,6 +11,10 @@ #include "sched.h" #include "tune.h" +#ifdef CONFIG_CGROUP_SCHEDTUNE +static bool schedtune_initialized = false; +#endif + unsigned int sysctl_sched_cfs_boost __read_mostly; extern struct target_nrg schedtune_target_nrg; @@ -222,6 +226,8 @@ struct boost_groups { /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; + /* CPU's boost group locking */ + raw_spinlock_t lock; }; /* Boost groups affecting each CPU in the system */ @@ -298,28 +304,24 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +#define ENQUEUE_TASK 1 +#define DEQUEUE_TASK -1 + static inline void schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) { - struct boost_groups *bg; - int tasks; - - bg = &per_cpu(cpu_boost_groups, cpu); + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + int tasks = bg->group[idx].tasks + task_count; /* Update boosted tasks count while avoiding to make it negative */ - if (task_count < 0 && bg->group[idx].tasks <= -task_count) - bg->group[idx].tasks = 0; - else - bg->group[idx].tasks += task_count; - - /* Boost group activation or deactivation on that RQ */ - tasks = bg->group[idx].tasks; - if (tasks == 1 || tasks == 0) - schedtune_cpu_update(cpu); + bg->group[idx].tasks = max(0, tasks); trace_sched_tune_tasks_update(p, cpu, tasks, idx, bg->group[idx].boost, bg->boost_max); + /* Boost group activation or deactivation on that RQ */ + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); } /* @@ -327,9 +329,14 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) */ void schedtune_enqueue_task(struct task_struct *p, int cpu) { + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; struct schedtune *st; int idx; + if (!unlikely(schedtune_initialized)) + return; + /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. @@ -339,13 +346,110 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) if (p->flags & PF_EXITING) return; - /* Get task boost group */ + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions for example on + * do_exit()::cgroup_exit() and task migration. 
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); rcu_read_lock(); + st = task_schedtune(p); idx = st->idx; + + schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK); + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +int schedtune_allow_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + /* We always allows tasks to be moved between existing CGroups */ + return 0; +} + +int schedtune_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct boost_groups *bg; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int src_bg; /* Source boost group index */ + int dst_bg; /* Destination boost group index */ + int tasks; + + if (!unlikely(schedtune_initialized)) + return 0; + + cgroup_taskset_for_each(task, tset) { + + /* + * Lock the CPU's RQ the task is enqueued to avoid race + * conditions with migration code while the task is being + * accounted + */ + rq = lock_rq_of(task, &irq_flags); + + if (!task->on_rq) { + unlock_rq_of(rq, task, &irq_flags); + continue; + } + + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... + */ + cpu = cpu_of(rq); + bg = &per_cpu(cpu_boost_groups, cpu); + raw_spin_lock(&bg->lock); + + dst_bg = css_st(css)->idx; + src_bg = task_schedtune(task)->idx; + + /* + * Current task is not changing boostgroup, which can + * happen when the new hierarchy is in use. + */ + if (unlikely(dst_bg == src_bg)) { + raw_spin_unlock(&bg->lock); + unlock_rq_of(rq, task, &irq_flags); + continue; + } + + /* + * This is the case of a RUNNABLE task which is switching its + * current boost group. + */ + + /* Move task from src to dst boost group */ + tasks = bg->group[src_bg].tasks - 1; + bg->group[src_bg].tasks = max(0, tasks); + bg->group[dst_bg].tasks += 1; + + raw_spin_unlock(&bg->lock); + unlock_rq_of(rq, task, &irq_flags); + + /* Update CPU boost group */ + if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1) + schedtune_cpu_update(task_cpu(task)); + + } + + return 0; +} - schedtune_tasks_update(p, cpu, idx, 1); +void schedtune_cancel_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + /* This can happen only if SchedTune controller is mounted with + * other hierarchies ane one of them fails. Since usually SchedTune is + * mouted on its own hierarcy, for the time being we do not implement + * a proper rollback mechanism */ + WARN(1, "SchedTune cancel attach not implemented"); } /* @@ -353,26 +457,62 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) */ void schedtune_dequeue_task(struct task_struct *p, int cpu) { + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; struct schedtune *st; int idx; + if (!unlikely(schedtune_initialized)) + return; + /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. * Thus we avoid any further update, since we do not want to change * CPU boosting while the task is exiting. - * The last dequeue will be done by cgroup exit() callback. + * The last dequeue is already enforce by the do_exit() code path + * via schedtune_exit_task(). */ if (p->flags & PF_EXITING) return; - /* Get task boost group */ + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... 
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); rcu_read_lock(); + st = task_schedtune(p); idx = st->idx; + + schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK); + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +void schedtune_exit_task(struct task_struct *tsk) +{ + struct schedtune *st; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int idx; + + if (!unlikely(schedtune_initialized)) + return; - schedtune_tasks_update(p, cpu, idx, -1); + rq = lock_rq_of(tsk, &irq_flags); + rcu_read_lock(); + + cpu = cpu_of(rq); + st = task_schedtune(tsk); + idx = st->idx; + schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + unlock_rq_of(rq, tsk, &irq_flags); } int schedtune_cpu_boost(int cpu) @@ -534,6 +674,9 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, .exit = schedtune_exit, + .allow_attach = schedtune_allow_attach, + .can_attach = schedtune_can_attach, + .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 7499b7e7ae6951..cac1e9a0d26184 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +void schedtune_exit_task(struct task_struct *tsk); + void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); @@ -25,6 +27,8 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() #define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost() +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) @@ -38,6 +42,8 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_cpu_boost(cpu) 0 #define schedtune_task_boost(tsk) 0 +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) From 7fc270f6221a9cb78e67efed6bded15acb2d3bd0 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 29 Jul 2016 14:41:25 +0100 Subject: [PATCH 240/420] sched/fair: optimize idle cpu selection for boosted tasks find_best_target CPU selection is biased towards lower CPU IDs. Bias towards higher CPUs for boosted tasks. For boosted tasks unconditionally use the idle CPU returned by find_best_target. 
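For reference, the reversed scan order can be sketched in userspace as follows (NR_CPUS and the boosted flag are illustrative values, not the kernel configuration):

    /* cpu_scan_order.c - boosted tasks scan CPU ids from high to low */
    #include <stdio.h>
    #include <stdbool.h>

    #define NR_CPUS 8

    static void print_scan_order(bool boosted)
    {
        int iter_cpu;

        for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
            /* Boosted tasks favor higher CPU ids, so walk the range backwards. */
            int i = boosted ? NR_CPUS - iter_cpu - 1 : iter_cpu;

            printf("%d ", i);
        }
        printf("\n");
    }

    int main(void)
    {
        print_scan_order(false); /* 0 1 2 3 4 5 6 7 */
        print_scan_order(true);  /* 7 6 5 4 3 2 1 0 */
        return 0;
    }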
BUG: 29512132 Change-Id: I3d650051752163fcf3dc7909751d1fde3f9d17c0 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 70 +++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c5fd19e4c30390..675dbd98fc361b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5395,32 +5395,30 @@ static int select_idle_sibling(struct task_struct *p, int target) return target; } -static inline int find_best_target(struct task_struct *p) +static inline int find_best_target(struct task_struct *p, bool boosted) { - int i, boosted; + int iter_cpu; int target_cpu = -1; int target_capacity = 0; int backup_capacity = 0; - int idle_cpu = -1; + int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; unsigned long task_util_boosted, new_util; - /* - * Favor 1) busy cpu with most capacity at current OPP - * 2) idle_cpu with capacity at current OPP - * 3) busy cpu with capacity at higher OPP - */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boosted = schedtune_task_boost(p); -#else - boosted = 0; -#endif task_util_boosted = boosted_task_util(p); - for_each_cpu(i, tsk_cpus_allowed(p)) { - int cur_capacity = capacity_curr_of(i); - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { + int cur_capacity; + struct rq *rq; + int idle_idx; + + /* + * favor higher cpus for boosted tasks + */ + int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + + if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) + continue; /* * p's blocked utilization is still accounted for on prev_cpu @@ -5441,46 +5439,43 @@ static inline int find_best_target(struct task_struct *p) * For boosted tasks we favor idle cpus unconditionally to * improve latency. */ - if (idle_idx >= 0 && boosted) { - if (idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - idle_cpu = i; - } + if (idle_cpu(i) && boosted) { + if (best_idle_cpu < 0) + best_idle_cpu = i; continue; } + cur_capacity = capacity_curr_of(i); + rq = cpu_rq(i); + idle_idx = idle_get_state_idx(rq); + if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { if (target_capacity == 0 || target_capacity > cur_capacity) { - /* busy CPU with most capacity at current OPP */ target_cpu = i; target_capacity = cur_capacity; } } else if (!boosted) { - if (idle_cpu < 0 || + if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { best_idle_cstate = idle_idx; - idle_cpu = i; + best_idle_cpu = i; } } } else if (backup_capacity == 0 || backup_capacity > cur_capacity) { - /* first busy CPU with capacity at higher OPP */ backup_capacity = cur_capacity; backup_cpu = i; } } - if (!boosted && target_cpu < 0) { - target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; - } + if (boosted && best_idle_cpu >= 0) + target_cpu = best_idle_cpu; + else if (target_cpu < 0) + target_cpu = best_idle_cpu >= 0 ? 
best_idle_cpu : backup_cpu; - if (boosted && idle_cpu >= 0) - target_cpu = idle_cpu; return target_cpu; } @@ -5566,9 +5561,16 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) /* * Find a cpu with sufficient capacity */ - int tmp_target = find_best_target(p); +#ifdef CONFIG_CGROUP_SCHEDTUNE + bool boosted = schedtune_task_boost(p) > 0; +#else + bool boosted = 0; +#endif + int tmp_target = find_best_target(p, boosted); if (tmp_target >= 0) target_cpu = tmp_target; + if (boosted && idle_cpu(target_cpu)) + return target_cpu; } if (target_cpu != task_cpu(p)) { From f30530caf33c5993375dd8edd6a6548e3215fa32 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:32:26 +0100 Subject: [PATCH 241/420] sched/tune: fix PB and PC cuts indexes definition The current definition of the Performance Boost (PB) and Performance Constraint (PC) regions is has two main issues: 1) in the computation of the boost index we overflow the thresholds_gains table for boost=100 2) the two cuts had _NOT_ the same ratio The last point means that when boost=0 we do _not_ have a "standard" EAS behaviour, i.e. accepting all candidate which decrease energy regardless of their impact on performances. Instead, we accept only schedule candidate which are in the Optimal region, i.e. decrease energy while increasing performances. This behaviour can have a negative impact also on CPU selection policies which tries to spread tasks to reduce latencies. Indeed, for example we could end up rejecting a schedule candidate which want to move a task from a congested CPU to an idle one while, specifically in the case where the target CPU will be running on a lower OPP. This patch fixes these two issues by properly clamping the boost value in the appropriate range to compute the threshold indexes as well as by using the same threshold index for both cuts. Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan sched/tune: fix update of threshold index for boost groups When SchedTune is configured to work with CGroup mode, each time we update the boost value of a group we do not update the threshed indexes for the definition of the Performance Boost (PC) and Performance Constraint (PC) region. This means that while the OPP boosting and CPU biasing selection is working as expected, the __schedtune_accept_deltas function is always using the initial values for these cuts. This patch ensure that each time a new boost value is configured for a boost group, the cuts for the PB and PC region are properly updated too. Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan sched/tune: update PC and PB cuts definition The current definition of Performance Boost (PB) and Performance Constraint (PC) cuts defines two "dead regions": - up to 20% boost: we are in energy-reduction only mode, i.e. accept all candidate which reduce energy - over 70% boost: we are in performance-increase only mode, i.e. accept only sched candidate which do not reduce performances This patch uses a more fine grained configuration where these two "dead regions" are reduced to: up to 10% and over 90%. This should allow to have some boosting benefits starting from 10% boost values as well as not being to much permissive starting from boost values of 80%. 
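The new cut index computation can be illustrated with the following userspace sketch (clamp() here is a local helper standing in for the kernel macro):

    /* threshold_idx.c - map a boost percentage onto a thresholds_gains row */
    #include <stdio.h>

    static int clamp(int v, int lo, int hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    int main(void)
    {
        int boost_pct;

        /* boost 0..9 -> idx 0, 10..19 -> idx 1, ..., 90..100 -> idx 9;
         * boost=100 no longer overflows the 10-entry table, and the same
         * index is used for both the B and C cuts. */
        for (boost_pct = -100; boost_pct <= 100; boost_pct += 25)
            printf("boost=%4d -> idx=%d\n",
                   boost_pct, clamp(boost_pct, 0, 99) / 10);
        return 0;
    }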
Suggested-by: Leo Yan Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan bug: 28312446 Change-Id: Ia326c66521e38c98e7a7eddbbb7c437875efa1ba Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 07fca38066c5b6..1eda155f7497d2 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -38,16 +38,16 @@ struct threshold_params { */ static struct threshold_params threshold_gains[] = { - { 0, 4 }, /* >= 0% */ - { 0, 4 }, /* >= 10% */ - { 1, 4 }, /* >= 20% */ - { 2, 4 }, /* >= 30% */ - { 3, 4 }, /* >= 40% */ - { 4, 3 }, /* >= 50% */ - { 4, 2 }, /* >= 60% */ - { 4, 1 }, /* >= 70% */ - { 4, 0 }, /* >= 80% */ - { 4, 0 } /* >= 90% */ + { 0, 5 }, /* < 10% */ + { 1, 5 }, /* < 20% */ + { 2, 5 }, /* < 30% */ + { 3, 5 }, /* < 40% */ + { 4, 5 }, /* < 50% */ + { 5, 4 }, /* < 60% */ + { 5, 3 }, /* < 70% */ + { 5, 2 }, /* < 80% */ + { 5, 1 }, /* < 90% */ + { 5, 0 } /* <= 100% */ }; static int @@ -550,13 +550,29 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 boost) { struct schedtune *st = css_st(css); + unsigned threshold_idx; + int boost_pct; if (boost < -100 || boost > 100) return -EINVAL; + boost_pct = boost; + + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementatio uses the same cuts for both + * B and C regions. + */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + st->perf_boost_idx = threshold_idx; + st->perf_constrain_idx = threshold_idx; st->boost = boost; - if (css == &root_schedtune.css) + if (css == &root_schedtune.css) { sysctl_sched_cfs_boost = boost; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; + } /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); @@ -727,17 +743,25 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + unsigned threshold_idx; + int boost_pct; if (ret || !write) return ret; - /* Performance Boost (B) region threshold params */ - perf_boost_idx = sysctl_sched_cfs_boost; - perf_boost_idx /= 10; + if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100) + return -EINVAL; + boost_pct = sysctl_sched_cfs_boost; - /* Performance Constraint (C) region threshold params */ - perf_constrain_idx = 100 - sysctl_sched_cfs_boost; - perf_constrain_idx /= 10; + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementatio uses the same cuts for both + * B and C regions. + */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; return 0; } From d34b01b50a147c670eec8f569a669cfb4acece09 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 31 May 2016 09:08:38 -0700 Subject: [PATCH 242/420] sched: Introduce Window Assisted Load Tracking (WALT) use a window based view of time in order to track task demand and CPU utilization in the scheduler. 
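As a rough illustration of the window-based metric (a userspace sketch: the 20ms window and the demand figure are assumed values; the <<10 scaling mirrors the walt_avg computation reported by the trace events added below):

    /* walt_demand.c - scale a per-window demand into a 0..1024 utilization */
    #include <stdio.h>

    int main(void)
    {
        unsigned long long walt_ravg_window = 20000000ULL; /* 20 ms in ns (assumed) */
        unsigned long long demand = 5000000ULL;            /* 5 ms busy per window */

        /* Same shape as the walt_update_history trace event:
         * util = (demand << 10) / walt_ravg_window. */
        unsigned long util = (unsigned long)((demand << 10) / walt_ravg_window);

        printf("util = %lu / 1024\n", util); /* 256, i.e. about 25% of one CPU */
        return 0;
    }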
Window Assisted Load Tracking (WALT) implementation credits: Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, Pavan Kumar Kondeti, Olav Haugan 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla and Todd Kjos Change-Id: I21408236836625d4e7d7de1843d20ed5ff36c708 Includes fixes for issues: eas/walt: Use walt_ktime_clock() instead of ktime_get_ns() to avoid a race resulting in watchdog resets BUG: 29353986 Change-Id: Ic1820e22a136f7c7ebd6f42e15f14d470f6bbbdb Handle walt accounting anomoly during resume During resume, there is a corner case where on wakeup, a task's prev_runnable_sum can go negative. This is a workaround that fixes the condition and warns (instead of crashing). BUG: 29464099 Change-Id: I173e7874324b31a3584435530281708145773508 Signed-off-by: Todd Kjos Signed-off-by: Srinath Sridharan Signed-off-by: Juri Lelli --- include/linux/sched.h | 53 ++ include/linux/sched/sysctl.h | 5 + include/trace/events/sched.h | 149 +++++ init/Kconfig | 9 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 43 +- kernel/sched/fair.c | 20 + kernel/sched/rt.c | 4 + kernel/sched/sched.h | 34 ++ kernel/sched/stop_task.c | 3 + kernel/sched/walt.c | 1098 ++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 57 ++ kernel/sysctl.c | 23 + 13 files changed, 1498 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/walt.c create mode 100644 kernel/sched/walt.h diff --git a/include/linux/sched.h b/include/linux/sched.h index 35268290a7745c..c8ab0f711896d5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -275,6 +275,15 @@ extern char ___assert_task_state[1 - 2*!!( /* Task command name length */ #define TASK_COMM_LEN 16 +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + #include /* @@ -1165,6 +1174,41 @@ struct sched_statistics { }; #endif +#ifdef CONFIG_SCHED_WALT +#define RAVG_HIST_SIZE_MAX 5 + +/* ravg represents frequency scaled cpu-demand of tasks */ +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. 
+ * + * 'curr_window' represents task's contribution to cpu busy time + * statistics (rq->curr_runnable_sum) in current window + * + * 'prev_window' represents task's contribution to cpu busy time + * statistics (rq->prev_runnable_sum) in previous window + */ + u64 mark_start; + u32 sum, demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 curr_window, prev_window; + u16 active_windows; +}; +#endif + struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; @@ -1301,6 +1345,15 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4321305cb84c34..0f76a6a26f7932 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -43,6 +43,11 @@ extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 32668b096578c5..36477b7c1ecc63 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -938,6 +938,155 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +#ifdef CONFIG_SCHED_WALT +struct rq; + +TRACE_EVENT(walt_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, int evt, + u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( pid_t, cur_pid ) + __field(unsigned int, cur_freq ) + __field( u64, wallclock ) + __field( u64, mark_start ) + __field( u64, delta_m ) + __field( u64, win_start ) + __field( u64, delta ) + __field( u64, irqtime ) + __field( int, evt ) + __field(unsigned int, demand ) + __field(unsigned int, sum ) + __field( int, cpu ) + __field( u64, cs ) + __field( u64, ps ) + __field( u32, curr_window ) + __field( u32, prev_window ) + __field( u64, nt_cs ) + __field( u64, nt_ps ) + __field( u32, active_windows ) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->delta = (wallclock - rq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->delta_m = (wallclock - p->ravg.mark_start); + __entry->demand = p->ravg.demand; + __entry->sum = p->ravg.sum; + __entry->irqtime = irqtime; + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + __entry->nt_cs = rq->nt_curr_runnable_sum; + __entry->nt_ps = rq->nt_prev_runnable_sum; + __entry->active_windows = p->ravg.active_windows; + ), + + TP_printk("wc %llu ws %llu delta %llu event %d cpu %d 
cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" + " cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u" + , __entry->wallclock, __entry->win_start, __entry->delta, + __entry->evt, __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->sum, __entry->irqtime, + __entry->cs, __entry->ps, + __entry->curr_window, __entry->prev_window, + __entry->nt_cs, __entry->nt_ps, + __entry->active_windows + ) +); + +TRACE_EVENT(walt_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + int evt), + + TP_ARGS(rq, p, runtime, samples, evt), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field(unsigned int, runtime ) + __field( int, samples ) + __field( int, evt ) + __field( u64, demand ) + __field(unsigned int, walt_avg ) + __field(unsigned int, pelt_avg ) + __array( u32, hist, RAVG_HIST_SIZE_MAX) + __field( int, cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = p->ravg.demand; + __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window, + __entry->pelt_avg = p->se.avg.util_avg; + memcpy(__entry->hist, p->ravg.sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %d demand %llu" + " walt %u pelt %u (hist: %u %u %u %u %u) cpu %d", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, __entry->evt, + __entry->demand, + __entry->walt_avg, + __entry->pelt_avg, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu) +); + +TRACE_EVENT(walt_migration_update_sum, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __field(int, cpu ) + __field(int, pid ) + __field( u64, cs ) + __field( u64, ps ) + __field( s64, nt_cs ) + __field( s64, nt_ps ) + ), + + TP_fast_assign( + __entry->cpu = cpu_of(rq); + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->nt_cs = (s64)rq->nt_curr_runnable_sum; + __entry->nt_ps = (s64)rq->nt_prev_runnable_sum; + __entry->pid = p->pid; + ), + + TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d", + __entry->cpu, __entry->cs, __entry->ps, + __entry->nt_cs, __entry->nt_ps, __entry->pid) +); +#endif /* CONFIG_SCHED_WALT */ + #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/init/Kconfig b/init/Kconfig index a362ebfe4135fd..e9e50041bc358c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -392,6 +392,15 @@ config IRQ_TIME_ACCOUNTING endchoice +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. 
+ config BSD_PROCESS_ACCT bool "BSD Process Accounting" help diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index dbf2b26e1c2650..fdcdb2235274de 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,6 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6fd35aad83f63b..7798832150324e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -89,6 +89,7 @@ #define CREATE_TRACE_POINTS #include +#include "walt.h" void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { @@ -1089,7 +1090,9 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(rq, cpu_rq(new_cpu)); set_task_cpu(p, new_cpu); + double_unlock_balance(rq, cpu_rq(new_cpu)); raw_spin_unlock(&rq->lock); rq = cpu_rq(new_cpu); @@ -1286,6 +1289,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + + walt_fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -1887,6 +1892,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; +#ifdef CONFIG_SMP + struct rq *rq; + u64 wallclock; +#endif /* * If we are going to wake up a thread waiting for CONDITION we @@ -1917,6 +1926,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); + rq = cpu_rq(task_cpu(p)); + + raw_spin_lock(&rq->lock); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + raw_spin_unlock(&rq->lock); + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -1924,10 +1941,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); @@ -1966,8 +1985,13 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + u64 wallclock = walt_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); @@ -2034,6 +2058,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -2332,6 +2357,9 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + + walt_init_new_task_load(p); + /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2344,6 +2372,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq 
= __task_rq_lock(p); + walt_mark_task_starting(p); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p, true); @@ -2817,9 +2846,12 @@ void scheduler_tick(void) sched_clock_tick(); raw_spin_lock(&rq->lock); + walt_set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); @@ -3058,6 +3090,7 @@ static void __sched __schedule(void) unsigned long *switch_count; struct rq *rq; int cpu; + u64 wallclock; need_resched: preempt_disable(); @@ -3107,6 +3140,9 @@ static void __sched __schedule(void) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->skip_clock_update = 0; @@ -5401,6 +5437,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + raw_spin_lock_irqsave(&rq->lock, flags); + walt_set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; break; @@ -5420,6 +5459,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + walt_migrate_sync_cpu(cpu); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -7202,6 +7242,7 @@ void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; + walt_init_cpu_efficiency(); alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 675dbd98fc361b..8d7ef4cd3951f4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -30,11 +30,13 @@ #include #include #include +#include #include #include "sched.h" #include "tune.h" +#include "walt.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -56,6 +58,10 @@ unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -4040,6 +4046,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -4047,6 +4054,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4061,6 +4069,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4110,6 +4119,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + 
walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4130,6 +4140,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4144,6 +4155,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_dec_cumulative_runnable_avg(rq, p); /* * We want to potentially trigger a freq switch @@ -5033,6 +5045,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_task_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif return p->se.avg.util_avg; } @@ -6450,7 +6468,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, env->dst_rq); } /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 30d757853a535e..b33d7bad868610 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -7,6 +7,8 @@ #include +#include "walt.h" + int sched_rr_timeslice = RR_TIMESLICE; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -1243,6 +1245,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1254,6 +1257,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9d16a5e9a5f654..0684bcb9b634e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -398,6 +398,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; @@ -639,6 +643,27 @@ struct rq { u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_WALT + /* + * max_freq = user or thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq, max_possible_freq; + struct cpumask freq_domain_cpumask; + + u64 cumulative_runnable_avg; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + int capacity; + int max_possible_capacity; + u64 window_start; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +#endif /* CONFIG_SCHED_WALT */ + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -1453,6 +1478,10 @@ static inline unsigned long capacity_orig_of(int cpu) return cpu_rq(cpu)->cpu_capacity_orig; } +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_ravg_window; +extern unsigned int walt_disabled; + /* * cpu_util returns the amount of capacity of a CPU that is used by CFS 
* tasks. The unit of the return value must be the one of capacity so we can @@ -1484,6 +1513,11 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) / + walt_ravg_window; +#endif delta += util; if (delta < 0) return 0; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 79ffec45a6acd9..65e8c17cc21981 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,4 +1,5 @@ #include "sched.h" +#include "walt.h" /* * stop-task scheduling class. @@ -42,12 +43,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 00000000000000..1dff3d2e2358dc --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1098 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * Window Assisted Load Tracking (WALT) implementation credits: + * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, + * Pavan Kumar Kondeti, Olav Haugan + * + * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla + * and Todd Kjos + */ + +#include +#include +#include +#include "sched.h" +#include "walt.h" + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +static __read_mostly unsigned int walt_ravg_hist_size = 5; +static __read_mostly unsigned int walt_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +static __read_mostly unsigned int walt_account_wait_time = 1; +static __read_mostly unsigned int walt_freq_account_wait_time = 0; +static __read_mostly unsigned int walt_io_is_busy = 0; + +unsigned int sysctl_sched_walt_init_task_load_pct = 15; + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly walt_disabled = 0; + +static unsigned int max_possible_efficiency = 1024; +static unsigned int min_possible_efficiency = 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +static unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. 
+ */ +static unsigned int min_max_freq = 1; + +static unsigned int max_capacity = 1024; +static unsigned int min_capacity = 1024; +static unsigned int max_load_scale_factor = 1024; +static unsigned int max_possible_capacity = 1024; + +/* Mask of all CPUs that have max_possible_capacity */ +static cpumask_t mpc_mask = CPU_MASK_ALL; + +/* Window size (in ns) */ +__read_mostly unsigned int walt_ravg_window = 20000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +static unsigned int sync_cpu; +static ktime_t ktime_last; +static bool walt_ktime_suspended; + +static unsigned int task_load(struct task_struct *p) +{ + return p->ravg.demand; +} + +void +walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg += p->ravg.demand; +} + +void +walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg -= p->ravg.demand; + BUG_ON((s64)rq->cumulative_runnable_avg < 0); +} + +static void +fixup_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p, s64 task_load_delta) +{ + rq->cumulative_runnable_avg += task_load_delta; + if ((s64)rq->cumulative_runnable_avg < 0) + panic("cra less than zero: tld: %lld, task_load(p) = %u\n", + task_load_delta, task_load(p)); +} + +u64 walt_ktime_clock(void) +{ + if (unlikely(walt_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void walt_resume(void) +{ + walt_ktime_suspended = false; +} + +static int walt_suspend(void) +{ + ktime_last = ktime_get(); + walt_ktime_suspended = true; + return 0; +} + +static struct syscore_ops walt_syscore_ops = { + .resume = walt_resume, + .suspend = walt_suspend +}; + +static int __init walt_init_ops(void) +{ + register_syscore_ops(&walt_syscore_ops); + return 0; +} +late_initcall(walt_init_ops); + +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg += p->ravg.demand; +} + +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg -= p->ravg.demand; +} + +static int exiting_task(struct task_struct *p) +{ + if (p->flags & PF_EXITING) { + if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) { + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + } + return 1; + } + return 0; +} + +static int __init set_walt_ravg_window(char *str) +{ + get_option(&str, &walt_ravg_window); + + walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + return 0; +} + +early_param("walt_ravg_window", set_walt_ravg_window); + +static void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < walt_ravg_window) + return; + + nr_windows = div64_u64(delta, walt_ravg_window); + rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; +} + +static u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned int cur_freq = rq->cur_freq; + int sf; + + if (unlikely(cur_freq > max_possible_freq)) + cur_freq = rq->max_possible_freq; + + /* round up div64 */ + delta = div64_u64(delta * cur_freq + max_possible_freq - 1, + max_possible_freq); + + sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency); + + delta *= sf; + delta >>= 10; + + return delta; +} + +static int cpu_is_waiting_on_io(struct rq *rq) +{ + if 
(!walt_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE || + event == TASK_UPDATE) + return 1; + + /* Only TASK_MIGRATE && PICK_NEXT_TASK left */ + return walt_freq_account_wait_time; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, nr_full_windows = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = walt_ravg_window; + u64 delta; + + new_window = mark_start < window_start; + if (new_window) { + nr_full_windows = div64_u64((window_start - mark_start), + window_size); + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + /* Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. */ + if (new_window && !is_idle_task(p) && !exiting_task(p)) { + u32 curr_window = 0; + + if (!nr_full_windows) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { + /* account_busy_for_cpu_time() = 0, so no update to the + * task's current window needs to be made. This could be + * for example + * + * - a wakeup event on a task within the current + * window (!new_window below, no action required), + * - switching to a new task from idle (PICK_NEXT_TASK) + * in a new window where irqtime is 0 and we aren't + * waiting on IO */ + + if (!new_window) + return; + + /* A new window has started. The RQ demand must be rolled + * over if p is the current task. */ + if (p_is_curr_task) { + u64 prev_sum = 0; + + /* p is either idle task or an exiting task */ + if (!nr_full_windows) { + prev_sum = rq->curr_runnable_sum; + } + + rq->prev_runnable_sum = prev_sum; + rq->curr_runnable_sum = 0; + } + + return; + } + + if (!new_window) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + rq->curr_runnable_sum += delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!p_is_curr_task) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. 
*/ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) + p->ravg.prev_window = delta; + } + rq->prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum += delta; + if (!exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. */ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window += delta; + + delta += rq->curr_runnable_sum; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window = delta; + + } + /* + * Rollover for normal runnable sum is done here by overwriting + * the values in prev_runnable_sum and curr_runnable_sum. + * Rollover for new task runnable sum has completed by previous + * if-else statement. + */ + rq->prev_runnable_sum = delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum = delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. */ + rq->prev_runnable_sum = rq->curr_runnable_sum; + if (mark_start > window_start) { + rq->curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + rq->prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +static int account_busy_for_task_demand(struct task_struct *p, int event) +{ + /* No need to bother updating task demand for exiting tasks + * or the idle task. */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. */ + if (event == TASK_WAKE || (!walt_account_wait_time && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = walt_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (walt_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (walt_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, walt_ravg_hist_size); + if (walt_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. + */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + fixup_cumulative_runnable_avg(rq, p, demand); + + p->ravg.demand = demand; + +done: + trace_walt_update_history(rq, p, runtime, samples, event); + return; +} + +static void add_to_task_demand(struct rq *rq, struct task_struct *p, + u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > walt_ravg_window)) + p->ravg.sum = walt_ravg_window; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. 
+ * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static void update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = walt_ravg_window; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(p, event)) { + if (new_window) + /* If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. */ + update_history(rq, p, p->ravg.sum, 1, event); + return; + } + + if (!new_window) { + /* The simple case - busy time contained within the existing + * window. */ + add_to_task_demand(rq, p, wallclock - mark_start); + return; + } + + /* Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) + update_history(rq, p, scale_exec_time(window_size, rq), + nr_full_windows, event); + + /* Roll window_start back to current to process any remainder + * in current window. 
*/ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + add_to_task_demand(rq, p, wallclock - mark_start); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + if (walt_disabled || !rq->window_start) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) + goto done; + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + +done: + trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime); + + p->ravg.mark_start = wallclock; +} + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +void walt_init_cpu_efficiency(void) +{ + int i, efficiency; + unsigned int max = 0, min = UINT_MAX; + + for_each_possible_cpu(i) { + efficiency = arch_get_cpu_efficiency(i); + cpu_rq(i)->efficiency = efficiency; + + if (efficiency > max) + max = efficiency; + if (efficiency < min) + min = efficiency; + } + + if (max) + max_possible_efficiency = max; + + if (min) + min_possible_efficiency = min; +} + +static void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + + if (exiting_task(p)) + sum = EXITING_TASK_MARKER; + + memset(&p->ravg, 0, sizeof(struct ravg)); + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void walt_mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start) { + reset_task_stats(p); + return; + } + + wallclock = walt_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +void walt_set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + + if (rq->window_start) + return; + + if (cpu == sync_cpu) { + rq->window_start = walt_ktime_clock(); + } else { + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +void walt_migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +void walt_fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + wallclock = walt_ktime_clock(); + + walt_update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + + if (p->ravg.curr_window) { + src_rq->curr_runnable_sum -= p->ravg.curr_window; + dest_rq->curr_runnable_sum += p->ravg.curr_window; + } + + if (p->ravg.prev_window) { + src_rq->prev_runnable_sum -= p->ravg.prev_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + } + + if ((s64)src_rq->prev_runnable_sum < 0) { + src_rq->prev_runnable_sum = 0; + WARN_ON(1); + } + if ((s64)src_rq->curr_runnable_sum < 0) { + src_rq->curr_runnable_sum = 0; + WARN_ON(1); + } + + trace_walt_migration_update_sum(src_rq, p); + trace_walt_migration_update_sum(dest_rq, p); + + if (p->state == TASK_WAKING) + 
double_rq_unlock(src_rq, dest_rq); +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max = 0, min = INT_MAX; + + for_each_online_cpu(i) { + if (cpu_rq(i)->capacity > max) + max = cpu_rq(i)->capacity; + if (cpu_rq(i)->capacity < min) + min = cpu_rq(i)->capacity; + } + + max_capacity = max; + min_capacity = min; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + __update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long capacity_scale_cpu_efficiency(int cpu) +{ + return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(int cpu) +{ + return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static unsigned long load_scale_cpu_efficiency(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cpu_rq(cpu)->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static unsigned long load_scale_cpu_freq(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq); +} + +static int compute_capacity(int cpu) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cpu); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cpu); + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(int cpu) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. + */ + load_scale *= load_scale_cpu_efficiency(cpu); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cpu); + load_scale >>= 10; + + return load_scale; +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + int i, update_max = 0; + u64 highest_mpc = 0, highest_mplsf = 0; + const struct cpumask *cpus = policy->related_cpus; + unsigned int orig_min_max_freq = min_max_freq; + unsigned int orig_max_possible_freq = max_possible_freq; + /* Initialized to policy->max in case policy->related_cpus is empty! 
*/ + unsigned int orig_max_freq = policy->max; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + for_each_cpu(i, policy->related_cpus) { + cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, + policy->related_cpus); + orig_max_freq = cpu_rq(i)->max_freq; + cpu_rq(i)->min_freq = policy->min; + cpu_rq(i)->max_freq = policy->max; + cpu_rq(i)->cur_freq = policy->cur; + cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + /* Changes to policy other than max_freq don't require any updates */ + if (orig_max_freq == policy->max) + return 0; + + /* + * A changed min_max_freq or max_possible_freq (possible during bootup) + * needs to trigger re-computation of load_scale_factor and capacity for + * all possible cpus (even those offline). It also needs to trigger + * re-computation of nr_big_task count on all online cpus. + * + * A changed rq->max_freq otoh needs to trigger re-computation of + * load_scale_factor and capacity for just the cluster of cpus involved. + * Since small task definition depends on max_load_scale_factor, a + * changed load_scale_factor of one cluster could influence + * classification of tasks in another cluster. Hence a changed + * rq->max_freq will need to trigger re-computation of nr_big_task + * count on all online cpus. + * + * While it should be sufficient for nr_big_tasks to be + * re-computed for only online cpus, we have inadequate context + * information here (in policy notifier) with regard to hotplug-safety + * context in which notification is issued. As a result, we can't use + * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is + * fixed up to issue notification always in hotplug-safe context, + * re-compute nr_big_task for all possible cpus. + */ + + if (orig_min_max_freq != min_max_freq || + orig_max_possible_freq != max_possible_freq) { + cpus = cpu_possible_mask; + update_max = 1; + } + + /* + * Changed load_scale_factor can trigger reclassification of tasks as + * big or small. 
Make this change "atomic" so that tasks are accounted + * properly due to changed load_scale_factor + */ + for_each_cpu(i, cpus) { + struct rq *rq = cpu_rq(i); + + rq->capacity = compute_capacity(i); + rq->load_scale_factor = compute_load_scale_factor(i); + + if (update_max) { + u64 mpc, mplsf; + + mpc = div_u64(((u64) rq->capacity) * + rq->max_possible_freq, rq->max_freq); + rq->max_possible_capacity = (int) mpc; + + mplsf = div_u64(((u64) rq->load_scale_factor) * + rq->max_possible_freq, rq->max_freq); + + if (mpc > highest_mpc) { + highest_mpc = mpc; + cpumask_clear(&mpc_mask); + cpumask_set_cpu(i, &mpc_mask); + } else if (mpc == highest_mpc) { + cpumask_set_cpu(i, &mpc_mask); + } + + if (mplsf > highest_mplsf) + highest_mplsf = mplsf; + } + } + + if (update_max) { + max_possible_capacity = highest_mpc; + max_load_scale_factor = highest_mplsf; + } + + __update_min_max_capacity(); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + int i; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_rq(cpu)->cur_freq == new_freq) + return 0; + + for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) { + struct rq *rq = cpu_rq(i); + + raw_spin_lock_irqsave(&rq->lock, flags); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); + rq->cur_freq = new_freq; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_sched_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +void walt_init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = + div64_u64((u64)sysctl_sched_walt_init_task_load_pct * + (u64)walt_ravg_window, 100); + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)walt_ravg_window, 100); + } + + p->ravg.demand = init_load_windows; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 00000000000000..cabc193a683d56 --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_fixup_busy_time(struct task_struct *p, int new_cpu); +void walt_init_new_task_load(struct task_struct *p); +void walt_mark_task_starting(struct task_struct *p); +void walt_set_window_start(struct rq *rq); +void walt_migrate_sync_cpu(int cpu); +void walt_init_cpu_efficiency(void); +u64 walt_ktime_clock(void); + +#else /* CONFIG_SCHED_WALT */ + +static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void walt_init_new_task_load(struct task_struct *p) { } +static inline void walt_mark_task_starting(struct task_struct *p) { } +static inline void walt_set_window_start(struct rq *rq) { } +static inline void walt_migrate_sync_cpu(int cpu) { } +static inline void walt_init_cpu_efficiency(void) { } +static inline u64 walt_ktime_clock(void) { return 0; } + +#endif /* CONFIG_SCHED_WALT */ + +extern unsigned int walt_disabled; + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 84be91491c0c7e..1bb0cd6bf6bf17 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -312,6 +312,29 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "sched_sync_hint_enable", .data = &sysctl_sched_sync_hint_enable, From a5ced50ce21ef5aafeade6fc3d42d774696ee996 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 22 Jul 2016 13:21:15 +0100 Subject: [PATCH 243/420] sched/walt: Accounting for number of irqs pending on each core Schedules on a core whose irq count is less than a threshold. Improves I/O performance of EAS. 
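For reference, the mechanism is: each runqueue accumulates irq service time
into cur_irqload and folds it into a decaying average (avg_irqload), and task
placement then skips CPUs whose recent average is at or above the new tunable
sched_walt_cpu_high_irqload (10 ms of irq time by default). A minimal sketch
of the placement-side check follows; it is illustrative only and not part of
this patch (the actual change is the walt_cpu_high_irqload() test added to
find_best_target()), and it assumes the helpers introduced here together with
the existing cpu_util() and tsk_cpus_allowed() helpers:

	/* Illustrative sketch: prefer the least utilized allowed CPU that is
	 * not currently dominated by irq handling. */
	static int pick_irq_friendly_cpu(struct task_struct *p)
	{
		unsigned long best_util = ULONG_MAX;
		int i, best_cpu = -1;

		for_each_cpu(i, tsk_cpus_allowed(p)) {
			/* avg_irqload >= sysctl_sched_walt_cpu_high_irqload? */
			if (walt_cpu_high_irqload(i))
				continue;
			if (cpu_util(i) < best_util) {
				best_util = cpu_util(i);
				best_cpu = i;
			}
		}

		/* -1 means every allowed CPU is busy servicing irqs */
		return best_cpu;
	}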
Change-Id: I08ff7dd0d22502a0106fc636b1af2e6fe9e758b5 --- include/linux/sched/sysctl.h | 1 + kernel/sched/core.c | 5 +++ kernel/sched/cputime.c | 16 +++++++++ kernel/sched/fair.c | 7 +++- kernel/sched/sched.h | 3 ++ kernel/sched/walt.c | 65 ++++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 5 +++ kernel/sysctl.c | 7 ++++ 8 files changed, 108 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0f76a6a26f7932..b46d647e8f0569 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -47,6 +47,7 @@ extern unsigned int sysctl_sched_cstate_aware; extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int sysctl_sched_use_walt_task_util; extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_walt_cpu_high_irqload; #endif enum sched_tunable_scaling { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7798832150324e..8f071ed51ff361 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7429,6 +7429,11 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_WALT + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; +#endif INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bc285836e44c72..fbe14f97ca79c3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -6,6 +6,7 @@ #include #include #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -50,6 +51,10 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; @@ -57,6 +62,9 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -71,8 +79,16 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); +#ifdef CONFIG_SCHED_WALT + else + account = false; +#endif irq_time_write_end(); +#ifdef CONFIG_SCHED_WALT + if (account) + walt_account_irqtime(cpu, curr, delta, wallclock); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8d7ef4cd3951f4..8734f11d3ce89e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -61,6 +61,8 @@ unsigned int sysctl_sched_cstate_aware = 1; #ifdef CONFIG_SCHED_WALT unsigned int sysctl_sched_use_walt_cpu_util = 1; unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); #endif /* * The initial- and re-scaling of tunables is configurable @@ -4089,7 +4091,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) schedtune_enqueue_task(p, cpu_of(rq)); #endif /* CONFIG_SMP */ - hrtick_update(rq); } @@ -5453,6 +5454,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) if (new_util > capacity_orig_of(i)) continue; +#ifdef CONFIG_SCHED_WALT + if (walt_cpu_high_irqload(i)) + continue; +#endif /* * For boosted tasks we favor idle cpus unconditionally to * improve 
latency. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0684bcb9b634e5..a114146675d69a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -661,6 +661,9 @@ struct rq { u64 prev_runnable_sum; u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; #endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 1dff3d2e2358dc..b9ae8d5c4393ad 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -221,6 +221,71 @@ static int cpu_is_waiting_on_io(struct rq *rq) return atomic_read(&rq->nr_iowait); } +void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + + +#define WALT_HIGH_IRQ_TIMEOUT 3 + +u64 walt_irqload(int cpu) { + struct rq *rq = cpu_rq(cpu); + s64 delta; + delta = get_jiffies_64() - rq->irqload_ts; + + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. 
+ */ + + if (delta < WALT_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +int walt_cpu_high_irqload(int cpu) { + return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload; +} + static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, u64 irqtime, int event) { diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index cabc193a683d56..e181c87a928d89 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -31,6 +31,11 @@ void walt_set_window_start(struct rq *rq); void walt_migrate_sync_cpu(int cpu); void walt_init_cpu_efficiency(void); u64 walt_ktime_clock(void); +void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta, + u64 wallclock); + +u64 walt_irqload(int cpu); +int walt_cpu_high_irqload(int cpu); #else /* CONFIG_SCHED_WALT */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1bb0cd6bf6bf17..e00614d747ad3a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -334,6 +334,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_walt_cpu_high_irqload", + .data = &sysctl_sched_walt_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif { .procname = "sched_sync_hint_enable", From 0ae57dc2aae35102e0c475acb0dcc22ef4183edf Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 29 Jun 2016 11:30:07 -0700 Subject: [PATCH 244/420] sched: EAS: Avoid causing spikes to max-freq unnecessarily During scheduler tick handling, the frequency was being set to max-freq if the current frequency is less than the current utilization. Change to just request "right" frequency instead of max. BUG: 29871410 Change-Id: I6fe65b14413da44b1520ba116f72320083eb92f8 --- kernel/sched/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f071ed51ff361..7a0372796fa8a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2809,7 +2809,7 @@ static unsigned long sum_capacity_reqs(unsigned long cfs_cap, static void sched_freq_tick(int cpu) { struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr; + unsigned long capacity_orig, capacity_curr, capacity_sum; if (!sched_freq()) return; @@ -2822,12 +2822,15 @@ static void sched_freq_tick(int cpu) /* * To make free room for a task that is building up its "real" * utilization and to harm its performance the least, request - * a jump to max OPP as soon as the margin of free capacity is - * impacted (specified by capacity_margin). + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) - set_cfs_cpu_capacity(cpu, true, capacity_max); + capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); + if (capacity_curr < capacity_sum) { + set_cfs_cpu_capacity(cpu, true, capacity_sum); + } } #else static inline void sched_freq_tick(int cpu) { } From 4e1e849c2eb37835c2df0e791ccd6c584de662eb Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:00:41 +0100 Subject: [PATCH 245/420] FIXUP: sched: fix set_cfs_cpu_capacity when WALT is in use The CPU utilization reported when WALT is in use already tracks the contributions due to RT and DL workloads. However, SchedFreq exposes different capacity update functions, one for each class, and does classes utilization internally at update_cpu_capacity_request() call time. 
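As a concrete illustration of the double count (the numbers are made up): if
the WALT signal for a CPU is 600 and the RT and DL classes contribute 200 of
it, then storing 600 in the cfs slot makes the later aggregation ask for
roughly 600 (cfs, already containing RT/DL) + 200 (rt + dl) = 800, while the
intended request is 600. Storing 600 - 200 = 400 instead lets the aggregation
add the RT and DL contributions back and arrive at the intended 600.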
This patch ensures that when WALT is in use, the: cpu_sched_capacity_reqs::cfs value is tracking just the load generated by SCHED_OTHER tasks. Change-Id: Ibd9c9a10874a1d91f62477034548f7664e57cd6a Signed-off-by: Patrick Bellasi --- kernel/sched/sched.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a114146675d69a..6b35e3be1fffee 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1551,8 +1551,27 @@ void update_cpu_capacity_request(int cpu, bool request); static inline void set_cfs_cpu_capacity(int cpu, bool request, unsigned long capacity) { - if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { - per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + int rtdl = scr->rt + scr->dl; + /* + * WALT tracks the utilization of a CPU considering the load + * generated by all the scheduling classes. + * Since the following call to: + * update_cpu_capacity + * is already adding the RT and DL utilizations let's remove + * these contributions from the WALT signal. + */ + if (capacity > rtdl) + capacity -= rtdl; + else + capacity = 0; + } +#endif + if (scr->cfs != capacity) { + scr->cfs = capacity; update_cpu_capacity_request(cpu, request); } } From 82b9fa65fab1a7f2c0d3c3358179243341bd237a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:09:24 +0100 Subject: [PATCH 246/420] FIXUP: sched: fix SchedFreq integration for both PELT and WALT The current kernel allows to use either PELT or WALT to track CPUs utilizations. One of the main differences between the two approaches is that PELT tracks only utilization of SCHED_OTHER classes while WALT tracks all tasks with a single signal. The current sched_freq_tick does not make this distinction and, when WALT is in use, we end up adding multiple time the contribution related to the RT and DL classes. This patch fixes this issue by: 1. providing two different code paths for PELT and WALT, thus granting that when we switch to PELT we get the original behaviour based on the assumption that class aggregations is done underneath by SchedFreq. 2. avoiding the double accounting of DL and RT workloads, when WALT is in use, by just adding a margin to the original WALT signal when we need to check if the CFS capacity has to be increased. 
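Concretely, the WALT side of the tick check reduces to comparing the
margin-boosted CPU signal against the capacity of the current OPP and, when
it no longer fits, requesting that boosted value. A condensed, illustrative
restatement of the sched_freq_tick_walt() path added below (capacity_margin
is the scheduler's existing headroom constant, roughly a 25% margin over
SCHED_CAPACITY_SCALE):

	unsigned long util = cpu_util(cpu);	/* WALT: one signal, all classes */
	unsigned long boosted = util * capacity_margin / SCHED_CAPACITY_SCALE;

	if (boosted > capacity_curr_of(cpu))
		set_cfs_cpu_capacity(cpu, true, boosted);	/* request a higher OPP */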
Change-Id: I7326fd50e868e97fb5e12351917e9d2969bfdae7 Signed-off-by: Patrick Bellasi --- kernel/sched/core.c | 87 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7a0372796fa8a8..12b5e44bd3690b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2795,28 +2795,31 @@ unsigned long long task_sched_runtime(struct task_struct *p) } #ifdef CONFIG_CPU_FREQ_GOV_SCHED -static unsigned long sum_capacity_reqs(unsigned long cfs_cap, - struct sched_capacity_reqs *scr) + +static inline +unsigned long add_capacity_margin(unsigned long cpu_capacity) { - unsigned long total = cfs_cap + scr->rt; + cpu_capacity = cpu_capacity * capacity_margin; + cpu_capacity /= SCHED_CAPACITY_SCALE; + return cpu_capacity; +} - total = total * capacity_margin; - total /= SCHED_CAPACITY_SCALE; - total += scr->dl; - return total; +static inline +unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = add_capacity_margin(cfs_cap + scr->rt); + return total += scr->dl; } -static void sched_freq_tick(int cpu) +static void sched_freq_tick_pelt(int cpu) { + unsigned long cpu_utilization = capacity_max; + unsigned long capacity_curr = capacity_curr_of(cpu); struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr, capacity_sum; - if (!sched_freq()) - return; - - capacity_orig = capacity_orig_of(cpu); - capacity_curr = capacity_curr_of(cpu); - if (capacity_curr == capacity_orig) + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr) return; /* @@ -2825,16 +2828,58 @@ static void sched_freq_tick(int cpu) * a jump to a higher OPP as soon as the margin of free capacity * is impacted (specified by capacity_margin). */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); +} + +#ifdef CONFIG_SCHED_WALT +static void sched_freq_tick_walt(int cpu) +{ + unsigned long cpu_utilization = cpu_util(cpu); + unsigned long capacity_curr = capacity_curr_of(cpu); + + if (walt_disabled || !sysctl_sched_use_walt_cpu_util) + return sched_freq_tick_pelt(cpu); + + /* + * Add a margin to the WALT utilization. + * NOTE: WALT tracks a single CPU signal for all the scheduling + * classes, thus this margin is going to be added to the DL class as + * well, which is something we do not do in sched_freq_tick_pelt case. + */ + cpu_utilization = add_capacity_margin(cpu_utilization); + if (cpu_utilization <= capacity_curr) + return; + + /* + * It is likely that the load is growing so we + * keep the added margin in our request as an + * extra boost. + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); - scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); - if (capacity_curr < capacity_sum) { - set_cfs_cpu_capacity(cpu, true, capacity_sum); - } +} +#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu) +#else +#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu) +#endif /* CONFIG_SCHED_WALT */ + +static void sched_freq_tick(int cpu) +{ + unsigned long capacity_orig, capacity_curr; + + if (!sched_freq()) + return; + + capacity_orig = capacity_orig_of(cpu); + capacity_curr = capacity_curr_of(cpu); + if (capacity_curr == capacity_orig) + return; + + _sched_freq_tick(cpu); } #else static inline void sched_freq_tick(int cpu) { } -#endif +#endif /* CONFIG_CPU_FREQ_GOV_SCHED */ /* * This function gets called by the timer code, with HZ frequency. 
From 6d1a1e78e5acb2a61b69cc5cad046bb4449d010d Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 16 Jun 2016 16:33:54 -0700 Subject: [PATCH 247/420] FIXUP: sched/fair: Fix hang during suspend in sched_group_energy BUG: 29353986 Change-Id: I0d0d8d5c107a2e0bd219819e036091106bb40e11 --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8734f11d3ce89e..879266b897bf03 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4806,6 +4806,7 @@ static int sched_group_energy(struct energy_env *eenv) } while (sg = sg->next, sg != sd->groups); } next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); continue; } From ab1b90f03a063f4ef9899835e9d04d7deeb5f28f Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 4 Jul 2016 15:04:45 +0100 Subject: [PATCH 248/420] FIXUP: sched: Fix double-release of spinlock in move_queued_task BUG: 29519455 Change-Id: I4d1c27a1b4bcbba03d4b175d170cfe1701a90ffd --- kernel/sched/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6b35e3be1fffee..6a06df0bff9d38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1692,7 +1692,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); + if (this_rq != busiest) + raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } From 2b89ec8f2a0eedebd8b7a965a56a77abc3116894 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Wed, 13 Jul 2016 17:45:49 -0700 Subject: [PATCH 249/420] sched/rt: Avoid moving rt task if destination CPU does not run low priority task bug: 29512132 bug: 30115868 Change-Id: Id18083402dfe2324f86c34a4e5d32fb7e0d11d04 --- kernel/sched/rt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b33d7bad868610..14a5c006bea77c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1344,7 +1344,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) curr->prio <= p->prio)) { int target = find_lowest_rq(p); - if (target != -1) + /* + * Possible race. Don't bother moving it if the + * destination CPU is not running a lower priority task. + */ + if (target != -1 && + p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } rcu_read_unlock(); From 87e43eb0ec8bda085729174a1e795b8f6fd486f2 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 25 Jul 2016 15:13:58 +0100 Subject: [PATCH 250/420] arch_timer: add error handling when the MPM global timer is cleared Bug: 29000863 Signed-off-by: albert.zl_huang Change-Id: I2b5a28b0a9edb31bdaa1ca2310397dd2f36f6c23 Updated to use arch_timer_read_counter() as arch_counter_get_cntvct doesn't exist in this kernel. 
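Before the hunk itself, a self-contained analogue of the guard being added; the names and the assert are stand-ins for the rq window bookkeeping and the BUG_ON used by the real code. A negative window delta is tolerated only when the hardware counter genuinely restarted from zero; anything else still indicates an accounting bug.

#include <assert.h>
#include <stdint.h>

static uint64_t window_start;   /* stand-in for rq->window_start */

static uint64_t window_delta(uint64_t wallclock, uint64_t raw_counter)
{
        if (wallclock < window_start) {
                /* Only acceptable when the source counter was cleared to 0. */
                assert(raw_counter == 0);
                return 0;
        }
        return wallclock - window_start;
}

int main(void)
{
        window_start = 100;
        return (int)window_delta(0, 0);   /* cleared-counter case: yields 0 */
}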
Signed-off-by: Chris Redpath --- kernel/sched/walt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index b9ae8d5c4393ad..d9d09914ce30f3 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -185,7 +185,14 @@ update_window_start(struct rq *rq, u64 wallclock) int nr_windows; delta = wallclock - rq->window_start; - BUG_ON(delta < 0); + /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ + if (delta < 0) { + if (arch_timer_read_counter() == 0) + delta = 0; + else + BUG_ON(1); + } + if (delta < walt_ravg_window) return; From 84b0bf04a32b31b283e2559fb43055600219918e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 13 Jul 2016 16:13:47 -0700 Subject: [PATCH 251/420] sched: use util instead of capacity to select busy cpu If cpus are busy, the cpu selection algorithm was favoring cpus with lower capacity. This can result in uneven packing since there will be a bias toward the same cpu until there is a capacity change. Instead use the utilization so there is immediate feedback as tasks are assigned BUG: 30115868 Change-Id: I0ac7ae3ab5d8f2f5a5838c29bb6da2c3e8ef44e8 --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 879266b897bf03..5faf2cead8eaf8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5419,7 +5419,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted) { int iter_cpu; int target_cpu = -1; - int target_capacity = 0; + int target_util = 0; int backup_capacity = 0; int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; @@ -5475,10 +5475,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if (target_capacity == 0 || - target_capacity > cur_capacity) { + if (target_util == 0 || + target_util > new_util) { target_cpu = i; - target_capacity = cur_capacity; + target_util = new_util; } } else if (!boosted) { if (best_idle_cpu < 0 || From dc9fee155b4c60af46be94e058ef1aa857470b59 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 14 Jul 2016 13:09:03 -0700 Subject: [PATCH 252/420] sched/tune: Introducing a new schedtune attribute prefer_idle Hint to enable biasing of tasks towards idle cpus, even when a given task is negatively boosted. The mechanism allows upto 20% reduction in camera power without hurting performance. bug: 28312446 Change-Id: I97ea5671aa1e6bcb165408b41e17bc82e41c2c9e --- kernel/sched/fair.c | 23 +++++++++++++---------- kernel/sched/tune.c | 42 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 2 ++ 3 files changed, 57 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5faf2cead8eaf8..c0ee9075acaec5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5415,7 +5415,7 @@ static int select_idle_sibling(struct task_struct *p, int target) return target; } -static inline int find_best_target(struct task_struct *p, bool boosted) +static inline int find_best_target(struct task_struct *p, bool prefer_idle) { int iter_cpu; int target_cpu = -1; @@ -5433,9 +5433,9 @@ static inline int find_best_target(struct task_struct *p, bool boosted) int idle_idx; /* - * favor higher cpus for boosted tasks + * favor higher cpus for tasks that prefer idle cores */ - int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + int i = prefer_idle ? 
NR_CPUS-iter_cpu-1 : iter_cpu; if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) continue; @@ -5460,10 +5460,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) continue; #endif /* - * For boosted tasks we favor idle cpus unconditionally to + * Unconditionally favoring tasks that prefer idle cpus to * improve latency. */ - if (idle_cpu(i) && boosted) { + if (idle_cpu(i) && prefer_idle) { if (best_idle_cpu < 0) best_idle_cpu = i; continue; @@ -5480,7 +5480,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted) target_cpu = i; target_util = new_util; } - } else if (!boosted) { + } else if (!prefer_idle) { if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { @@ -5495,7 +5495,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted) } } - if (boosted && best_idle_cpu >= 0) + if (prefer_idle && best_idle_cpu >= 0) target_cpu = best_idle_cpu; else if (target_cpu < 0) target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; @@ -5587,14 +5587,17 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) */ #ifdef CONFIG_CGROUP_SCHEDTUNE bool boosted = schedtune_task_boost(p) > 0; + bool prefer_idle = schedtune_prefer_idle(p) > 0; #else bool boosted = 0; + bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted); - if (tmp_target >= 0) + int tmp_target = find_best_target(p, boosted || prefer_idle); + if (tmp_target >= 0) { target_cpu = tmp_target; - if (boosted && idle_cpu(target_cpu)) + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) return target_cpu; + } } if (target_cpu != task_cpu(p)) { diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 1eda155f7497d2..7f054bc2a2479e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -125,6 +125,10 @@ struct schedtune { /* Performance Constraint (C) region threshold params */ int perf_constrain_idx; + + /* Hint to bias scheduling of tasks on that SchedTune CGroup + * towards idle CPUs */ + int prefer_idle; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -156,6 +160,7 @@ root_schedtune = { .boost = 0, .perf_boost_idx = 0, .perf_constrain_idx = 0, + .prefer_idle = 0, }; int @@ -537,6 +542,38 @@ int schedtune_task_boost(struct task_struct *p) return task_boost; } +int schedtune_prefer_idle(struct task_struct *p) +{ + struct schedtune *st; + int prefer_idle; + + /* Get prefer_idle value */ + rcu_read_lock(); + st = task_schedtune(p); + prefer_idle = st->prefer_idle; + rcu_read_unlock(); + + return prefer_idle; +} + +static u64 +prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->prefer_idle; +} + +static int +prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 prefer_idle) +{ + struct schedtune *st = css_st(css); + st->prefer_idle = prefer_idle; + + return 0; +} + static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -592,6 +629,11 @@ static struct cftype files[] = { .read_s64 = boost_read, .write_s64 = boost_write, }, + { + .name = "prefer_idle", + .read_u64 = prefer_idle_read, + .write_u64 = prefer_idle_write, + }, { } /* terminate */ }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index cac1e9a0d26184..a00fb326cb08de 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +int 
schedtune_prefer_idle(struct task_struct *tsk); + void schedtune_exit_task(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); From 100f5e0c8f265dfc2845a19227ecb7449892fb51 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 10 Feb 2016 09:24:36 +0000 Subject: [PATCH 253/420] DEBUG: sched: add tracepoint for RD overutilized Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 20 ++++++++++++++++++++ kernel/sched/fair.c | 17 +++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 36477b7c1ecc63..6505343ba812da 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -938,6 +938,26 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +/* + * Tracepoint for system overutilized flag + */ +TRACE_EVENT(sched_overutilized, + + TP_PROTO(bool overutilized), + + TP_ARGS(overutilized), + + TP_STRUCT__entry( + __field( bool, overutilized ) + ), + + TP_fast_assign( + __entry->overutilized = overutilized; + ), + + TP_printk("overutilized=%d", + __entry->overutilized ? 1 : 0) +); #ifdef CONFIG_SCHED_WALT struct rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c0ee9075acaec5..82fd5d5098a6ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4073,8 +4073,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && - cpu_overutilized(rq->cpu)) + cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; + trace_sched_overutilized(true); + } /* * We want to potentially trigger a freq switch @@ -7351,12 +7353,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd env->dst_rq->rd->overload = overload; /* Update over-utilization (tipping point, U >= 0) indicator */ - if (env->dst_rq->rd->overutilized != overutilized) + if (env->dst_rq->rd->overutilized != overutilized) { env->dst_rq->rd->overutilized = overutilized; + trace_sched_overutilized(overutilized); + } } else { - if (!env->dst_rq->rd->overutilized && overutilized) + if (!env->dst_rq->rd->overutilized && overutilized) { env->dst_rq->rd->overutilized = true; + trace_sched_overutilized(true); + } } + } /** @@ -8797,8 +8804,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); #ifdef CONFIG_SMP - if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { rq->rd->overutilized = true; + trace_sched_overutilized(true); + } rq->misfit_task = !task_fits_max(curr, rq->cpu); #endif From b8351ea7730f0425efd5c35b560cd60248cbf119 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 16:09:03 +0100 Subject: [PATCH 254/420] FIXUP: sched/tune: do initialization as a postcore_initicall SchedTune needs to walk the scheduling domains to compute the energy normalization constants used for PE space filtering. To build such constants we need the energy model data for each CPU in the system. However, by walking the SDs as a late initcall stage, the userspace has been already initialized and it could happen that some CPUs are hotplugged out. For example, this could happen if a user-space thermal manager daemon detects that CPUs are to much hot during the boot process. To avoid such a race condition we can anticipate the SchedTune initialization code to be a postcore_initicall. 
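For context, the standard initcall levels run in a fixed order during boot and all of them complete before userspace is started, which is what makes the earlier level safe apart from the kernel-initiated early hotplug case mentioned above. A short reference sketch (the ordering is from include/linux/init.h; the call itself matches the hunk below):

/*
 * core_initcall -> postcore_initcall -> arch_initcall -> subsys_initcall
 *   -> fs_initcall -> device_initcall -> late_initcall
 *
 * Moving schedtune_init() from late_initcall to postcore_initcall runs the
 * sched-domain walk long before init is exec'd, so userspace cannot have
 * hotplugged any CPU out yet.
 */
postcore_initcall(schedtune_init);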
This allows to keep the SchedTune initialization code as simple as an initcall while still safely relaying on SDs provided data. Such calls are executed before user-space is initialized and thus, apart from the case of unlucky early-init kernel space generated hotplugs, this solution should be safe enough to get all the data we need. Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 7f054bc2a2479e..e795c828a72818 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -963,5 +963,5 @@ schedtune_init(void) rcu_read_unlock(); return -EINVAL; } -late_initcall(schedtune_init); +postcore_initcall(schedtune_init); From 16b323390bced7ba00f5c99d33da473430905984 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Tue, 2 Aug 2016 14:05:46 -0700 Subject: [PATCH 255/420] sched/fair: Picking cpus with low OPPs for tasks that prefer idle CPUs When idle cpus cannot be found for Top-app/FG tasks, the cpu selection algorithm picks a cpu with lowest OPP amongst the busy cpus as a second choice. Mitigates the "runnable" time for ui and render threads. bug: 30481949 bug: 30342017 bug: 30508678 Change-Id: I5a97e31d33284895c0fa6f6942102713ee576d77 --- kernel/sched/fair.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82fd5d5098a6ea..ef65a8f9324174 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5477,10 +5477,22 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if (target_util == 0 || - target_util > new_util) { - target_cpu = i; - target_util = new_util; + if(prefer_idle) { + // Find a target cpu with lowest + // utilization. + if (target_util == 0 || + target_util < new_util) { + target_cpu = i; + target_util = new_util; + } + } else { + // Find a target cpu with highest + // utilization. + if (target_util == 0 || + target_util > new_util) { + target_cpu = i; + target_util = new_util; + } } } else if (!prefer_idle) { if (best_idle_cpu < 0 || @@ -5492,6 +5504,7 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) } } else if (backup_capacity == 0 || backup_capacity > cur_capacity) { + // Find a backup cpu with least capacity. backup_capacity = cur_capacity; backup_cpu = i; } From ea6b9718c5b074c1e18af18f3feb1a1e5bfc82ef Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 4 Aug 2016 12:20:04 +0100 Subject: [PATCH 256/420] sched/cpufreq_sched: fix thermal capping events cpufreq_sched_limits (called when CPUFREQ_GOV_LIMITS event happens) bails out if policy->rwsem is already locked. However, that rwsem is always guaranteed to be locked when we get here after a thermal throttling event happens: th_throttling -> cpufreq_update_policy() ... down_write(&policy->rwsem); ... cpufreq_set_policy() -> ... __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); -> cpufreq_sched_limits() ... if (!down_write_trylock(&policy->rwsem)) return; <-- BAIL OUT! So, we don't currently react immediately to thermal capping event (even if reaction is still quick in practice, ~1ms, as lots of events are likely to trigger a frequency selection on a high loaded system). Fix this bug by removing the bail out condition. While we are at it we also slightly change handling of the new limits by clamping the last requested_freq between policy's max and min. 
Doing so gives us the oppurtunity to correctly restore the last requested frequency as soon as a thermal unthrottling event happens. bug: 30481949 Change-Id: I3c13e818f238c1ffa66b34e419e8b87314b57427 Suggested-by: Javi Merino Signed-off-by: Juri Lelli Signed-off-by: Srinath Sridharan --- kernel/sched/cpufreq_sched.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index e5ca1e3fba8e74..8fe35577a131d9 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -58,7 +58,6 @@ struct gov_data { struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; - int max; }; static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, @@ -195,7 +194,7 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; if (cpufreq_frequency_table_target(policy, policy->freq_table, freq_new, CPUFREQ_RELATION_L, &index_new)) @@ -291,7 +290,6 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) __func__, gd->up_throttle_nsec); policy->governor_data = gd; - gd->max = policy->max; rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); if (rc) { @@ -355,28 +353,17 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy) static void cpufreq_sched_limits(struct cpufreq_policy *policy) { - struct gov_data *gd; + unsigned int clamp_freq; + struct gov_data *gd = policy->governor_data;; pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", policy->cpu, policy->min, policy->max, policy->cur); - if (!down_write_trylock(&policy->rwsem)) - return; - /* - * Need to keep track of highest max frequency for - * capacity calculations - */ - gd = policy->governor_data; - if (gd->max < policy->max) - gd->max = policy->max; - - if (policy->max < policy->cur) - __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); - else if (policy->min > policy->cur) - __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + clamp_freq = clamp(gd->requested_freq, policy->min, policy->max); - up_write(&policy->rwsem); + if (policy->cur != clamp_freq) + __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L); } static int cpufreq_sched_stop(struct cpufreq_policy *policy) From 6545c09e1476dd737f0701755e58d18d1e4b7f8e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:13 +0100 Subject: [PATCH 257/420] UPSTREAM: sched: Add SD_PREFER_SIBLING for SMT level Add the SD_PREFER_SIBLING flag for SMT level in order to ensure that the scheduler will place at least one task per core. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Preeti U. 
Murthy Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-11-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (am from https://patchwork.kernel.org/patch/5901561) Signed-off-by: Juri Lelli BUG=chromium:48849 TEST=build and boot tested Change-Id: I9f76da5e5bad337794ee8b84ccaa283904a12233 Reviewed-on: https://chromium-review.googlesource.com/271864 Reviewed-by: Ricky Liang Tested-by: Ricky Liang Commit-Queue: Ricky Liang Trybot-Ready: Ricky Liang --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 12b5e44bd3690b..87ac2a5b4cefeb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6543,6 +6543,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) */ if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ From 224dacaa650b19d59d2e0fbd3b61b11c0b7cac06 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 Jul 2015 06:11:51 +0800 Subject: [PATCH 258/420] UPSTREAM: sched/fair: Avoid pulling all tasks in idle balancing In idle balancing where a CPU going idle pulls tasks from another CPU, a livelock may happen if the CPU pulls all tasks from another, makes it idle, and this iterates. So just avoid this. Reported-by: Rabin Vincent Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Linus Torvalds Cc: Mike Galbraith Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150705221151.GF5197@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 985d3a4c11cd28251bcc7925aa2d7a9038910384) Signed-off-by: Ricky Liang BUG=chrome-os-partner:45410 TEST=Boot kernel on Oak. Run tests for days and doesn't see kernel lock-ups. Change-Id: I4e39dd14189fbf27afcc64de255f7d1b7b822b34 Reviewed-on: https://chromium-review.googlesource.com/309510 Commit-Ready: Ricky Liang Tested-by: Ricky Liang Reviewed-by: Daniel Kurtz --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef65a8f9324174..53551ce891907a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6548,6 +6548,13 @@ static int detach_tasks(struct lb_env *env) return 0; while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, + * which could at worst lead to a livelock crash. + */ + if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + break; + p = list_first_entry(tasks, struct task_struct, se.group_node); env->loop++; From dd3a12df2d6d58a56635d001a13c2f2403637df8 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 13 May 2016 11:54:04 +0100 Subject: [PATCH 259/420] sched/fair: call OPP update when going idle after migration When a task leaves a rq because it is migrated away it carries its utilization with him. In this case and OPP update on the src rq might be needed. The corresponding update at dst rq will happen at enqueue time. Change-Id: I22754a43760fc8d22a488fe15044af93787ea7a8 sched/fair: Fix uninitialised variable in idle_balance compiler warned, looks legit. 
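A minimal stub-based sketch of the idea in the change that follows; the helpers below are stand-ins for the scheduler internals (removed_util_avg, the balance pass, the OPP kick) and only show the control flow:

#include <stdbool.h>
#include <stdio.h>

static long removed_util = 128;                 /* stand-in for cfs_rq->removed_util_avg */

static bool pull_tasks(void)                    /* stand-in for the idle-balance pull */
{
        return false;
}

static void update_capacity_of(int cpu)         /* stand-in for the OPP re-evaluation */
{
        printf("re-evaluate OPP for cpu%d\n", cpu);
}

static void idle_balance_sketch(int this_cpu)
{
        long removed = removed_util;            /* snapshot before averages are folded */
        bool pulled = pull_tasks();

        /* Nothing pulled, but utilization left this CPU: good time to drop the OPP. */
        if (!pulled && removed)
                update_capacity_of(this_cpu);
}

int main(void)
{
        idle_balance_sketch(0);
        return 0;
}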
Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 53551ce891907a..838920847e0bf1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8177,6 +8177,7 @@ static int idle_balance(struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; u64 curr_cost = 0; + long removed_util=0; idle_enter_fair(this_rq); @@ -8203,6 +8204,17 @@ static int idle_balance(struct rq *this_rq) */ raw_spin_unlock(&this_rq->lock); + /* + * If removed_util_avg is !0 we most probably migrated some task away + * from this_cpu. In this case we might be willing to trigger an OPP + * update, but we want to do so if we don't find anybody else to pull + * here (we will trigger an OPP update with the pulled task's enqueue + * anyway). + * + * Record removed_util before calling update_blocked_averages, and use + * it below (before returning) to see if an OPP update is required. + */ + removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg); update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { @@ -8267,6 +8279,12 @@ static int idle_balance(struct rq *this_rq) if (pulled_task) { idle_exit_fair(this_rq); this_rq->idle_stamp = 0; + } else if (removed_util) { + /* + * No task pulled and someone has been migrated away. + * Good case to trigger an OPP update. + */ + update_capacity_of(this_cpu); } return pulled_task; From dc76226d9fc0e07c206596bdea4b7614bbe777c9 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Thu, 26 Mar 2015 18:32:44 +0530 Subject: [PATCH 260/420] UPSTREAM: sched: Improve load balancing in the presence of idle CPUs When a CPU is kicked to do nohz idle balancing, it wakes up to do load balancing on itself, followed by load balancing on behalf of idle CPUs. But it may end up with load after the load balancing attempt on itself. This aborts nohz idle balancing. As a result several idle CPUs are left without tasks till such a time that an ILB CPU finds it unfavorable to pull tasks upon itself. This delays spreading of load across idle CPUs and worse, clutters only a few CPUs with tasks. The effect of the above problem was observed on an SMT8 POWER server with 2 levels of numa domains. Busy loops equal to number of cores were spawned. Since load balancing on fork/exec is discouraged across numa domains, all busy loops would start on one of the numa domains. However it was expected that eventually one busy loop would run per core across all domains due to nohz idle load balancing. But it was observed that it took as long as 10 seconds to spread the load across numa domains. Further investigation showed that this was a consequence of the following: 1. An ILB CPU was chosen from the first numa domain to trigger nohz idle load balancing [Given the experiment, upto 6 CPUs per core could be potentially idle in this domain.] 2. However the ILB CPU would call load_balance() on itself before initiating nohz idle load balancing. 3. Given cores are SMT8, the ILB CPU had enough opportunities to pull tasks from its sibling cores to even out load. 4. Now that the ILB CPU was no longer idle, it would abort nohz idle load balancing As a result the opportunities to spread load across numa domains were lost until such a time that the cores within the first numa domain had equal number of tasks among themselves. 
This is a pretty bad scenario, since the cores within the first numa domain would have as many as 4 tasks each, while cores in the neighbouring numa domains would all remain idle. Fix this, by checking if a CPU was woken up to do nohz idle load balancing, before it does load balancing upon itself. This way we allow idle CPUs across the system to do load balancing which results in quicker spread of load, instead of performing load balancing within the local sched domain hierarchy of the ILB CPU alone under circumstances such as above. Signed-off-by: Preeti U Murthy Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jason Low Cc: benh@kernel.crashing.org Cc: daniel.lezcano@linaro.org Cc: efault@gmx.de Cc: iamjoonsoo.kim@lge.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: riel@redhat.com Cc: srikar@linux.vnet.ibm.com Cc: svaidy@linux.vnet.ibm.com Cc: tim.c.chen@linux.intel.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/20150326130014.21532.17158.stgit@preeti.in.ibm.com Signed-off-by: Ingo Molnar (am from https://patchwork.kernel.org/patch/6098991) Signed-off-by: Juri Lelli BUG=chromium:48849 TEST=build and boot tested Change-Id: I7719f1bf32926a2c1925f070452e36549f95e5b6 Reviewed-on: https://chromium-review.googlesource.com/271867 Reviewed-by: Ricky Liang Tested-by: Ricky Liang Commit-Queue: Ricky Liang Trybot-Ready: Ricky Liang --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 838920847e0bf1..95bba5310763b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8781,14 +8781,16 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_rq, idle); - /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are - * stopped. + * stopped. Do nohz_idle_balance *before* rebalance_domains to + * give the idle cpus a chance to load balance. Else we may + * load balance only within the local sched_domain hierarchy + * and abort nohz_idle_balance altogether if we pull some load. */ nohz_idle_balance(this_rq, idle); + rebalance_domains(this_rq, idle); } /* From 4681963c5dd89807e99c91cd224a472a02a58626 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Feb 2015 23:33:15 +0100 Subject: [PATCH 261/420] UPSTREAM: PM / sleep: Re-implement suspend-to-idle handling In preparation for adding support for quiescing timers in the final stage of suspend-to-idle transitions, rework the freeze_enter() function making the system wait on a wakeup event, the freeze_wake() function terminating the suspend-to-idle loop and the mechanism by which deep idle states are entered during suspend-to-idle. First of all, introduce a simple state machine for suspend-to-idle and make the code in question use it. Second, prevent freeze_enter() from losing wakeup events due to race conditions and ensure that the number of online CPUs won't change while it is being executed. In addition to that, make it force all of the CPUs re-enter the idle loop in case they are in idle states already (so they can enter deeper idle states if possible). Next, drop cpuidle_use_deepest_state() and replace use_deepest_state checks in cpuidle_select() and cpuidle_reflect() with a single suspend-to-idle state check in cpuidle_idle_call(). 
Finally, introduce cpuidle_enter_freeze() that will simply find the deepest idle state available to the given CPU and enter it using cpuidle_enter(). Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Link: git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm br: linux-next (cherry picked from commit 3810631332465d967ba5e27ea2c7dff2c9afac6c) BUG=None TEST=1) Change power state of system to "freeze" 2) Use echo freeze > /sys/power/state 3) Check whether system wakes up on any interrupt(set rtc timer) Change-Id: I895e0649549abdff8d618fb0967f32d550fd2b39 Signed-off-by: Jay Patel Reviewed-on: https://chromium-review.googlesource.com/282852 Reviewed-by: Eric Caruso Tested-by: Prathyushi Nangia --- drivers/cpuidle/cpuidle.c | 49 +++++++++++++++++++++------------------ include/linux/cpuidle.h | 4 ++-- include/linux/suspend.h | 16 +++++++++++++ kernel/power/suspend.c | 43 ++++++++++++++++++++++++++++------ kernel/sched/idle.c | 16 +++++++++++++ 5 files changed, 96 insertions(+), 32 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index f2d07a03330a9e..42c2266738a528 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "cpuidle.h" @@ -32,7 +33,6 @@ LIST_HEAD(cpuidle_detected_devices); static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; -static bool use_deepest_state __read_mostly; int cpuidle_disabled(void) { @@ -66,24 +66,9 @@ int cpuidle_play_dead(void) } /** - * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode. - * @enable: Whether enable or disable the feature. - * - * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and - * always use the state with the greatest exit latency (out of the states that - * are not disabled). - * - * This function can only be called after cpuidle_pause() to avoid races. - */ -void cpuidle_use_deepest_state(bool enable) -{ - use_deepest_state = enable; -} - -/** - * cpuidle_find_deepest_state - Find the state of the greatest exit latency. - * @drv: cpuidle driver for a given CPU. - * @dev: cpuidle device for a given CPU. + * cpuidle_find_deepest_state - Find deepest state meeting specific conditions. + * @drv: cpuidle driver for the given CPU. + * @dev: cpuidle device for the given CPU. */ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -104,6 +89,27 @@ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return ret; } +/** + * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle. + * + * Find the deepest state available and enter it. + */ +void cpuidle_enter_freeze(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int index; + + index = cpuidle_find_deepest_state(drv, dev); + if (index >= 0) + cpuidle_enter(drv, dev, index); + else + arch_cpu_idle(); + + /* Interrupts are enabled again here. 
*/ + local_irq_disable(); +} + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -172,9 +178,6 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (!drv || !dev || !dev->enabled) return -EBUSY; - if (unlikely(use_deepest_state)) - return cpuidle_find_deepest_state(drv, dev); - return cpuidle_curr_governor->select(drv, dev); } @@ -206,7 +209,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ void cpuidle_reflect(struct cpuidle_device *dev, int index) { - if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state)) + if (cpuidle_curr_governor->reflect) cpuidle_curr_governor->reflect(dev, index); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 272170116754a8..2d988de073ff44 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -144,7 +144,7 @@ extern void cpuidle_resume(void); extern int cpuidle_enable_device(struct cpuidle_device *dev); extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); -extern void cpuidle_use_deepest_state(bool enable); +extern void cpuidle_enter_freeze(void); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else @@ -177,7 +177,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } -static inline void cpuidle_use_deepest_state(bool enable) {} +static inline void cpuidle_enter_freeze(void) { } static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 16f208f40eeff3..57b77c1cf76ddf 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -201,6 +201,21 @@ struct platform_freeze_ops { */ extern void suspend_set_ops(const struct platform_suspend_ops *ops); extern int suspend_valid_only_mem(suspend_state_t state); + +/* Suspend-to-idle state machnine. */ +enum freeze_state { + FREEZE_STATE_NONE, /* Not suspended/suspending. */ + FREEZE_STATE_ENTER, /* Enter suspend-to-idle. */ + FREEZE_STATE_WAKE, /* Wake up from suspend-to-idle. 
*/ +}; + +extern enum freeze_state __read_mostly suspend_freeze_state; + +static inline bool idle_should_freeze(void) +{ + return unlikely(suspend_freeze_state == FREEZE_STATE_ENTER); +} + extern void freeze_set_ops(const struct platform_freeze_ops *ops); extern void freeze_wake(void); @@ -228,6 +243,7 @@ extern int pm_suspend(suspend_state_t state); static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } +static inline bool idle_should_freeze(void) { return false; } static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {} static inline void freeze_wake(void) {} #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index a212348cbfbfba..48048acd06ccd4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -39,7 +39,9 @@ const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); -static bool suspend_freeze_wake; + +enum freeze_state __read_mostly suspend_freeze_state; +static DEFINE_SPINLOCK(suspend_freeze_lock); void freeze_set_ops(const struct platform_freeze_ops *ops) { @@ -50,22 +52,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops) static void freeze_begin(void) { - suspend_freeze_wake = false; + suspend_freeze_state = FREEZE_STATE_NONE; } static void freeze_enter(void) { - cpuidle_use_deepest_state(true); + spin_lock_irq(&suspend_freeze_lock); + if (pm_wakeup_pending()) + goto out; + + suspend_freeze_state = FREEZE_STATE_ENTER; + spin_unlock_irq(&suspend_freeze_lock); + + get_online_cpus(); cpuidle_resume(); - wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + + /* Push all the CPUs into the idle loop. */ + wake_up_all_idle_cpus(); + pr_debug("PM: suspend-to-idle\n"); + /* Make the current CPU wait so it can enter the idle loop too. */ + wait_event(suspend_freeze_wait_head, + suspend_freeze_state == FREEZE_STATE_WAKE); + pr_debug("PM: resume from suspend-to-idle\n"); + cpuidle_pause(); - cpuidle_use_deepest_state(false); + put_online_cpus(); + + spin_lock_irq(&suspend_freeze_lock); + + out: + suspend_freeze_state = FREEZE_STATE_NONE; + spin_unlock_irq(&suspend_freeze_lock); } void freeze_wake(void) { - suspend_freeze_wake = true; - wake_up(&suspend_freeze_wait_head); + unsigned long flags; + + spin_lock_irqsave(&suspend_freeze_lock, flags); + if (suspend_freeze_state > FREEZE_STATE_NONE) { + suspend_freeze_state = FREEZE_STATE_WAKE; + wake_up(&suspend_freeze_wait_head); + } + spin_unlock_irqrestore(&suspend_freeze_lock, flags); } EXPORT_SYMBOL_GPL(freeze_wake); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1c8473949b7804..a5ea0df42aa7db 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -113,6 +114,21 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + /* + * Suspend-to-idle ("freeze") is a system state in which all user space + * has been frozen, all I/O devices have been suspended and the only + * activity happens here and in iterrupts (if any). In that case bypass + * the cpuidle governor and go stratight for the deepest idle state + * available. Possibly also suspend the local tick and the entire + * timekeeping to prevent timer interrupts from kicking us out of idle + * until a proper wakeup interrupt happens. 
+ */ + if (idle_should_freeze()) { + cpuidle_enter_freeze(); + local_irq_enable(); + goto exit_idle; + } + /* * Ask the cpuidle framework to choose a convenient idle state. * Fall back to the default arch idle method on errors. From e50bfac8d1fd012a54ca8c0f46ec90ebe3d550a0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 2 Mar 2015 22:25:37 +0100 Subject: [PATCH 262/420] UPSTREAM: cpuidle: Clean up fallback handling in cpuidle_idle_call() Move the fallback code path in cpuidle_idle_call() to the end of the function to avoid jumping to a label in an if () branch. Signed-off-by: Rafael J. Wysocki Link: git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm br: linux-next (cherry picked from commit dfcacc154fb38fdb2c243c3dbbdc1f26a64cedc8) BUG=None TEST=1) Change power state of system to "freeze" 2) Use echo freeze > /sys/power/state 3) Check whether system wakes up on any interrupt(set rtc timer) Change-Id: I4b9af89afd369d302d27ad77672d1256f436b54d Signed-off-by: Jay Patel Reviewed-on: https://chromium-review.googlesource.com/282860 Reviewed-by: Eric Caruso Tested-by: Prathyushi Nangia --- kernel/sched/idle.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index a5ea0df42aa7db..e46ee5959214b2 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -134,20 +134,8 @@ static void cpuidle_idle_call(void) * Fall back to the default arch idle method on errors. */ next_state = cpuidle_select(drv, dev); - if (next_state < 0) { -use_default: - /* - * We can't use the cpuidle framework, let's use the default - * idle routine. - */ - if (current_clr_polling_and_test()) - local_irq_enable(); - else - arch_cpu_idle(); - - goto exit_idle; - } - + if (next_state < 0) + goto use_default; /* * The idle task must be scheduled, it is pointless to @@ -205,6 +193,19 @@ static void cpuidle_idle_call(void) rcu_idle_exit(); start_critical_timings(); + return; + +use_default: + /* + * We can't use the cpuidle framework, let's use the default + * idle routine. + */ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); + + goto exit_idle; } /* From 90e6442ece643d7a7b965aed81eb834e804aacb9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Jan 2016 15:21:40 -0800 Subject: [PATCH 263/420] vmstat: make vmstat_updater deferrable again and shut down on idle Currently the vmstat updater is not deferrable as a result of commit ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update"). This in turn can cause multiple interruptions of the applications because the vmstat updater may run at Make vmstate_update deferrable again and provide a function that folds the differentials when the processor is going to idle mode thus addressing the issue of the above commit in a clean way. Note that the shepherd thread will continue scanning the differentials from another processor and will reenable the vmstat workers if it detects any changes. 
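As a toy illustration of the quiet_vmstat() structure introduced below: keep folding per-cpu differentials until a pass reports nothing left, then the tick can be stopped. The counter handling and work cancellation here are stand-ins, not the mm/vmstat.c code.

#include <stdbool.h>
#include <stdio.h>

static bool refresh_stats(void)         /* stand-in: true while diffs were still folded */
{
        static int remaining = 3;
        return --remaining > 0;
}

static void quiet_stats(void)
{
        do {
                /* the real code also cancels this CPU's vmstat work here */
        } while (refresh_stats());

        printf("differentials folded, safe to enter nohz idle\n");
}

int main(void)
{
        quiet_stats();
        return 0;
}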
Change-Id: Idf256cfacb40b4dc8dbb6795cf06b34e8fec7a06 Fixes: ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update") Signed-off-by: Christoph Lameter Cc: Michal Hocko Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Git-commit: 0eb77e9880321915322d42913c3b53241739c8aa [shashim@codeaurora.org: resolve minor merge conflicts] Signed-off-by: Shiraz Hashim --- include/linux/vmstat.h | 2 ++ kernel/sched/idle.c | 1 + mm/vmstat.c | 73 ++++++++++++++++++++++++++---------------- 3 files changed, 49 insertions(+), 27 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 82e7db7f7100f9..c013b8d8e43407 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -211,6 +211,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); +void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); @@ -272,6 +273,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_cpu_vm_stats(int cpu) { } static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } +static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index e46ee5959214b2..c0f307c4734cfd 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -226,6 +226,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/mm/vmstat.c b/mm/vmstat.c index 1b12d390dc6815..87197762e95236 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -455,7 +455,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -479,33 +479,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; - - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + continue; + + /* + * We never drain zones local to this processor. 
+ */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -1259,7 +1261,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1267,7 +1269,7 @@ static void vmstat_update(struct work_struct *w) */ schedule_delayed_work(this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); - else { + } else { /* * We did not update any counters so the app may be in * a mode where it does not cause counter updates. @@ -1289,6 +1291,23 @@ static void vmstat_update(struct work_struct *w) } } +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + do { + if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + cancel_delayed_work(this_cpu_ptr(&vmstat_work)); + + } while (refresh_cpu_vm_stats(false)); +} + /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1321,7 +1340,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { From 6ee164dc3d100cebf5ebf0598db9ef9be508866e Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 26 Nov 2014 08:44:06 +0800 Subject: [PATCH 264/420] sched: Fix hrtick_start() on UP commit 868933359a3bdda25b562e9d41bce7071edc1b08 upstream. The commit 177ef2a6315e ("sched/deadline: Fix a precision problem in the microseconds range") forgot to change the UP version of hrtick_start(), do so now. Signed-off-by: Wanpeng Li Fixes: 177ef2a6315e ("sched/deadline: Fix a precision problem in the microseconds range") [ Fixed the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1416962647-76792-7-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 87ac2a5b4cefeb..ca2d352d931fe3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -503,6 +503,11 @@ static __init void init_hrtick(void) */ void hrtick_start(struct rq *rq, u64 delay) { + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + delay = max_t(u64, delay, 10000LL); __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); } From 399c6fb4c38d26f598f255b88949b1772badefaf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Feb 2015 11:53:18 +0100 Subject: [PATCH 265/420] sched/autogroup: Fix failure to set cpu.rt_runtime_us commit 1fe89e1b6d270aa0d3452c60d38461ea589594e3 upstream. 
Because task_group() uses a cache of autogroup_task_group(), whose output depends on sched_class, switching classes can generate problems. In particular, when started as fair, the cache points to the autogroup, so when switching to RT the tg_rt_schedulable() test fails for every cpu.rt_{runtime,period}_us change because now the autogroup has tasks and no runtime. Furthermore, going back to the previous semantics of varying task_group() with sched_class has the down-side that the sched_debug output varies as well, even though the task really is in the autogroup. Therefore add an autogroup exception to tg_has_rt_tasks() -- such that both (all) task_group() usages in sched/core now have one. And remove all the remnants of the variable task_group() output. Reported-by: Zefan Li Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Stefan Bader Fixes: 8323f26ce342 ("sched: Fix race in task_group()") Link: http://lkml.kernel.org/r/20150209112237.GR5029@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/auto_group.c | 6 +----- kernel/sched/core.c | 6 ++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 077976d30e8843..a8653c2e295567 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void) * so we don't have to move tasks around upon policy change, * or flail around trying to allocate bandwidth on the fly. * A bandwidth exception in __sched_setscheduler() allows - * the policy change to proceed. Thereafter, task_group() - * returns &root_task_group, so zero bandwidth is required. + * the policy change to proceed. */ free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; @@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) if (tg != &root_task_group) return false; - if (p->sched_class != &fair_sched_class) - return false; - /* * We can only assume the task group can't go away on us if * autogroup_move_group() can see us on ->thread_group list. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca2d352d931fe3..ce82c8bf29bae0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7845,6 +7845,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; + /* + * Autogroups do not have RT tasks; see autogroup_create(). + */ + if (task_group_is_autogroup(tg)) + return 0; + for_each_process_thread(g, p) { if (rt_task(p) && task_group(p) == tg) return 1; From f945a3c1ddf6d863fb4edc940ed0f3e7949a3a8b Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 22 Mar 2016 14:27:30 -0700 Subject: [PATCH 266/420] BACKPORT: kernel: add kcov code coverage (cherry picked from commit 5c9a8750a6409c63a0f01d51a9024861022f6593) kcov provides code coverage collection for coverage-guided fuzzing (randomized testing). Coverage-guided fuzzing is a testing technique that uses coverage feedback to determine new interesting inputs to a system. A notable user-space example is AFL (http://lcamtuf.coredump.cx/afl/). However, this technique is not widely used for kernel testing due to missing compiler and kernel support. kcov does not aim to collect as much coverage as possible. It aims to collect more or less stable coverage that is function of syscall inputs. 
To achieve this goal it does not collect coverage in soft/hard interrupts and instrumentation of some inherently non-deterministic or non-interesting parts of kernel is disbled (e.g. scheduler, locking). Currently there is a single coverage collection mode (tracing), but the API anticipates additional collection modes. Initially I also implemented a second mode which exposes coverage in a fixed-size hash table of counters (what Quentin used in his original patch). I've dropped the second mode for simplicity. This patch adds the necessary support on kernel side. The complimentary compiler support was added in gcc revision 231296. We've used this support to build syzkaller system call fuzzer, which has found 90 kernel bugs in just 2 months: https://github.com/google/syzkaller/wiki/Found-Bugs We've also found 30+ bugs in our internal systems with syzkaller. Another (yet unexplored) direction where kcov coverage would greatly help is more traditional "blob mutation". For example, mounting a random blob as a filesystem, or receiving a random blob over wire. Why not gcov. Typical fuzzing loop looks as follows: (1) reset coverage, (2) execute a bit of code, (3) collect coverage, repeat. A typical coverage can be just a dozen of basic blocks (e.g. an invalid input). In such context gcov becomes prohibitively expensive as reset/collect coverage steps depend on total number of basic blocks/edges in program (in case of kernel it is about 2M). Cost of kcov depends only on number of executed basic blocks/edges. On top of that, kernel requires per-thread coverage because there are always background threads and unrelated processes that also produce coverage. With inlined gcov instrumentation per-thread coverage is not possible. kcov exposes kernel PCs and control flow to user-space which is insecure. But debugfs should not be mapped as user accessible. Based on a patch by Quentin Casasnovas. [akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode'] [akpm@linux-foundation.org: unbreak allmodconfig] [akpm@linux-foundation.org: follow x86 Makefile layout standards] Signed-off-by: Dmitry Vyukov Reviewed-by: Kees Cook Cc: syzkaller Cc: Vegard Nossum Cc: Catalin Marinas Cc: Tavis Ormandy Cc: Will Deacon Cc: Quentin Casasnovas Cc: Kostya Serebryany Cc: Eric Dumazet Cc: Alexander Potapenko Cc: Kees Cook Cc: Bjorn Helgaas Cc: Sasha Levin Cc: David Drysdale Cc: Ard Biesheuvel Cc: Andrey Ryabinin Cc: Kirill A. Shutemov Cc: Jiri Slaby Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: I2f274ed7783550eed023fc7daeaeaf80c14e5ce7 Conflicts: Makefile arch/x86/Kconfig arch/x86/boot/compressed/Makefile arch/x86/kernel/Makefile mm/Makefile mm/kasan/Makefile scripts/Makefile.lib --- Documentation/kcov.txt | 111 +++++++++++ Makefile | 11 +- arch/x86/Kconfig | 6 + arch/x86/boot/Makefile | 10 + arch/x86/boot/compressed/Makefile | 5 + arch/x86/kernel/Makefile | 10 + arch/x86/kernel/apic/Makefile | 4 + arch/x86/kernel/cpu/Makefile | 4 + arch/x86/lib/Makefile | 3 + arch/x86/mm/Makefile | 3 + arch/x86/realmode/rm/Makefile | 3 + drivers/firmware/efi/libstub/Makefile | 3 + include/linux/kcov.h | 29 +++ include/linux/sched.h | 11 ++ include/uapi/linux/kcov.h | 10 + kernel/Makefile | 12 ++ kernel/exit.c | 2 + kernel/fork.c | 3 + kernel/kcov.c | 273 ++++++++++++++++++++++++++ kernel/locking/Makefile | 3 + kernel/rcu/Makefile | 4 + kernel/sched/Makefile | 4 + lib/Kconfig.debug | 21 ++ lib/Makefile | 12 ++ mm/Makefile | 18 ++ mm/kasan/Makefile | 10 + scripts/Makefile.lib | 22 +++ 27 files changed, 606 insertions(+), 1 deletion(-) create mode 100644 Documentation/kcov.txt create mode 100644 include/linux/kcov.h create mode 100644 include/uapi/linux/kcov.h create mode 100644 kernel/kcov.c create mode 100644 mm/kasan/Makefile diff --git a/Documentation/kcov.txt b/Documentation/kcov.txt new file mode 100644 index 00000000000000..779ff4ab1c1da0 --- /dev/null +++ b/Documentation/kcov.txt @@ -0,0 +1,111 @@ +kcov: code coverage for fuzzing +=============================== + +kcov exposes kernel code coverage information in a form suitable for coverage- +guided fuzzing (randomized testing). Coverage data of a running kernel is +exported via the "kcov" debugfs file. Coverage collection is enabled on a task +basis, and thus it can capture precise coverage of a single system call. + +Note that kcov does not aim to collect as much coverage as possible. It aims +to collect more or less stable coverage that is function of syscall inputs. +To achieve this goal it does not collect coverage in soft/hard interrupts +and instrumentation of some inherently non-deterministic parts of kernel is +disbled (e.g. scheduler, locking). + +Usage: +====== + +Configure kernel with: + + CONFIG_KCOV=y + +CONFIG_KCOV requires gcc built on revision 231296 or later. +Profiling data will only become accessible once debugfs has been mounted: + + mount -t debugfs none /sys/kernel/debug + +The following program demonstrates kcov usage from within a test program: + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) +#define KCOV_ENABLE _IO('c', 100) +#define KCOV_DISABLE _IO('c', 101) +#define COVER_SIZE (64<<10) + +int main(int argc, char **argv) +{ + int fd; + unsigned long *cover, n, i; + + /* A single fd descriptor allows coverage collection on a single + * thread. + */ + fd = open("/sys/kernel/debug/kcov", O_RDWR); + if (fd == -1) + perror("open"), exit(1); + /* Setup trace mode and trace size. */ + if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE)) + perror("ioctl"), exit(1); + /* Mmap buffer shared between kernel- and user-space. */ + cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long), + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if ((void*)cover == MAP_FAILED) + perror("mmap"), exit(1); + /* Enable coverage collection on the current thread. 
*/ + if (ioctl(fd, KCOV_ENABLE, 0)) + perror("ioctl"), exit(1); + /* Reset coverage from the tail of the ioctl() call. */ + __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED); + /* That's the target syscal call. */ + read(-1, NULL, 0); + /* Read number of PCs collected. */ + n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + for (i = 0; i < n; i++) + printf("0x%lx\n", cover[i + 1]); + /* Disable coverage collection for the current thread. After this call + * coverage can be enabled for a different thread. + */ + if (ioctl(fd, KCOV_DISABLE, 0)) + perror("ioctl"), exit(1); + /* Free resources. */ + if (munmap(cover, COVER_SIZE * sizeof(unsigned long))) + perror("munmap"), exit(1); + if (close(fd)) + perror("close"), exit(1); + return 0; +} + +After piping through addr2line output of the program looks as follows: + +SyS_read +fs/read_write.c:562 +__fdget_pos +fs/file.c:774 +__fget_light +fs/file.c:746 +__fget_light +fs/file.c:750 +__fget_light +fs/file.c:760 +__fdget_pos +fs/file.c:784 +SyS_read +fs/read_write.c:562 + +If a program needs to collect coverage from several threads (independently), +it needs to open /sys/kernel/debug/kcov in each thread separately. + +The interface is fine-grained to allow efficient forking of test processes. +That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode, +mmaps coverage buffer and then forks child processes in a loop. Child processes +only need to enable coverage (disable happens automatically on thread end). diff --git a/Makefile b/Makefile index bee6709222c224..51b56f9bf299f2 100644 --- a/Makefile +++ b/Makefile @@ -377,6 +377,7 @@ LDFLAGS_MODULE = CFLAGS_KERNEL = AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage +CFLAGS_KCOV = -fsanitize-coverage=trace-pc # Use USERINCLUDE when you must reference the UAPI directories only. 
@@ -422,7 +423,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS -export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KCOV CFLAGS_KASAN CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL @@ -671,6 +672,14 @@ endif endif KBUILD_CFLAGS += $(stackp-flag) +ifdef CONFIG_KCOV + ifeq ($(call cc-option, $(CFLAGS_KCOV)),) + $(warning Cannot use CONFIG_KCOV: \ + -fsanitize-coverage=trace-pc is not supported by compiler) + CFLAGS_KCOV = + endif +endif + ifeq ($(COMPILER),clang) KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d6bd602c273884..07d6ba9b3828a2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,12 @@ config X86 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV if X86_64 + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_MMIO_FLUSH + select ARCH_HAS_SG_CHAIN + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 5b016e2498f3d3..5d69cdc37cfb7a 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -9,6 +9,16 @@ # Changed by many, many contributors over the years. # +KASAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y + +# Kernel does not boot with kcov instrumentation here. +# One of the problems observed was insertion of __sanitizer_cov_trace_pc() +# callback into middle of per-cpu data enabling code. Thus the callback observed +# inconsistent state and crashed. We are interested mostly in syscall coverage, +# so boot code is not interesting anyway. +KCOV_INSTRUMENT := n + # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 45abc363dd3e44..1fba2a87cb8ca2 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,6 +4,11 @@ # create a compressed vmlinux image from the original vmlinux # +KASAN_SANITIZE := n + +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8f1e77440b2bd6..4170d7da9659c5 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -16,6 +16,16 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif +KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n + +# If instrumentation of this dir is enabled, boot hangs during first second. +# Probably could be more selective here, but note that files related to irqs, +# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to +# non-deterministic coverage. 
+KCOV_INSTRUMENT := n + CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o entry_$(BITS).o diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index dcb5b15401ce80..60e67f91271c97 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,6 +2,10 @@ # Makefile for local APIC drivers and for the IO-APIC code # +# Leads to non-deterministic coverage that is not a function of syscall inputs. +# In particualr, smp_apic_timer_interrupt() is called in random places. +KCOV_INSTRUMENT := n + obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o obj-y += hw_nmi.o diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e27b49d7c922a3..0a237a1b90129e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg CFLAGS_REMOVE_perf_event.o = -pg endif +# If these files are instrumented, boot hangs during the first second. +KCOV_INSTRUMENT_common.o := n +KCOV_INSTRUMENT_perf_event.o := n + # Make sure load_percpu_segment has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index db92793b7e23ed..d6377b7ea7bca1 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,6 +2,9 @@ # Makefile for x86 specific library files. # +# Produces uninteresting flaky coverage. +KCOV_INSTRUMENT_delay.o := n + inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 6a19ad9f370d1c..01d51017cf9d75 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,3 +1,6 @@ +# Kernel does not boot with instrumentation of tlb.c. +KCOV_INSTRUMENT_tlb.o := n + obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o gup.o setup_nx.o diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 7c0d7be176a584..6662cc14224ee1 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -7,6 +7,9 @@ # # +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + always := realmode.bin realmode.relocs wakeup-objs := wakeup_asm.o wakemain.o video-mode.o diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index b14bc2b9fb4df5..35985e9b8b6ce2 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -20,6 +20,9 @@ KBUILD_CFLAGS := $(cflags-y) \ GCOV_PROFILE := n +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + lib-y := efi-stub-helper.o lib-$(CONFIG_EFI_ARMSTUB) += arm-stub.o fdt.o diff --git a/include/linux/kcov.h b/include/linux/kcov.h new file mode 100644 index 00000000000000..2883ac98c280ca --- /dev/null +++ b/include/linux/kcov.h @@ -0,0 +1,29 @@ +#ifndef _LINUX_KCOV_H +#define _LINUX_KCOV_H + +#include + +struct task_struct; + +#ifdef CONFIG_KCOV + +void kcov_task_init(struct task_struct *t); +void kcov_task_exit(struct task_struct *t); + +enum kcov_mode { + /* Coverage collection is not enabled yet. */ + KCOV_MODE_DISABLED = 0, + /* + * Tracing coverage collection mode. + * Covered PCs are collected in a per-task buffer. 
+ */ + KCOV_MODE_TRACE = 1, +}; + +#else + +static inline void kcov_task_init(struct task_struct *t) {} +static inline void kcov_task_exit(struct task_struct *t) {} + +#endif /* CONFIG_KCOV */ +#endif /* _LINUX_KCOV_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c8ab0f711896d5..cd4ea6abf7a233 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -51,6 +51,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1745,6 +1746,16 @@ struct task_struct { /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ +#ifdef CONFIG_KCOV + /* Coverage collection mode enabled for this task (0 if disabled). */ + enum kcov_mode kcov_mode; + /* Size of the kcov_area. */ + unsigned kcov_size; + /* Buffer for coverage collection. */ + void *kcov_area; + /* kcov desciptor wired with this task or NULL. */ + struct kcov *kcov; +#endif #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ unsigned int memcg_kmem_skip_account; struct memcg_oom_info { diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h new file mode 100644 index 00000000000000..574e22ec640dab --- /dev/null +++ b/include/uapi/linux/kcov.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_KCOV_IOCTLS_H +#define _LINUX_KCOV_IOCTLS_H + +#include + +#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) +#define KCOV_ENABLE _IO('c', 100) +#define KCOV_DISABLE _IO('c', 101) + +#endif /* _LINUX_KCOV_IOCTLS_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 17ea6d4a9a247c..fda5903dbcda90 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,6 +17,17 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif +# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() +# in coverage traces. +KCOV_INSTRUMENT_softirq.o := n +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_extable.o := n +# Don't self-instrument. 
+KCOV_INSTRUMENT_kcov.o := n +KASAN_SANITIZE_kcov.o := n + # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -65,6 +76,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o diff --git a/kernel/exit.c b/kernel/exit.c index 582e70f35eeca3..c1fa33522e4bcb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "sched/tune.h" @@ -673,6 +674,7 @@ void do_exit(long code) TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); + kcov_task_exit(tsk); WARN_ON(blk_needs_flush_plug(tsk)); diff --git a/kernel/fork.c b/kernel/fork.c index bf4cfe95f3c50f..49496415c922b3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -370,6 +371,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) account_kernel_stack(ti, 1); + kcov_task_init(tsk); + return tsk; free_ti: diff --git a/kernel/kcov.c b/kernel/kcov.c new file mode 100644 index 00000000000000..3efbee0834a85d --- /dev/null +++ b/kernel/kcov.c @@ -0,0 +1,273 @@ +#define pr_fmt(fmt) "kcov: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * kcov descriptor (one per opened debugfs file). + * State transitions of the descriptor: + * - initial state after open() + * - then there must be a single ioctl(KCOV_INIT_TRACE) call + * - then, mmap() call (several calls are allowed but not useful) + * - then, repeated enable/disable for a task (only one task a time allowed) + */ +struct kcov { + /* + * Reference counter. We keep one for: + * - opened file descriptor + * - task with enabled coverage (we can't unwire it from another task) + */ + atomic_t refcount; + /* The lock protects mode, size, area and t. */ + spinlock_t lock; + enum kcov_mode mode; + /* Size of arena (in long's for KCOV_MODE_TRACE). */ + unsigned size; + /* Coverage buffer shared with user space. */ + void *area; + /* Task for which we collect coverage, or NULL. */ + struct task_struct *t; +}; + +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. + */ +void __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + enum kcov_mode mode; + + t = current; + /* + * We are interested in code coverage as a function of a syscall inputs, + * so we ignore code executed in interrupts. + */ + if (!t || in_interrupt()) + return; + mode = READ_ONCE(t->kcov_mode); + if (mode == KCOV_MODE_TRACE) { + unsigned long *area; + unsigned long pos; + + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + area = t->kcov_area; + /* The first word is number of subsequent PCs. 
*/ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = _RET_IP_; + WRITE_ONCE(area[0], pos); + } + } +} +EXPORT_SYMBOL(__sanitizer_cov_trace_pc); + +static void kcov_get(struct kcov *kcov) +{ + atomic_inc(&kcov->refcount); +} + +static void kcov_put(struct kcov *kcov) +{ + if (atomic_dec_and_test(&kcov->refcount)) { + vfree(kcov->area); + kfree(kcov); + } +} + +void kcov_task_init(struct task_struct *t) +{ + t->kcov_mode = KCOV_MODE_DISABLED; + t->kcov_size = 0; + t->kcov_area = NULL; + t->kcov = NULL; +} + +void kcov_task_exit(struct task_struct *t) +{ + struct kcov *kcov; + + kcov = t->kcov; + if (kcov == NULL) + return; + spin_lock(&kcov->lock); + if (WARN_ON(kcov->t != t)) { + spin_unlock(&kcov->lock); + return; + } + /* Just to not leave dangling references behind. */ + kcov_task_init(t); + kcov->t = NULL; + spin_unlock(&kcov->lock); + kcov_put(kcov); +} + +static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) +{ + int res = 0; + void *area; + struct kcov *kcov = vma->vm_file->private_data; + unsigned long size, off; + struct page *page; + + area = vmalloc_user(vma->vm_end - vma->vm_start); + if (!area) + return -ENOMEM; + + spin_lock(&kcov->lock); + size = kcov->size * sizeof(unsigned long); + if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + vma->vm_end - vma->vm_start != size) { + res = -EINVAL; + goto exit; + } + if (!kcov->area) { + kcov->area = area; + vma->vm_flags |= VM_DONTEXPAND; + spin_unlock(&kcov->lock); + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); + } + return 0; + } +exit: + spin_unlock(&kcov->lock); + vfree(area); + return res; +} + +static int kcov_open(struct inode *inode, struct file *filep) +{ + struct kcov *kcov; + + kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); + if (!kcov) + return -ENOMEM; + atomic_set(&kcov->refcount, 1); + spin_lock_init(&kcov->lock); + filep->private_data = kcov; + return nonseekable_open(inode, filep); +} + +static int kcov_close(struct inode *inode, struct file *filep) +{ + kcov_put(filep->private_data); + return 0; +} + +static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, + unsigned long arg) +{ + struct task_struct *t; + unsigned long size, unused; + + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + */ + if (kcov->mode != KCOV_MODE_DISABLED) + return -EBUSY; + /* + * Size must be at least 2 to hold current position and one PC. + * Later we allocate size * sizeof(unsigned long) memory, + * that must not overflow. + */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->size = size; + kcov->mode = KCOV_MODE_TRACE; + return 0; + case KCOV_ENABLE: + /* + * Enable coverage for the current task. + * At this point user must have been enabled trace mode, + * and mmapped the file. Coverage collection is disabled only + * at task exit or voluntary by KCOV_DISABLE. After that it can + * be enabled for another task. + */ + unused = arg; + if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || + kcov->area == NULL) + return -EINVAL; + if (kcov->t != NULL) + return -EBUSY; + t = current; + /* Cache in task struct for performance. */ + t->kcov_size = kcov->size; + t->kcov_area = kcov->area; + /* See comment in __sanitizer_cov_trace_pc(). 
*/ + barrier(); + WRITE_ONCE(t->kcov_mode, kcov->mode); + t->kcov = kcov; + kcov->t = t; + /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + kcov_get(kcov); + return 0; + case KCOV_DISABLE: + /* Disable coverage for the current task. */ + unused = arg; + if (unused != 0 || current->kcov != kcov) + return -EINVAL; + t = current; + if (WARN_ON(kcov->t != t)) + return -EINVAL; + kcov_task_init(t); + kcov->t = NULL; + kcov_put(kcov); + return 0; + default: + return -ENOTTY; + } +} + +static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kcov *kcov; + int res; + + kcov = filep->private_data; + spin_lock(&kcov->lock); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock(&kcov->lock); + return res; +} + +static const struct file_operations kcov_fops = { + .open = kcov_open, + .unlocked_ioctl = kcov_ioctl, + .mmap = kcov_mmap, + .release = kcov_close, +}; + +static int __init kcov_init(void) +{ + if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + pr_err("failed to create kcov in debugfs\n"); + return -ENOMEM; + } + return 0; +} + +device_initcall(kcov_init); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232bb..dc968422e47721 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,6 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 807ccfbf69b335..8844d222242078 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,3 +1,7 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n + obj-y += update.o srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index fdcdb2235274de..55bb7af371b65e 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = -pg endif +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 95e5931cd05c45..088880ee097591 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -637,6 +637,27 @@ source "lib/Kconfig.kmemcheck" endmenu # "Memory Debugging" +config ARCH_HAS_KCOV + bool + help + KCOV does not have any arch-specific code, but currently it is enabled + only for x86_64. KCOV requires testing on other archs, and most likely + disabling of instrumentation for some early boot code. + +config KCOV + bool "Code coverage for fuzzing" + depends on ARCH_HAS_KCOV + select DEBUG_FS + help + KCOV exposes kernel code coverage information in a form suitable + for coverage-guided fuzzing (randomized testing). + + If RANDOMIZE_BASE is enabled, PC values will not be stable across + different machines and across reboots. If you need stable PC values, + disable RANDOMIZE_BASE. + + For more details, see Documentation/kcov.txt. 
+ config DEBUG_SHIRQ bool "Debug shared IRQ handlers" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile index 0211d2bd5e1755..6ded47f8c3ffbb 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -7,6 +7,18 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) endif +# These files are disabled because they produce lots of non-interesting and/or +# flaky coverage that is not a function of syscall inputs. For example, +# rbtree can be global and individual rotations don't correlate with inputs. +KCOV_INSTRUMENT_string.o := n +KCOV_INSTRUMENT_rbtree.o := n +KCOV_INSTRUMENT_list_debug.o := n +KCOV_INSTRUMENT_debugobjects.o := n +KCOV_INSTRUMENT_dynamic_debug.o := n +# Kernel does not boot if we instrument this file as it uses custom calling +# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS). +KCOV_INSTRUMENT_hweight.o := n + lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ diff --git a/mm/Makefile b/mm/Makefile index 8405eb0023a918..9928fbe3feee59 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -2,6 +2,24 @@ # Makefile for the linux memory manager. # +KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slub.o := n + +# These files are disabled because they produce non-interesting and/or +# flaky coverage that is not a function of syscall inputs. E.g. slab is out of +# free pages, or a task is migrated between nodes. +KCOV_INSTRUMENT_slab_common.o := n +KCOV_INSTRUMENT_slob.o := n +KCOV_INSTRUMENT_slab.o := n +KCOV_INSTRUMENT_slub.o := n +KCOV_INSTRUMENT_page_alloc.o := n +KCOV_INSTRUMENT_debug-pagealloc.o := n +KCOV_INSTRUMENT_kmemleak.o := n +KCOV_INSTRUMENT_kmemcheck.o := n +KCOV_INSTRUMENT_memcontrol.o := n +KCOV_INSTRUMENT_mmzone.o := n +KCOV_INSTRUMENT_vmstat.o := n + mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile new file mode 100644 index 00000000000000..131daadf40e471 --- /dev/null +++ b/mm/kasan/Makefile @@ -0,0 +1,10 @@ +KASAN_SANITIZE := n +UBSAN_SANITIZE_kasan.o := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_kasan.o = -pg +# Function splitter causes unnecessary splits in __asan_load1/__asan_store1 +# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 +CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) + +obj-y := kasan.o report.o kasan_init.o diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 5f7772fcc953fb..221f1a36bf0928 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -119,6 +119,28 @@ _c_flags += $(if $(patsubst n%,, \ $(CFLAGS_GCOV)) endif +# +# Enable address sanitizer flags for kernel except some files or directories +# we don't want to check (depends on variables KASAN_SANITIZE_obj.o, KASAN_SANITIZE) +# +ifeq ($(CONFIG_KASAN),y) +_c_flags += $(if $(patsubst n%,, \ + $(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)$(CONFIG_KASAN_SANITIZE_ALL)), \ + $(CFLAGS_KASAN)) +endif + +ifeq ($(CONFIG_UBSAN),y) +_c_flags += $(if $(patsubst n%,, \ + $(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_SANITIZE)$(CONFIG_UBSAN_SANITIZE_ALL)), \ + $(CFLAGS_UBSAN)) +endif + +ifeq ($(CONFIG_KCOV),y) +_c_flags += $(if $(patsubst n%,, \ + $(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \ + $(CFLAGS_KCOV)) +endif + # If building the kernel in a separate objtree expand all occurrences # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/'). 
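
As a complement to the single-thread example in the new Documentation/kcov.txt above, the fork-based pattern described at the end of that file can be sketched as follows. This is an illustration only, not part of the patch: the parent opens the device, sets the trace size and mmaps the buffer once; each forked child only issues KCOV_ENABLE, runs one test case and exits, at which point coverage is disabled automatically. run_one_test() and the three-iteration loop are placeholders; the ioctl definitions mirror the example above.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/wait.h>

#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
#define KCOV_ENABLE     _IO('c', 100)
#define KCOV_DISABLE    _IO('c', 101)
#define COVER_SIZE      (64<<10)

/* Placeholder test body; any syscall sequence of interest goes here. */
static void run_one_test(void)
{
	read(-1, NULL, 0);
}

int main(void)
{
	unsigned long *cover, n, i;
	int fd, iter;

	fd = open("/sys/kernel/debug/kcov", O_RDWR);
	if (fd == -1)
		perror("open"), exit(1);
	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
		perror("ioctl"), exit(1);
	cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long),
				     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if ((void*)cover == MAP_FAILED)
		perror("mmap"), exit(1);
	for (iter = 0; iter < 3; iter++) {
		/* Reset the PC counter before each test case. */
		__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
		if (fork() == 0) {
			/* Child: enable collection, run the test, exit.
			 * Coverage is disabled automatically on exit. */
			if (ioctl(fd, KCOV_ENABLE, 0))
				perror("ioctl"), exit(1);
			run_one_test();
			exit(0);
		}
		wait(NULL);
		/* Parent: the MAP_SHARED buffer now holds the child's PCs. */
		n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
		for (i = 0; i < n; i++)
			printf("0x%lx\n", cover[i + 1]);
	}
	return 0;
}
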
From a2fabd7afd09d14b55fb8da86893f49478a70fc6 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 3 Jun 2015 14:48:29 +0100 Subject: [PATCH 267/420] arm64: defconfig: add few misc configs for development/debugging/testing Signed-off-by: Sudeep Holla --- arch/arm64/configs/defconfig | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index dd301be89ecccf..f8697c84a57f92 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -46,6 +46,22 @@ CONFIG_CMA=y CONFIG_CMDLINE="console=ttyAMA0" # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_COMPAT=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_DEBUG=y +CONFIG_PM_ADVANCED_DEBUG=y +CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_CPU_IDLE=y +CONFIG_ARM_CPUIDLE=y +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_STAT_DETAILS=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_CPUFREQ_DT=y +CONFIG_ARM_BIG_LITTLE_CPUFREQ=y +CONFIG_ARM_SCPI_CPUFREQ=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -65,6 +81,7 @@ CONFIG_DEVTMPFS_MOUNT=y CONFIG_DMA_CMA=y CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y +CONFIG_SRAM=y # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_LOWLEVEL is not set @@ -98,6 +115,8 @@ CONFIG_SPI_PL022=y CONFIG_GPIO_PL061=y CONFIG_GPIO_XGENE=y # CONFIG_HWMON is not set +CONFIG_SENSORS_ARM_SCPI=y +CONFIG_SENSORS_V2M_JUNO=y CONFIG_REGULATOR=y CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_FB=y @@ -121,11 +140,20 @@ CONFIG_MMC_SDHCI_PLTFM=y CONFIG_MMC_SPI=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y +CONFIG_RTC_DRV_PL030=y +CONFIG_RTC_DRV_PL031=y CONFIG_RTC_DRV_XGENE=y CONFIG_VIRTIO_BALLOON=y CONFIG_VIRTIO_MMIO=y +CONFIG_COMMON_CLK_SCPI=y +CONFIG_ARM_TIMER_SP804=y +CONFIG_MAILBOX=y +CONFIG_ARM_MHU=y # CONFIG_IOMMU_SUPPORT is not set CONFIG_PHY_XGENE=y +CONFIG_ARM_SCPI_PROTOCOL=y +CONFIG_DMI_SYSFS=y +CONFIG_EFI_VARS=y CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set @@ -146,8 +174,13 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y CONFIG_VIRTUALIZATION=y CONFIG_KVM=y +CONFIG_PRINTK_TIME=y +CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_REDUCED=y CONFIG_DEBUG_FS=y +CONFIG_HEADERS_CHECK=y +CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_LOCKUP_DETECTOR=y From cb02910753a91df9933b8157baebd876e2f22894 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 29 Jul 2016 17:50:11 +0100 Subject: [PATCH 268/420] sched/fair: Favor higher cpus only for boosted tasks This CL separates the notion of boost and prefer_idle schedtune attributes in cpu selection. Today only top-app tasks are boosted. The CPU selection is slightly tweaked such that higher order cpus are preferred only for boosted tasks (top-app) and the rest would be skewed towards lower order cpus. This avoids starvation issues for fg tasks when interacting with high priority top-app tasks (a problem often seen in the case of system_server). 
bug: 30245369 bug: 30292998 Change-Id: I0377e00893b9f6586eec55632a265518fd2fa8a1 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95bba5310763b2..27928d7757c294 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5417,7 +5417,7 @@ static int select_idle_sibling(struct task_struct *p, int target) return target; } -static inline int find_best_target(struct task_struct *p, bool prefer_idle) +static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { int iter_cpu; int target_cpu = -1; @@ -5435,9 +5435,9 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) int idle_idx; /* - * favor higher cpus for tasks that prefer idle cores + * Iterate from higher cpus for boosted tasks. */ - int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu; + int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) continue; @@ -5607,7 +5607,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) bool boosted = 0; bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted || prefer_idle); + int tmp_target = find_best_target(p, boosted, prefer_idle); if (tmp_target >= 0) { target_cpu = tmp_target; if ((boosted || prefer_idle) && idle_cpu(target_cpu)) From e015a8b2527a37fb57a0dc23e06f431377372fb4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:47 +0200 Subject: [PATCH 269/420] sched/wait: Provide infrastructure to deal with nested blocking [ Upstream commit 61ada528dea028331e99e8ceaed87c683ad25de2 ] There are a few places that call blocking primitives from wait loops, provide infrastructure to support this without the typical task_struct::state collision. We record the wakeup in wait_queue_t::flags which leaves task_struct::state free to be used by others. 
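
As an illustration of how the new primitives are meant to be consumed (a sketch, not code from this patch; wq_head, condition and timeout stand in for a driver's own wait-queue head, wakeup condition and jiffies budget):

	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&wq_head, &wait);
	while (!condition) {
		if (signal_pending(current))
			break;
		/* Code earlier in the loop may itself block (e.g. a GFP_KERNEL
		 * allocation) without clobbering the wait state: the task state
		 * is only set inside wait_woken(), and a racing wakeup is
		 * latched in wait.flags so wait_woken() returns immediately. */
		timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
		if (!timeout)
			break;
	}
	remove_wait_queue(&wq_head, &wait);

The corresponding waker publishes the condition first and then kicks the queue, which invokes woken_wake_function() for this entry:

	condition = true;
	wake_up(&wq_head);
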
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: tglx@linutronix.de Cc: ilya.dryomov@inktank.com Cc: umgwanakikbuti@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140924082242.051202318@infradead.org Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin --- include/linux/wait.h | 7 ++++- kernel/sched/wait.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index e4a8eb9312eabb..fc0e99395fbb36 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -13,9 +13,12 @@ typedef struct __wait_queue wait_queue_t; typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); +/* __wait_queue::flags */ +#define WQ_FLAG_EXCLUSIVE 0x01 +#define WQ_FLAG_WOKEN 0x02 + struct __wait_queue { unsigned int flags; -#define WQ_FLAG_EXCLUSIVE 0x01 void *private; wait_queue_func_t func; struct list_head task_list; @@ -830,6 +833,8 @@ void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int sta long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 3f6ef481a1a596..f7b22adb47f741 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -297,6 +297,67 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * } EXPORT_SYMBOL(autoremove_wake_function); + +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq, &wait); + * for (;;) { + * if (condition) + * break; + * + * p->state = mode; condition = true; + * smp_mb(); // A smp_wmb(); // C + * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * schedule() try_to_wake_up(); + * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ + * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * smp_mb() // B smp_wmb(); // C + * wait->flags |= WQ_FLAG_WOKEN; + * } + * remove_wait_queue(&wq, &wait); + * + */ +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +{ + set_current_state(mode); /* A */ + /* + * The above implies an smp_mb(), which matches with the smp_wmb() from + * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must + * also observe all state before the wakeup. + */ + if (!(wait->flags & WQ_FLAG_WOKEN)) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below implies an smp_mb(), it too pairs with the smp_wmb() from + * woken_wake_function() such that we must either observe the wait + * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss + * an event. 
+ */ + set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + + return timeout; +} +EXPORT_SYMBOL(wait_woken); + +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expects write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in wait_woken(). + */ + smp_wmb(); /* C */ + wait->flags |= WQ_FLAG_WOKEN; + + return default_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(woken_wake_function); + int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; From b4dc91c1166d8cf5d3c61c235c8b889985b442ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 29 Nov 2014 08:13:51 -0800 Subject: [PATCH 270/420] sched: Add missing rcu protection to wake_up_all_idle_cpus commit fd7de1e8d5b2b2b35e71332fafb899f584597150 upstream. Locklessly doing is_idle_task(rq->curr) is only okay because of RCU protection. The older variant of the broken code checked rq->curr == rq->idle instead and therefore didn't need RCU. Fixes: f6be8af1c95d ("sched: Add new API wake_up_if_idle() to wake up the idle cpu") Signed-off-by: Andy Lutomirski Reviewed-by: Chuansheng Liu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/729365dddca178506dfd0a9451006344cd6808bc.1417277372.git.luto@amacapital.net Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ce82c8bf29bae0..a73b8ee42f84d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1840,8 +1840,10 @@ void wake_up_if_idle(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!is_idle_task(rq->curr)) - return; + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; if (set_nr_if_polling(rq->idle)) { trace_sched_wake_idle_without_ipi(cpu); @@ -1852,6 +1854,9 @@ void wake_up_if_idle(int cpu) /* Else cpu is not in idle, do nothing here */ raw_spin_unlock_irqrestore(&rq->lock, flags); } + +out: + rcu_read_unlock(); } bool cpus_share_cache(int this_cpu, int that_cpu) From e2371b0f0bcb44c85b529985153d26152904f149 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Sep 2015 14:45:09 +0200 Subject: [PATCH 271/420] sched/core: Fix TASK_DEAD race in finish_task_switch() [ Upstream commit 95913d97914f44db2b81271c2e2ebd4d2ac2df83 ] So the problem this patch is trying to address is as follows: CPU0 CPU1 context_switch(A, B) ttwu(A) LOCK A->pi_lock A->on_cpu == 0 finish_task_switch(A) prev_state = A->state <-. WMB | A->on_cpu = 0; | UNLOCK rq0->lock | | context_switch(C, A) `-- A->state = TASK_DEAD prev_state == TASK_DEAD put_task_struct(A) context_switch(A, C) finish_task_switch(A) A->state == TASK_DEAD put_task_struct(A) The argument being that the WMB will allow the load of A->state on CPU0 to cross over and observe CPU1's store of A->state, which will then result in a double-drop and use-after-free. Now the comment states (and this was true once upon a long time ago) that we need to observe A->state while holding rq->lock because that will order us against the wakeup; however the wakeup will not in fact acquire (that) rq->lock; it takes A->pi_lock these days. We can obviously fix this by upgrading the WMB to an MB, but that is expensive, so we'd rather avoid that. 
The alternative this patch takes is: smp_store_release(&A->on_cpu, 0), which avoids the MB on some archs, but not important ones like ARM. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: # v3.1+ Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: manfred@colorfullife.com Cc: will.deacon@arm.com Fixes: e4a52bcb9a18 ("sched: Remove rq->lock from the first half of ttwu()") Link: http://lkml.kernel.org/r/20150929124509.GG3816@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin --- kernel/sched/core.c | 10 +++++----- kernel/sched/sched.h | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a73b8ee42f84d1..463106e6549a50 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2503,11 +2503,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. */ prev_state = prev->state; vtime_task_switch(prev); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6a06df0bff9d38..8e37ab8dbd09ef 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1073,9 +1073,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. + * + * Pairs with the control dependency and rmb in try_to_wake_up(). */ - smp_wmb(); - prev->on_cpu = 0; + smp_store_release(&prev->on_cpu, 0); #endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ From 073007402ce5cd4f03e3ff816435e7d700460400 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 18 Sep 2015 11:27:45 +0200 Subject: [PATCH 272/420] sched: access local runqueue directly in single_task_running [ Upstream commit 00cc1633816de8c95f337608a1ea64e228faf771 ] Commit 2ee507c47293 ("sched: Add function single_task_running to let a task check if it is the only task running on a cpu") referenced the current runqueue with the smp_processor_id. When CONFIG_DEBUG_PREEMPT is enabled, that is only allowed if preemption is disabled or the currrent task is bound to the local cpu (e.g. kernel worker). With commit f78195129963 ("kvm: add halt_poll_ns module parameter") KVM calls single_task_running. If CONFIG_DEBUG_PREEMPT is enabled that generates a lot of kernel messages. To avoid adding preemption in that cases, as it would limit the usefulness, we change single_task_running to access directly the cpu local runqueue. 
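
For context, the kind of caller this is aimed at can be sketched as below (modeled loosely on the KVM halt-polling use mentioned above; data_ready(), handle_data() and wait_for_data() are placeholders, not real interfaces):

	/* Poll briefly before sleeping, but give up the optimization as soon
	 * as another task is runnable on this CPU. Per the new comment, the
	 * answer may already be stale when it is acted on; the worst case is
	 * only a slightly longer or shorter poll, never a correctness issue. */
	do {
		if (data_ready())
			return handle_data();
		cpu_relax();
	} while (single_task_running() && !need_resched());

	return wait_for_data();
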
Cc: Tim Chen Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Cc: Fixes: 2ee507c472939db4b146d545352b8a7c79ef47f8 Signed-off-by: Dominik Dingel Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- kernel/sched/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 463106e6549a50..7c4e0060ed6d23 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2651,13 +2651,20 @@ unsigned long nr_running(void) /* * Check if only the current task is running on the cpu. + * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->nr_running == 1) - return true; - else - return false; + return raw_rq()->nr_running == 1; } EXPORT_SYMBOL(single_task_running); From c368f06f9f22bab9e1ba74faa4de33d82c043337 Mon Sep 17 00:00:00 2001 From: Brian Silverman Date: Wed, 18 Feb 2015 16:23:56 -0800 Subject: [PATCH 273/420] sched: Fix RLIMIT_RTTIME when PI-boosting to RT [ Upstream commit 746db9443ea57fd9c059f62c4bfbf41cf224fe13 ] When non-realtime tasks get priority-inheritance boosted to a realtime scheduling class, RLIMIT_RTTIME starts to apply to them. However, the counter used for checking this (the same one used for SCHED_RR timeslices) was not getting reset. This meant that tasks running with a non-realtime scheduling class which are repeatedly boosted to a realtime one, but never block while they are running realtime, eventually hit the timeout without ever running for a time over the limit. This patch resets the realtime timeslice counter when un-PI-boosting from an RT to a non-RT scheduling class. I have some test code with two threads and a shared PTHREAD_PRIO_INHERIT mutex which induces priority boosting and spins while boosted that gets killed by a SIGXCPU on non-fixed kernels but doesn't with this patch applied. It happens much faster with a CONFIG_PREEMPT_RT kernel, and does happen eventually with PREEMPT_VOLUNTARY kernels. Signed-off-by: Brian Silverman Signed-off-by: Peter Zijlstra (Intel) Cc: austin@peloton-tech.com Cc: Link: http://lkml.kernel.org/r/1424305436-6716-1-git-send-email-brian@peloton-tech.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c4e0060ed6d23..88edabaa02335e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3478,6 +3478,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } else { if (dl_prio(oldprio)) p->dl.dl_boosted = 0; + if (rt_prio(oldprio)) + p->rt.timeout = 0; p->sched_class = &fair_sched_class; } From 43eb420f66d83e2d6bcc9cf9e1320019a3b4eaa0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2015 19:49:49 +0200 Subject: [PATCH 274/420] sched: Handle priority boosted tasks proper in setscheduler() [ Upstream commit 0782e63bc6fe7e2d3408d250df11d388b7799c6b ] Ronny reported that the following scenario is not handled correctly: T1 (prio = 10) lock(rtmutex); T2 (prio = 20) lock(rtmutex) boost T1 T1 (prio = 20) sys_set_scheduler(prio = 30) T1 prio = 30 .... 
sys_set_scheduler(prio = 10) T1 prio = 30 The last step is wrong as T1 should now be back at prio 20. Commit c365c292d059 ("sched: Consider pi boosting in setscheduler()") only handles the case where a boosted tasks tries to lower its priority. Fix it by taking the new effective priority into account for the decision whether a change of the priority is required. Reported-by: Ronny Meeus Tested-by: Steven Rostedt Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt Cc: Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Mike Galbraith Fixes: c365c292d059 ("sched: Consider pi boosting in setscheduler()") Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1505051806060.4225@nanos Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin --- include/linux/sched/rt.h | 7 ++++--- kernel/locking/rtmutex.c | 12 +++++++----- kernel/sched/core.c | 26 ++++++++++++++------------ 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 6341f5be6e2474..a30b172df6e1a7 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -18,7 +18,7 @@ static inline int rt_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); -extern int rt_mutex_check_prio(struct task_struct *task, int newprio); +extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio); extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) @@ -31,9 +31,10 @@ static inline int rt_mutex_getprio(struct task_struct *p) return p->normal_prio; } -static inline int rt_mutex_check_prio(struct task_struct *task, int newprio) +static inline int rt_mutex_get_effective_prio(struct task_struct *task, + int newprio) { - return 0; + return newprio; } static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7c98873a30777f..5e0de35e693963 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -265,15 +265,17 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) } /* - * Called by sched_setscheduler() to check whether the priority change - * is overruled by a possible priority boosting. + * Called by sched_setscheduler() to get the priority which will be + * effective after the change. */ -int rt_mutex_check_prio(struct task_struct *task, int newprio) +int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) { if (!task_has_pi_waiters(task)) - return 0; + return newprio; - return task_top_pi_waiter(task)->task->prio <= newprio; + if (task_top_pi_waiter(task)->task->prio <= newprio) + return task_top_pi_waiter(task)->task->prio; + return newprio; } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 88edabaa02335e..f21184098f8a55 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3724,15 +3724,18 @@ static void __setscheduler_params(struct task_struct *p, /* Actually do priority change: must hold pi & rq lock. */ static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr) + const struct sched_attr *attr, bool keep_boost) { __setscheduler_params(p, attr); /* - * If we get here, there was no pi waiters boosting the - * task. It is safe to use the normal prio. 
+ * Keep a potential priority boosting if called from + * sched_setscheduler(). */ - p->prio = normal_prio(p); + if (keep_boost) + p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); + else + p->prio = normal_prio(p); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -3818,7 +3821,7 @@ static int __sched_setscheduler(struct task_struct *p, int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, queued, running; - int policy = attr->sched_policy; + int new_effective_prio, policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; struct rq *rq; @@ -4000,15 +4003,14 @@ static int __sched_setscheduler(struct task_struct *p, oldprio = p->prio; /* - * Special case for priority boosted tasks. - * - * If the new priority is lower or equal (user space view) - * than the current (boosted) priority, we just store the new + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new * normal parameters and do not touch the scheduler class and * the runqueue. This will be done when the task deboost * itself. */ - if (rt_mutex_check_prio(p, newprio)) { + new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + if (new_effective_prio == oldprio) { __setscheduler_params(p, attr); task_rq_unlock(rq, p, &flags); return 0; @@ -4022,7 +4024,7 @@ static int __sched_setscheduler(struct task_struct *p, put_prev_task(rq, p); prev_class = p->sched_class; - __setscheduler(rq, p, attr); + __setscheduler(rq, p, attr, true); if (running) p->sched_class->set_curr_task(rq); @@ -7624,7 +7626,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) queued = task_on_rq_queued(p); if (queued) dequeue_task(rq, p, 0); - __setscheduler(rq, p, &attr); + __setscheduler(rq, p, &attr, false); if (queued) { enqueue_task(rq, p, 0); resched_curr(rq); From d21f086c1245a2e46515044647ac1dcdd71749fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2015 17:43:34 +0200 Subject: [PATCH 275/420] UPSTREAM: sched: Fix a race between __kthread_bind() and sched_setaffinity() Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY without locks, a caller might observe an old value and race with the set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo it: __kthread_bind() do_set_cpus_allowed() sched_setaffinity() if (p->flags & PF_NO_SETAFFINITIY) set_cpus_allowed_ptr() p->flags |= PF_NO_SETAFFINITY Fix the bug by putting everything under the regular scheduler locks. This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org Signed-off-by: Ingo Molnar (cherry picked from commit 25834c73f93af7f0712c98ca4593691592e6b360) Signed-off-by: Punit Agrawal BUG=chrome-os-partner:44828 TEST=Boot kernel on Oak. TEST=smaug-release and strago-release trybots. 
Change-Id: Id3c898c5ee1a22ed704e83f2ecf5f78199280d38 Reviewed-on: https://chromium-review.googlesource.com/321264 Commit-Ready: Ricky Liang Tested-by: Ricky Liang Reviewed-by: Ricky Liang Conflicts: kernel/sched/core.c --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f21184098f8a55..37e5f3aceeac93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5026,6 +5026,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock(&rq->lock); __sched_fork(0, idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); From b9de09de4c8a99ca853bd4ae9b423f684e95905d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Wed, 12 Aug 2015 21:35:56 +0200 Subject: [PATCH 276/420] BACKPORT: sched: Fix cpu_active_mask/cpu_online_mask race There is a race condition in SMP bootup code, which may result in WARNING: CPU: 0 PID: 1 at kernel/workqueue.c:4418 workqueue_cpu_up_callback() or kernel BUG at kernel/smpboot.c:135! It can be triggered with a bit of luck in Linux guests running on busy hosts. CPU0 CPUn ==== ==== _cpu_up() __cpu_up() start_secondary() set_cpu_online() cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); cpu_notify(CPU_ONLINE) cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); During the various CPU_ONLINE callbacks CPUn is online but not active. Several things can go wrong at that point, depending on the scheduling of tasks on CPU0. Variant 1: cpu_notify(CPU_ONLINE) workqueue_cpu_up_callback() rebind_workers() set_cpus_allowed_ptr() This call fails because it requires an active CPU; rebind_workers() ends with a warning: WARNING: CPU: 0 PID: 1 at kernel/workqueue.c:4418 workqueue_cpu_up_callback() Variant 2: cpu_notify(CPU_ONLINE) smpboot_thread_call() smpboot_unpark_threads() .. __kthread_unpark() __kthread_bind() wake_up_state() .. select_task_rq() select_fallback_rq() The ->wake_cpu of the unparked thread is not allowed, making a call to select_fallback_rq() necessary. Then, select_fallback_rq() cannot find an allowed, active CPU and promptly resets the allowed CPUs, so that the task in question ends up on CPU0. When those unparked tasks are eventually executed, they run immediately into a BUG: kernel BUG at kernel/smpboot.c:135! Just changing the order in which the online/active bits are set (and adding some memory barriers), would solve the two issues above. However, it would change the order of operations back to the one before commit 6acbfb96976f ("sched: Fix hotplug vs. set_cpus_allowed_ptr()"), thus, reintroducing that particular problem. Going further back into history, we have at least the following commits touching this topic: - commit 2baab4e90495 ("sched: Fix select_fallback_rq() vs cpu_active/cpu_online") - commit 5fbd036b552f ("sched: Cleanup cpu_active madness") Together, these give us the following non-working solutions: - secondary CPU sets active before online, because active is assumed to be a subset of online; - secondary CPU sets online before active, because the primary CPU assumes that an online CPU is also active; - secondary CPU sets online and waits for primary CPU to set active, because it might deadlock. Commit 875ebe940d77 ("powerpc/smp: Wait until secondaries are active & online") introduces an arch-specific solution to this arch-independent problem. Now, go for a more general solution without explicit waiting and simply set active twice: once on the secondary CPU after online was set and once on the primary CPU after online was seen. 
set_cpus_allowed_ptr()") Signed-off-by: Jan H. Schnherr Acked-by: Peter Zijlstra Cc: Cc: Anton Blanchard Cc: Borislav Petkov Cc: Joerg Roedel Cc: Linus Torvalds Cc: Matt Wilson Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 6acbfb96976f ("sched: Fix hotplug vs. set_cpus_allowed_ptr()") Link: http://lkml.kernel.org/r/1439408156-18840-1-git-send-email-jschoenh@amazon.de Signed-off-by: Ingo Molnar (cherry picked from commit dd9d3843755da95f63dd3a376f62b3e45c011210) BUG=chromium:583821 TEST=built Change-Id: I3b891bc2a6fbf060bdcb4efe4e33e2a821003bff Reviewed-on: https://chromium-review.googlesource.com/325509 Commit-Ready: Aditya Kali Tested-by: Aditya Kali Reviewed-by: Kevin Cernekee --- kernel/sched/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 37e5f3aceeac93..1a9dea2f10b92e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5574,6 +5574,14 @@ static int sched_cpu_active(struct notifier_block *nfb, case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + case CPU_ONLINE: + /* + * At this point a starting CPU has marked itself as online via + * set_cpu_online(). But it might not yet have marked itself + * as active, which is essential from here on. + * + * Thus, fall-through and help the starting CPU along. + */ case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; From 2358ffca19b61e096e435967e5669d82c0735e98 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Wed, 17 Dec 2014 11:50:32 +0100 Subject: [PATCH 277/420] sched/deadline: Avoid double-accounting in case of missed deadlines commit 269ad8015a6b2bb1cf9e684da4921eb6fa0a0c88 upstream. The dl_runtime_exceeded() function is supposed to ckeck if a SCHED_DEADLINE task must be throttled, by checking if its current runtime is <= 0. However, it also checks if the scheduling deadline has been missed (the current time is larger than the current scheduling deadline), further decreasing the runtime if this happens. This "double accounting" is wrong: - In case of partitioned scheduling (or single CPU), this happens if task_tick_dl() has been called later than expected (due to small HZ values). 
In this case, the current runtime is also negative, and replenish_dl_entity() can take care of the deadline miss by recharging the current runtime to a value smaller than dl_runtime - In case of global scheduling on multiple CPUs, scheduling deadlines can be missed even if the task did not consume more runtime than expected, hence penalizing the task is wrong This patch fix this problem by throttling a SCHED_DEADLINE task only when its runtime becomes negative, and not modifying the runtime Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Dario Faggioli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1418813432-20797-3-git-send-email-luca.abeni@unitn.it Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/deadline.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index afb4df8f8655f6..d3cbb7bcc66e7b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -705,24 +705,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) static int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) { - int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); - int rorun = dl_se->runtime <= 0; - - if (!rorun && !dmiss) - return 0; - - /* - * If we are beyond our current deadline and we are still - * executing, then we have already used some of the runtime of - * the next instance. Thus, if we do not account that, we are - * stealing bandwidth from the system at each deadline miss! - */ - if (dmiss) { - dl_se->runtime = rorun ? dl_se->runtime : 0; - dl_se->runtime -= rq_clock(rq) - dl_se->deadline; - } - - return 1; + return (dl_se->runtime <= 0); } extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); From 81b93577ea08e79db8cc3a1fa3dd9feed797c687 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Wed, 17 Dec 2014 11:50:31 +0100 Subject: [PATCH 278/420] sched/deadline: Fix migration of SCHED_DEADLINE tasks commit 6a503c3be937d275113b702e0421e5b0720abe8a upstream. According to global EDF, tasks should be migrated between runqueues without checking if their scheduling deadlines and runtimes are valid. However, SCHED_DEADLINE currently performs such a check: a migration happens doing: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); which ends up calling dequeue_task_dl(), setting the new CPU, and then calling enqueue_task_dl(). enqueue_task_dl() then calls enqueue_dl_entity(), which calls update_dl_entity(), which can modify scheduling deadline and runtime, breaking global EDF scheduling. As a result, some of the properties of global EDF are not respected: for example, a taskset {(30, 80), (40, 80), (120, 170)} scheduled on two cores can have unbounded response times for the third task even if 30/80+40/80+120/170 = 1.5809 < 2 This can be fixed by invoking update_dl_entity() only in case of wakeup, or if this is a new SCHED_DEADLINE task. 
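As a quick sanity check of the utilization figure quoted above (not part of the patch), the three runtime/period pairs do sum to roughly 1.58, i.e. below the capacity of two CPUs:

    #include <stdio.h>

    /* Verify the quoted bound: 30/80 + 40/80 + 120/170 < 2. */
    int main(void)
    {
        double u = 30.0 / 80 + 40.0 / 80 + 120.0 / 170;

        printf("total utilization = %.4f (capacity = 2 CPUs)\n", u);  /* 1.5809 */
        return 0;
    }
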
Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Dario Faggioli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1418813432-20797-2-git-send-email-luca.abeni@unitn.it Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/deadline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d3cbb7bcc66e7b..a2d36e82ef0054 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -941,10 +941,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) - replenish_dl_entity(dl_se, pi_se); - else + if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); + else if (flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); __enqueue_dl_entity(dl_se); } From ba49f823c0a9a31643710381039021cd3fdb7823 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 18 Jun 2015 16:54:44 -0700 Subject: [PATCH 279/420] sched, numa: Do not hint for NUMA balancing on VM_MIXEDMAP mappings commit 8e76d4eecf7afeec9328e21cd5880e281838d0d6 upstream. Jovi Zhangwei reported the following problem Below kernel vm bug can be triggered by tcpdump which mmaped a lot of pages with GFP_COMP flag. [Mon May 25 05:29:33 2015] page:ffffea0015414000 count:66 mapcount:1 mapping: (null) index:0x0 [Mon May 25 05:29:33 2015] flags: 0x20047580004000(head) [Mon May 25 05:29:33 2015] page dumped because: VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page)) [Mon May 25 05:29:33 2015] ------------[ cut here ]------------ [Mon May 25 05:29:33 2015] kernel BUG at mm/migrate.c:1661! [Mon May 25 05:29:33 2015] invalid opcode: 0000 [#1] SMP In this case it was triggered by running tcpdump but it's not necessary reproducible on all systems. sudo tcpdump -i bond0.100 'tcp port 4242' -c 100000000000 -w 4242.pcap Compound pages cannot be migrated and it was not expected that such pages be marked for NUMA balancing. This did not take into account that drivers such as net/packet/af_packet.c may insert compound pages into userspace with vm_insert_page. This patch tells the NUMA balancing protection scanner to skip all VM_MIXEDMAP mappings which avoids the possibility that compound pages are marked for migration. 
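For context (not part of the patch), the mappings in question arise when a capture tool maps an AF_PACKET ring buffer: the driver inserts its pages with vm_insert_page(), which makes the VMA VM_MIXEDMAP. A rough userspace sketch of how such a mapping comes about, assuming CAP_NET_RAW and arbitrary ring dimensions:

    #include <arpa/inet.h>
    #include <linux/if_ether.h>
    #include <linux/if_packet.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        struct tpacket_req req = {
            .tp_block_size = 4096,
            .tp_block_nr   = 64,
            .tp_frame_size = 2048,
            .tp_frame_nr   = 128,      /* blocks * frames-per-block */
        };
        int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
        void *ring;

        if (fd < 0) {
            perror("socket (needs CAP_NET_RAW)");
            return 1;
        }
        if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
            perror("setsockopt(PACKET_RX_RING)");
            return 1;
        }
        ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (ring == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        /* This VMA is backed by driver pages inserted via vm_insert_page(),
         * i.e. the kind of mapping task_numa_work() now skips. */
        puts("packet ring mapped");
        munmap(ring, (size_t)req.tp_block_size * req.tp_block_nr);
        close(fd);
        return 0;
    }
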
Signed-off-by: Mel Gorman Reported-by: Jovi Zhangwei Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [jovi: Backported to 3.18: adjust context] Signed-off-by: Jovi Zhangwei Signed-off-by: Sasha Levin --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 27928d7757c294..59d536660a57fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2001,8 +2001,10 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(vma)) + if (!vma_migratable(vma) || !vma_policy_mof(vma) || + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; + } /* * Shared library pages mapped by multiple processes are not From a7fd76d309e067e57ee6f787bb4e57fb8165bf1d Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Mon, 6 Apr 2015 15:28:10 -0700 Subject: [PATCH 280/420] sched/fair: Prevent throttling in early pick_next_task_fair() [ Upstream commit 54d27365cae88fbcc853b391dcd561e71acb81fa ] The optimized task selection logic optimistically selects a new task to run without first doing a full put_prev_task(). This is so that we can avoid a put/set on the common ancestors of the old and new task. Similarly, we should only call check_cfs_rq_runtime() to throttle eligible groups if they're part of the common ancestry, otherwise it is possible to end up with no eligible task in the simple task selection. Imagine: /root /prev /next /A /B If our optimistic selection ends up throttling /next, we goto simple and our put_prev_task() ends up throttling /prev, after which we're going to bug out in set_next_entity() because there aren't any tasks left. Avoid this scenario by only throttling common ancestors. Reported-by: Mohammed Naser Reported-by: Konstantin Khlebnikov Signed-off-by: Ben Segall [ munged Changelog ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Thomas Gleixner Cc: pjt@google.com Fixes: 678d5718d8d0 ("sched/fair: Optimize cgroup pick_next_task_fair()") Link: http://lkml.kernel.org/r/xm26wq1oswoq.fsf@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin --- kernel/sched/fair.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 59d536660a57fc..491ef61a6dd5d9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5961,18 +5961,21 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) * entity, update_curr() will update its vruntime, otherwise * forget we've ever seen it. */ - if (curr && curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; + if (curr) { + if (curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; - /* - * This call to check_cfs_rq_runtime() will do the throttle and - * dequeue its entity in the parent(s). Therefore the 'simple' - * nr_running test will indeed be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto simple; + /* + * This call to check_cfs_rq_runtime() will do the + * throttle and dequeue its entity in the parent(s). + * Therefore the 'simple' nr_running test will indeed + * be correct. 
+ */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + } se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); From efddf50c4ef59d651967ba4e2032c9e99ff24134 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Tue, 17 Jun 2014 21:43:35 -0700 Subject: [PATCH 281/420] sched/rt: print RT tasks when RT throttling is activated Existing debug prints do not provide any clues about which tasks may have triggered RT throttling. Print the names and PIDs of all tasks on the throttled rt_rq to help narrow down the source of the problem. Change-Id: I180534c8a647254ed38e89d0c981a8f8bccd741c Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa --- kernel/sched/rt.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 14a5c006bea77c..ed601723663101 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -873,6 +873,42 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *rt_se; + char buf[500]; + char *pos = buf; + char *end = buf + sizeof(buf); + int idx; + + pos += snprintf(pos, sizeof(buf), + "sched: RT throttling activated for rt_rq %p (cpu %d)\n", + rt_rq, cpu_of(rq_of_rt_rq(rt_rq))); + + if (bitmap_empty(array->bitmap, MAX_RT_PRIO)) + goto out; + + pos += snprintf(pos, end - pos, "potential CPU hogs:\n"); + idx = sched_find_first_bit(array->bitmap); + while (idx < MAX_RT_PRIO) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); + if (pos < end) + pos += snprintf(pos, end - pos, "\t%s (%d)\n", + p->comm, p->pid); + } + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + } +out: + printk_deferred("%s", buf); +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -896,8 +932,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_deferred_once("sched: RT throttling activated\n"); + + if (!once) { + once = true; + dump_throttled_rt_tasks(rt_rq); + } } else { /* * In case we did anyway, make it go away, From b92472cd7902f045b6732b03162fd640bcd5c076 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Thu, 19 Jun 2014 14:23:33 -0700 Subject: [PATCH 282/420] sched/rt: Add Kconfig option to enable panicking for RT throttling This may be useful for detecting and debugging RT throttling issues. Change-Id: I5807a897d11997d76421c1fcaa2918aad988c6c9 Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa --- kernel/sched/rt.c | 9 +++++++++ lib/Kconfig.debug | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ed601723663101..24ad9f1deb83f7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -906,7 +906,16 @@ static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); } out: +#ifdef CONFIG_PANIC_ON_RT_THROTTLING + /* + * Use pr_err() in the BUG() case since printk_sched() will + * not get flushed and deadlock is not a concern. 
+ */ + pr_err("%s", buf); + BUG(); +#else printk_deferred("%s", buf); +#endif } static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 088880ee097591..75d24da2c892d7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -845,6 +845,15 @@ config SCHED_DEBUG that can help debug the scheduler. The runtime overhead of this option is minimal. +config PANIC_ON_RT_THROTTLING + bool "Panic on RT throttling" + help + Say Y here to enable the kernel to panic when a realtime + runqueue is throttled. This may be useful for detecting + and debugging RT throttling issues. + + Say N if unsure. + config SCHEDSTATS bool "Collect scheduler statistics" depends on DEBUG_KERNEL && PROC_FS From cfd4c9e30bd135a919b877e98d4216ecd80f089a Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 12 Aug 2016 11:24:50 +0530 Subject: [PATCH 283/420] ANDROID: net: fib: remove duplicate assignment Remove duplicate FRA_GOTO assignment and fix one whitespace error. Fixes: ba3d8d3f9f65 ("net: core: Support UID-based routing.") Change-Id: I462c24b16fdef42ae2332571a0b95de3ef9d2e25 Signed-off-by: Amit Pundir --- include/net/fib_rules.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index c10c06f8e59d4c..35e065b35e0db9 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -88,13 +88,11 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ - [FRA_GOTO] = { .type = NLA_U32 }, \ [FRA_UID_START] = { .type = NLA_U32 }, \ [FRA_UID_END] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 } - static inline void fib_rule_get(struct fib_rule *rule) { From b0293a8cb7194977b1b4a0c558d99013fdf3b8b5 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:29 -0400 Subject: [PATCH 284/420] UPSTREAM: Revert "ecryptfs: forbid opening files without mmap handler" (cherry picked from commit 78c4e172412de5d0456dc00d2b34050aa0b683b5) This reverts upstream commit 2f36db71009304b3f0b95afacd8eba1f9f046b87. It fixed a local root exploit but also introduced a dependency on the lower file system implementing an mmap operation just to open a file, which is a bit of a heavy hammer. The right fix is to have mmap depend on the existence of the mmap handler instead. Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org Signed-off-by: Tyler Hicks Fixes: Change-Id I0be77c7f8bd3046bc34cd87ef577529792d479bc ("UPSTREAM: ecryptfs: forbid opening files without mmap handler") Change-Id: Ib9bc87099f7f89e4e12dbc1a79e884dcadb1befb Signed-off-by: Amit Pundir --- fs/ecryptfs/kthread.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 9b661a4ccee739..f1ea610362c6c1 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -148,7 +147,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(lower_dentry->d_inode) ? 
O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto have_file; + goto out; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -166,16 +165,8 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) { + if (IS_ERR(*lower_file)) rc = PTR_ERR(*lower_file); - goto out; - } -have_file: - if ((*lower_file)->f_op->mmap == NULL) { - fput(*lower_file); - *lower_file = NULL; - rc = -EMEDIUMTYPE; - } out: return rc; } From b82e09bcbe89190e26152a97de74fb0107c06e9b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:30 -0400 Subject: [PATCH 285/420] UPSTREAM: ecryptfs: don't allow mmap when the lower fs doesn't support it (cherry picked from commit f0fe970df3838c202ef6c07a4c2b36838ef0a88b) There are legitimate reasons to disallow mmap on certain files, notably in sysfs or procfs. We shouldn't emulate mmap support on file systems that don't offer support natively. CVE-2016-1583 Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org [tyhicks: clean up f_op check by using ecryptfs_file_to_lower()] Signed-off-by: Tyler Hicks Change-Id: I66e3670771630a25b0608f10019d1584e9ce73a6 Signed-off-by: Amit Pundir --- fs/ecryptfs/file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index f5bce9096555a6..644dc16fa11a48 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -177,6 +177,19 @@ static int read_or_initialize_metadata(struct dentry *dentry) return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open * @inode: inode speciying file to open @@ -360,7 +373,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, From dacc51909f9652be86deb33ea211d3995fbf580f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 4 May 2016 14:04:13 -0400 Subject: [PATCH 286/420] BACKPORT: ecryptfs: fix handling of directory opening (cherry picked from commit 6a480a7842545ec520a91730209ec0bae41694c1) First of all, trying to open them r/w is idiocy; it's guaranteed to fail. Moreover, assigning ->f_pos and assuming that everything will work is blatantly broken - try that with e.g. tmpfs as underlying layer and watch the fireworks. There may be a non-trivial amount of state associated with current IO position, well beyond the numeric offset. Using the single struct file associated with underlying inode is really not a good idea; we ought to open one for each ecryptfs directory struct file. Additionally, file_operations both for directories and non-directories are full of pointless methods; non-directories should *not* have ->iterate(), directories should not have ->flush(), ->fasync() and ->splice_read(). 
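As a userspace illustration of the invariant this change restores (not something the patch itself adds): every open of a directory must keep its own read position, which cannot hold if all ecryptfs opens of a directory share one lower struct file and its f_pos. A rough sketch that enumerates the same directory through two independent descriptors, each of which should see the full listing:

    #define _GNU_SOURCE
    #include <dirent.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Count directory entries readable through one file descriptor. */
    static int count_entries(int fd)
    {
        char buf[4096];
        int n, total = 0;

        while ((n = syscall(SYS_getdents64, fd, buf, sizeof(buf))) > 0) {
            for (int off = 0; off < n; ) {
                struct dirent64 *d = (struct dirent64 *)(buf + off);

                off += d->d_reclen;
                total++;
            }
        }
        return total;
    }

    int main(int argc, char **argv)
    {
        const char *dir = argc > 1 ? argv[1] : ".";
        int fd1 = open(dir, O_RDONLY | O_DIRECTORY);
        int fd2 = open(dir, O_RDONLY | O_DIRECTORY);

        if (fd1 < 0 || fd2 < 0) {
            perror("open");
            return 1;
        }
        /* With per-open lower files the two counts must match. */
        printf("fd1: %d entries, fd2: %d entries\n",
               count_entries(fd1), count_entries(fd2));
        return 0;
    }
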
Signed-off-by: Al Viro Change-Id: I4813ce803f270fdd364758ce1dc108b76eab226e Signed-off-by: Amit Pundir --- fs/ecryptfs/file.c | 71 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 644dc16fa11a48..144236b41715a7 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -119,7 +119,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -255,14 +254,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -279,6 +270,45 @@ static int ecryptfs_open(struct inode *inode, struct file *file) return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -299,6 +329,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -353,13 +396,10 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - 
.llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct file_operations ecryptfs_main_fops = { @@ -368,7 +408,6 @@ const struct file_operations ecryptfs_main_fops = { .read_iter = ecryptfs_read_update_atime, .write = new_sync_write, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, From 9e5b3354333f8e8c760acb4cc260d990875d4d28 Mon Sep 17 00:00:00 2001 From: Winter Wang Date: Wed, 27 Jul 2016 10:03:19 +0800 Subject: [PATCH 287/420] UPSTREAM: usb: gadget: configfs: add mutex lock before unregister gadget There may be a race condition if f_fs calls unregister_gadget_item in ffs_closed() when unregister_gadget is called by UDC store at the same time. this leads to a kernel NULL pointer dereference: [ 310.644928] Unable to handle kernel NULL pointer dereference at virtual address 00000004 [ 310.645053] init: Service 'adbd' is being killed... [ 310.658938] pgd = c9528000 [ 310.662515] [00000004] *pgd=19451831, *pte=00000000, *ppte=00000000 [ 310.669702] Internal error: Oops: 817 [#1] PREEMPT SMP ARM [ 310.675211] Modules linked in: [ 310.678294] CPU: 0 PID: 1537 Comm: ->transport Not tainted 4.1.15-03725-g793404c #2 [ 310.685958] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 310.692493] task: c8e24200 ti: c945e000 task.ti: c945e000 [ 310.697911] PC is at usb_gadget_unregister_driver+0xb4/0xd0 [ 310.703502] LR is at __mutex_lock_slowpath+0x10c/0x16c [ 310.708648] pc : [] lr : [] psr: 600f0113 [ 311.565585] [] (usb_gadget_unregister_driver) from [] (unregister_gadget_item+0x1c/0x34) [ 311.575426] [] (unregister_gadget_item) from [] (ffs_closed+0x8c/0x9c) [ 311.583702] [] (ffs_closed) from [] (ffs_data_reset+0xc/0xa0) [ 311.591194] [] (ffs_data_reset) from [] (ffs_data_closed+0x90/0xd0) [ 311.599208] [] (ffs_data_closed) from [] (ffs_ep0_release+0xc/0x14) [ 311.607224] [] (ffs_ep0_release) from [] (__fput+0x80/0x1d0) [ 311.614635] [] (__fput) from [] (task_work_run+0xb0/0xe8) [ 311.621788] [] (task_work_run) from [] (do_work_pending+0x7c/0xa4) [ 311.629718] [] (do_work_pending) from [] (work_pending+0xc/0x20) for functions using functionFS, i.e. android adbd will close /dev/usb-ffs/adb/ep0 when usb IO thread fails, but switch adb from on to off also triggers write "none" > UDC. These 2 operations both call unregister_gadget, which will lead to the panic above. add a mutex before calling unregister_gadget for api used in f_fs. Signed-off-by: Winter Wang Signed-off-by: Felipe Balbi --- drivers/usb/gadget/configfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index bcedccd603377a..3cfd6206309bed 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1800,7 +1800,9 @@ void unregister_gadget_item(struct config_item *item) { struct gadget_info *gi = to_gadget_info(item); + mutex_lock(&gi->lock); unregister_gadget(gi); + mutex_unlock(&gi->lock); } EXPORT_SYMBOL_GPL(unregister_gadget_item); From 5f4f2cbe882813ee862361f39aada407121b9c6a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Jul 2016 10:04:02 +0200 Subject: [PATCH 288/420] BACKPORT: tcp: make challenge acks less predictable (cherry picked from commit 75ff39ccc1bd5d3c455b6822ab09e533c551f758) Yue Cao claims that current host rate limiting of challenge ACKS (RFC 5961) could leak enough information to allow a patient attacker to hijack TCP sessions. 
He will soon provide details in an academic paper. This patch increases the default limit from 100 to 1000, and adds some randomization so that the attacker can no longer hijack sessions without spending a considerable amount of probes. Based on initial analysis and patch from Linus. Note that we also have per socket rate limiting, so it is tempting to remove the host limit in the future. v2: randomize the count of challenge acks per second, not the period. Fixes: 282f23c6ee34 ("tcp: implement RFC 5961 3.2") Reported-by: Yue Cao Signed-off-by: Eric Dumazet Suggested-by: Linus Torvalds Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Change-Id: Ib46ba66f5e4a5a7c81bfccd7b0aa83c3d9e1b3bb Bug: 30809774 --- net/ipv4/tcp_input.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e05046d12ed5e3..fa027a44d54cd9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -88,7 +88,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 100; +int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -3325,12 +3325,18 @@ static void tcp_send_challenge_ack(struct sock *sk) static u32 challenge_timestamp; static unsigned int challenge_count; u32 now = jiffies / HZ; + u32 count; if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + challenge_timestamp = now; - challenge_count = 0; + WRITE_ONCE(challenge_count, half + + prandom_u32_max(sysctl_tcp_challenge_ack_limit)); } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + count = READ_ONCE(challenge_count); + if (count > 0) { + WRITE_ONCE(challenge_count, count - 1); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } From a9eaff0f1f6f91842c968a880ca5075660eea494 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 14 May 2015 12:58:08 +0530 Subject: [PATCH 289/420] UPSTREAM: Bluetooth: Fix potential NULL dereference in RFCOMM bind callback (cherry picked from 951b6a0717db97ce420547222647bcc40bf1eacd) addr can be NULL and it should not be dereferenced before NULL checking. 
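For reference (not part of the patch), this is the syscall path being hardened. A typical well-behaved caller looks roughly like the sketch below, assuming the BlueZ development headers and an arbitrary channel number; the kernel must nevertheless survive a NULL address or an addr_len shorter than struct sockaddr_rc, which is why the fix copies the user address into a zeroed local structure before using it:

    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>
    #include <bluetooth/bluetooth.h>
    #include <bluetooth/rfcomm.h>

    int main(void)
    {
        struct sockaddr_rc sa;
        int sk = socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM);

        if (sk < 0) {
            perror("socket");
            return 1;
        }
        memset(&sa, 0, sizeof(sa));
        sa.rc_family = AF_BLUETOOTH;
        bacpy(&sa.rc_bdaddr, BDADDR_ANY);   /* bind to any local adapter */
        sa.rc_channel = 1;                  /* arbitrary example channel */
        if (bind(sk, (struct sockaddr *)&sa, sizeof(sa)) < 0)
            perror("bind");
        close(sk);
        return 0;
    }
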
Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann Change-Id: I18bda54bb1427d9443a39a04a5c551720118dc26 Bug: 30149612 --- net/bluetooth/rfcomm/sock.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 8bbbb5ec468c37..bcb3160fefb4c4 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -334,16 +334,19 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { - struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sockaddr_rc sa; struct sock *sk = sock->sk; - int chan = sa->rc_channel; - int err = 0; - - BT_DBG("sk %p %pMR", sk, &sa->rc_bdaddr); + int len, err = 0; if (!addr || addr->sa_family != AF_BLUETOOTH) return -EINVAL; + memset(&sa, 0, sizeof(sa)); + len = min_t(unsigned int, sizeof(sa), addr_len); + memcpy(&sa, addr, len); + + BT_DBG("sk %p %pMR", sk, &sa.rc_bdaddr); + lock_sock(sk); if (sk->sk_state != BT_OPEN) { @@ -358,12 +361,13 @@ static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr write_lock(&rfcomm_sk_list.lock); - if (chan && __rfcomm_get_listen_sock_by_addr(chan, &sa->rc_bdaddr)) { + if (sa.rc_channel && + __rfcomm_get_listen_sock_by_addr(sa.rc_channel, &sa.rc_bdaddr)) { err = -EADDRINUSE; } else { /* Save source address */ - bacpy(&rfcomm_pi(sk)->src, &sa->rc_bdaddr); - rfcomm_pi(sk)->channel = chan; + bacpy(&rfcomm_pi(sk)->src, &sa.rc_bdaddr); + rfcomm_pi(sk)->channel = sa.rc_channel; sk->sk_state = BT_BOUND; } From a6c6e2b873a338a7ef5d8865bb6126e33a326175 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Sat, 13 Aug 2016 01:13:38 +0900 Subject: [PATCH 290/420] net: ipv6: Fix ping to link-local addresses. ping_v6_sendmsg does not set flowi6_oif in response to sin6_scope_id or sk_bound_dev_if, so it is not possible to use these APIs to ping an IPv6 address on a different interface. Instead, it sets flowi6_iif, which is incorrect but harmless. Stop setting flowi6_iif, and support various ways of setting oif in the same priority order used by udpv6_sendmsg. [Backport of net 5e457896986e16c440c97bb94b9ccd95dd157292] Bug: 29370996 Change-Id: Ibe1b9434c00ed96f1e30acb110734c6570b087b8 Tested: https://android-review.googlesource.com/#/c/254470/ Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- net/ipv6/ping.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 75aa8c1c403bec..51f5bce1176172 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -85,7 +85,7 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct icmp6hdr user_icmph; int addr_type; struct in6_addr *daddr; - int iif = 0; + int oif = 0; struct flowi6 fl6; int err; int hlimit; @@ -107,25 +107,30 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (u->sin6_family != AF_INET6) { return -EAFNOSUPPORT; } - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != u->sin6_scope_id) { - return -EINVAL; - } daddr = &(u->sin6_addr); - iif = u->sin6_scope_id; + if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) + oif = u->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; } - if (!iif) - iif = sk->sk_bound_dev_if; + if (!oif) + oif = sk->sk_bound_dev_if; + + if (!oif) + oif = np->sticky_pktinfo.ipi6_ifindex; + + if (!oif && ipv6_addr_is_multicast(daddr)) + oif = np->mcast_oif; + else if (!oif) + oif = np->ucast_oif; addr_type = ipv6_addr_type(daddr); - if (__ipv6_addr_needs_scope_id(addr_type) && !iif) - return -EINVAL; - if (addr_type & IPV6_ADDR_MAPPED) + if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || + (addr_type & IPV6_ADDR_MAPPED) || + (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if)) return -EINVAL; /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ @@ -135,17 +140,13 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.saddr = np->saddr; fl6.daddr = *daddr; + fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sock_i_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -155,11 +156,6 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (!np) return -EBADF; - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - pfh.icmph.type = user_icmph.icmp6_type; pfh.icmph.code = user_icmph.icmp6_code; pfh.icmph.checksum = 0; From e7c4b87f6e7e8ab70a90c249c6fbd442cd2e70be Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 11 Jun 2016 20:32:06 +0200 Subject: [PATCH 291/420] ipv6: fix endianness error in icmpv6_err IPv6 ping socket error handler doesn't correctly convert the new 32 bit mtu to host endianness before using. [Cherry-pick of net dcb94b88c09ce82a80e188d49bcffdc83ba215a6] Bug: 29370996 Change-Id: Iea0ca79f16c2a1366d82b3b0a3097093d18da8b7 Cc: Lorenzo Colitti Fixes: 6d0bfe22611602f ("net: ipv6: Add IPv6 support to the ping socket.") Signed-off-by: Hannes Frederic Sowa Acked-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- net/ipv6/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 070d591332c30e..1415f1327dd666 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -97,7 +97,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) - ping_err(skb, offset, info); + ping_err(skb, offset, ntohl(info)); } static int icmpv6_rcv(struct sk_buff *skb); From 803bc2a4f224c2183d7cc9ae771f5d598777ad5c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 13 Jan 2016 17:48:01 +0100 Subject: [PATCH 292/420] UPSTREAM: ALSA: timer: Fix race among timer ioctls (cherry picked from commit af368027a49a751d6ff4ee9e3f9961f35bb4fede) ALSA timer ioctls have an open race and this may lead to a use-after-free of timer instance object. A simplistic fix is to make each ioctl exclusive. We have already tread_sem for controlling the tread, and extend this as a global mutex to be applied to each ioctl. The downside is, of course, the worse concurrency. But these ioctls aren't to be parallel accessible, in anyway, so it should be fine to serialize there. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Cc: Signed-off-by: Takashi Iwai Change-Id: I1ac52f1cba5e7408fd88c8fc1c30ca2e83967ebb Bug: 28694392 --- sound/core/timer.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index 275b64bf5fd744..caf09280f4d5d5 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -73,7 +73,7 @@ struct snd_timer_user { struct timespec tstamp; /* trigger tstamp */ wait_queue_head_t qchange_sleep; struct fasync_struct *fasync; - struct mutex tread_sem; + struct mutex ioctl_lock; }; /* list of timers */ @@ -1269,7 +1269,7 @@ static int snd_timer_user_open(struct inode *inode, struct file *file) return -ENOMEM; spin_lock_init(&tu->qlock); init_waitqueue_head(&tu->qchange_sleep); - mutex_init(&tu->tread_sem); + mutex_init(&tu->ioctl_lock); tu->ticks = 1; tu->queue_size = 128; tu->queue = kmalloc(tu->queue_size * sizeof(struct snd_timer_read), @@ -1289,8 +1289,10 @@ static int snd_timer_user_release(struct inode *inode, struct file *file) if (file->private_data) { tu = file->private_data; file->private_data = NULL; + mutex_lock(&tu->ioctl_lock); if (tu->timeri) snd_timer_close(tu->timeri); + mutex_unlock(&tu->ioctl_lock); kfree(tu->queue); kfree(tu->tqueue); kfree(tu); @@ -1528,7 +1530,6 @@ static int snd_timer_user_tselect(struct file *file, int err = 0; tu = file->private_data; - mutex_lock(&tu->tread_sem); if (tu->timeri) { snd_timer_close(tu->timeri); tu->timeri = NULL; @@ -1572,7 +1573,6 @@ static int snd_timer_user_tselect(struct file *file, } __err: - mutex_unlock(&tu->tread_sem); return err; } @@ -1786,7 +1786,7 @@ enum { SNDRV_TIMER_IOCTL_PAUSE_OLD = _IO('T', 0x23), }; -static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, +static long __snd_timer_user_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_timer_user *tu; @@ -1803,17 +1803,11 @@ static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, { int xarg; - mutex_lock(&tu->tread_sem); - if (tu->timeri) { /* too late */ - mutex_unlock(&tu->tread_sem); + if (tu->timeri) /* too late */ return -EBUSY; - } - if (get_user(xarg, p)) { - mutex_unlock(&tu->tread_sem); + if (get_user(xarg, p)) return -EFAULT; - } tu->tread = xarg ? 
1 : 0; - mutex_unlock(&tu->tread_sem); return 0; } case SNDRV_TIMER_IOCTL_GINFO: @@ -1846,6 +1840,18 @@ static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, return -ENOTTY; } +static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct snd_timer_user *tu = file->private_data; + long ret; + + mutex_lock(&tu->ioctl_lock); + ret = __snd_timer_user_ioctl(file, cmd, arg); + mutex_unlock(&tu->ioctl_lock); + return ret; +} + static int snd_timer_user_fasync(int fd, struct file * file, int on) { struct snd_timer_user *tu; From d519f95a16e78bbe32986f5d3ccec79f0e67ab39 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Thu, 11 Feb 2016 19:37:27 +0000 Subject: [PATCH 293/420] UPSTREAM: af_unix: Guard against other == sk in unix_dgram_sendmsg (cherry picked from commit a5527dda344fff0514b7989ef7a755729769daa1) The unix_dgram_sendmsg routine use the following test if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { to determine if sk and other are in an n:1 association (either established via connect or by using sendto to send messages to an unrelated socket identified by address). This isn't correct as the specified address could have been bound to the sending socket itself or because this socket could have been connected to itself by the time of the unix_peer_get but disconnected before the unix_state_lock(other). In both cases, the if-block would be entered despite other == sk which might either block the sender unintentionally or lead to trying to unlock the same spin lock twice for a non-blocking send. Add a other != sk check to guard against this. Fixes: 7d267278a9ec ("unix: avoid use-after-free in ep_remove_wait_queue") Reported-By: Philipp Hahn Signed-off-by: Rainer Weikusat Tested-by: Philipp Hahn Signed-off-by: David S. Miller Fixes: Change-Id: Ia374ee061195088f8c777940baa75cedbe897f4e ("UPSTREAM: unix: avoid use-after-free in ep_remove_wait_queue") Change-Id: I4ebef6a390df3487903b166b837e34c653e01cb2 Signed-off-by: Amit Pundir --- net/unix/af_unix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b11562ed376c7c..7476ace9bf6778 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1706,7 +1706,12 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); From 5d75a20fb753fd18e410148a035dbf8b3ebfcf0b Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Tue, 16 Aug 2016 12:16:07 -0700 Subject: [PATCH 294/420] Android: MMC/UFS IO Latency Histograms. This patch adds a new sysfs node (latency_hist) and reports IO (svc time) latency histograms. Disabled by default, can be enabled by echoing 0 into latency_hist, stats can be cleared by writing 2 into latency_hist. 
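A rough userspace sketch of driving the new node, using the values the patch's own code defines (0 disables, 1 enables, 2 zeroes the counters); the sysfs path below is only an example, since the attribute hangs off the MMC host class device or the UFS controller's device:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Example path only; adjust to the host device on the target system. */
    #define LAT_HIST "/sys/class/mmc_host/mmc0/latency_hist"

    static int write_str(const char *path, const char *val)
    {
        int fd = open(path, O_WRONLY);

        if (fd < 0 || write(fd, val, strlen(val)) < 0) {
            perror(path);
            if (fd >= 0)
                close(fd);
            return -1;
        }
        return close(fd);
    }

    int main(void)
    {
        char buf[4096];
        int fd, n;

        write_str(LAT_HIST, "1");   /* BLK_IO_LAT_HIST_ENABLE */
        sleep(10);                  /* let some I/O complete */

        fd = open(LAT_HIST, O_RDONLY);
        if (fd >= 0) {
            n = read(fd, buf, sizeof(buf) - 1);
            if (n > 0) {
                buf[n] = '\0';
                fputs(buf, stdout); /* read and write histograms */
            }
            close(fd);
        }

        write_str(LAT_HIST, "2");   /* BLK_IO_LAT_HIST_ZERO */
        return 0;
    }
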
Bug: 30677035 Change-Id: Ic6c2ef660aa686e2b32142af4769a3b9e1d0a172 Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 80 +++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.c | 66 +++++++++++++++++++++++++++++++- drivers/mmc/core/host.c | 5 ++- drivers/mmc/core/host.h | 5 +++ drivers/scsi/ufs/ufshcd.c | 80 +++++++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.h | 3 ++ include/linux/blkdev.h | 76 +++++++++++++++++++++++++++++++++++++ include/linux/mmc/core.h | 2 + include/linux/mmc/host.h | 4 ++ 9 files changed, 318 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 0421b53e6431fe..ea05c23cca16dc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3321,3 +3321,83 @@ int __init blk_dev_init(void) return 0; } + +/* + * Blk IO latency support. We want this to be as cheap as possible, so doing + * this lockless (and avoiding atomics), a few off by a few errors in this + * code is not harmful, and we don't want to do anything that is + * perf-impactful. + * TODO : If necessary, we can make the histograms per-cpu and aggregate + * them when printing them out. + */ +void +blk_zero_latency_hist(struct io_latency_state *s) +{ + memset(s->latency_y_axis_read, 0, + sizeof(s->latency_y_axis_read)); + memset(s->latency_y_axis_write, 0, + sizeof(s->latency_y_axis_write)); + s->latency_reads_elems = 0; + s->latency_writes_elems = 0; +} + +ssize_t +blk_latency_hist_show(struct io_latency_state *s, char *buf) +{ + int i; + int bytes_written = 0; + int pct, num_elem; + u_int64_t elem; + + num_elem = s->latency_reads_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Read Latency Histogram (n = %d):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_read[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_read[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + num_elem = s->latency_writes_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Write Latency Histogram (n = %d):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_write[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_write[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + return bytes_written; +} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index ee68bf50b90b53..4dd3ad85f5c2b0 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -160,6 +160,17 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); + if (mrq->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = 
ktime_us_delta(completion, + mrq->io_start); + blk_update_latency_hist(&host->io_lat_s, + (mrq->data->flags & MMC_DATA_READ), + delta_us); + } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -546,6 +557,11 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { + if (host->latency_hist_enabled) { + areq->mrq->io_start = ktime_get(); + areq->mrq->lat_hist_enabled = 1; + } else + areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1782,7 +1798,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2740,6 +2756,54 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + + return blk_latency_hist_show(&host->io_lat_s, buf); +} + +/* + * Values permitted 0, 1, 2. + * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&host->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + host->latency_hist_enabled = value; + return count; +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +void +mmc_latency_hist_sysfs_init(struct mmc_host *host) +{ + if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) + dev_err(&host->class_dev, + "Failed to create latency_hist sysfs entry\n"); +} + +void +mmc_latency_hist_sysfs_exit(struct mmc_host *host) +{ + device_remove_file(&host->class_dev, &dev_attr_latency_hist); +} + subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 69664ec2276960..f33b50d1d726b6 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -30,8 +30,6 @@ #include "core.h" #include "host.h" -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - static void mmc_host_classdev_release(struct device *dev) { struct mmc_host *host = cls_dev_to_mmc_host(dev); @@ -558,6 +556,7 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif mmc_host_clk_sysfs_init(host); + mmc_latency_hist_sysfs_init(host); mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) @@ -587,6 +586,8 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif + mmc_latency_hist_sysfs_exit(host); + device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index f2ab9e5781265c..4dcf49937ecd42 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,8 +12,13 @@ #define _MMC_CORE_HOST_H #include +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + int mmc_register_host_class(void); void mmc_unregister_host_class(void); +void mmc_latency_hist_sysfs_init(struct mmc_host *host); +void mmc_latency_hist_sysfs_exit(struct mmc_host *host); + #endif diff 
--git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 605ca60e8a10da..64b1b2ba3a445b 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,6 +39,7 @@ #include #include +#include #include "ufshcd.h" #include "unipro.h" @@ -1313,6 +1314,16 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } + /* IO svc time latency histogram */ + if (hba != NULL && cmd->request != NULL) { + if (hba->latency_hist_enabled && + (cmd->request->cmd_type == REQ_TYPE_FS)) { + cmd->request->lat_hist_io_start = ktime_get(); + cmd->request->lat_hist_enabled = 1; + } else + cmd->request->lat_hist_enabled = 0; + } + WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3051,6 +3062,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; + struct request *req; /* Resetting interrupt aggregation counters first and reading the * DOOR_BELL afterward allows us to handle all the completed requests. @@ -3074,6 +3086,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); + req = cmd->request; + if (req) { + /* Update IO svc time latency histogram */ + if (req->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + req->lat_hist_io_start); + /* rq_data_dir() => true if WRITE */ + blk_update_latency_hist(&hba->io_lat_s, + (rq_data_dir(req) == READ), + delta_us); + } + } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5248,6 +5276,54 @@ int ufshcd_shutdown(struct ufs_hba *hba) } EXPORT_SYMBOL(ufshcd_shutdown); +/* + * Values permitted 0, 1, 2. 
+ * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&hba->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + hba->latency_hist_enabled = value; + return count; +} + +ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return blk_latency_hist_show(&hba->io_lat_s, buf); +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +static void +ufshcd_init_latency_hist(struct ufs_hba *hba) +{ + if (device_create_file(hba->dev, &dev_attr_latency_hist)) + dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); +} + +static void +ufshcd_exit_latency_hist(struct ufs_hba *hba) +{ + device_create_file(hba->dev, &dev_attr_latency_hist); +} + /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5263,6 +5339,7 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5552,6 +5629,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); + ufshcd_init_latency_hist(hba); + /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5566,6 +5645,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 4a574aa458557a..241810c8309927 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -473,6 +473,9 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; + + int latency_hist_enabled; + struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aac0f9ea952ad5..a5ceda29ab1b46 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -209,6 +209,9 @@ struct request { /* for bidi */ struct request *next_rq; + + ktime_t lat_hist_io_start; + int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1622,6 +1625,79 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); + +/* + * X-axis for IO latency histogram support. 
+ */ +static const u_int64_t latency_x_axis_us[] = { + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1200, + 1400, + 1600, + 1800, + 2000, + 2500, + 3000, + 4000, + 5000, + 6000, + 7000, + 9000, + 10000 +}; + +#define BLK_IO_LAT_HIST_DISABLE 0 +#define BLK_IO_LAT_HIST_ENABLE 1 +#define BLK_IO_LAT_HIST_ZERO 2 + +struct io_latency_state { + u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_reads_elems; + u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_writes_elems; +}; + +static inline void +blk_update_latency_hist(struct io_latency_state *s, + int read, + u_int64_t delta_us) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { + if (delta_us < (u_int64_t)latency_x_axis_us[i]) { + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + break; + } + } + if (i == ARRAY_SIZE(latency_x_axis_us)) { + /* Overflowed the histogram */ + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + } + if (read) + s->latency_reads_elems++; + else + s->latency_writes_elems++; +} + +void blk_zero_latency_hist(struct io_latency_state *s); +ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index f206e29f94d72c..649ec17d48ae62 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -135,6 +135,8 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; + ktime_t io_start; + int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index bf2252a6fe3afd..d456d524ce65e6 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -384,6 +385,9 @@ struct mmc_host { } embedded_sdio_data; #endif + int latency_hist_enabled; + struct io_latency_state io_lat_s; + unsigned long private[0] ____cacheline_aligned; }; From 0b0a3c9ffcf2d249e2dbe5f226974de439d6f332 Mon Sep 17 00:00:00 2001 From: Mohamad Ayyash Date: Thu, 25 Aug 2016 01:00:06 +0000 Subject: [PATCH 295/420] Revert "Android: MMC/UFS IO Latency Histograms." This reverts commit 5d75a20fb753fd18e410148a035dbf8b3ebfcf0b. Change-Id: I291e7bd0ddf6e45d2e3bf0d940df79e357efe421 --- block/blk-core.c | 80 --------------------------------------- drivers/mmc/core/core.c | 66 +------------------------------- drivers/mmc/core/host.c | 5 +-- drivers/mmc/core/host.h | 5 --- drivers/scsi/ufs/ufshcd.c | 80 --------------------------------------- drivers/scsi/ufs/ufshcd.h | 3 -- include/linux/blkdev.h | 76 ------------------------------------- include/linux/mmc/core.h | 2 - include/linux/mmc/host.h | 4 -- 9 files changed, 3 insertions(+), 318 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index ea05c23cca16dc..0421b53e6431fe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3321,83 +3321,3 @@ int __init blk_dev_init(void) return 0; } - -/* - * Blk IO latency support. We want this to be as cheap as possible, so doing - * this lockless (and avoiding atomics), a few off by a few errors in this - * code is not harmful, and we don't want to do anything that is - * perf-impactful. - * TODO : If necessary, we can make the histograms per-cpu and aggregate - * them when printing them out. 
- */ -void -blk_zero_latency_hist(struct io_latency_state *s) -{ - memset(s->latency_y_axis_read, 0, - sizeof(s->latency_y_axis_read)); - memset(s->latency_y_axis_write, 0, - sizeof(s->latency_y_axis_write)); - s->latency_reads_elems = 0; - s->latency_writes_elems = 0; -} - -ssize_t -blk_latency_hist_show(struct io_latency_state *s, char *buf) -{ - int i; - int bytes_written = 0; - int pct, num_elem; - u_int64_t elem; - - num_elem = s->latency_reads_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Read Latency Histogram (n = %d):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_read[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_read[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); - } - num_elem = s->latency_writes_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Write Latency Histogram (n = %d):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_write[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_write[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); - } - return bytes_written; -} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 4dd3ad85f5c2b0..ee68bf50b90b53 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -160,17 +160,6 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); - if (mrq->lat_hist_enabled) { - ktime_t completion; - u_int64_t delta_us; - - completion = ktime_get(); - delta_us = ktime_us_delta(completion, - mrq->io_start); - blk_update_latency_hist(&host->io_lat_s, - (mrq->data->flags & MMC_DATA_READ), - delta_us); - } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -557,11 +546,6 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { - if (host->latency_hist_enabled) { - areq->mrq->io_start = ktime_get(); - areq->mrq->lat_hist_enabled = 1; - } else - areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1798,7 +1782,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2756,54 +2740,6 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } -static ssize_t -latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mmc_host *host = cls_dev_to_mmc_host(dev); - - return blk_latency_hist_show(&host->io_lat_s, buf); -} - -/* - * Values permitted 0, 1, 2. 
- * 0 -> Disable IO latency histograms (default) - * 1 -> Enable IO latency histograms - * 2 -> Zero out IO latency histograms - */ -static ssize_t -latency_hist_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mmc_host *host = cls_dev_to_mmc_host(dev); - long value; - - if (kstrtol(buf, 0, &value)) - return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&host->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || - value == BLK_IO_LAT_HIST_DISABLE) - host->latency_hist_enabled = value; - return count; -} - -static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, - latency_hist_show, latency_hist_store); - -void -mmc_latency_hist_sysfs_init(struct mmc_host *host) -{ - if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) - dev_err(&host->class_dev, - "Failed to create latency_hist sysfs entry\n"); -} - -void -mmc_latency_hist_sysfs_exit(struct mmc_host *host) -{ - device_remove_file(&host->class_dev, &dev_attr_latency_hist); -} - subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index f33b50d1d726b6..69664ec2276960 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -30,6 +30,8 @@ #include "core.h" #include "host.h" +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + static void mmc_host_classdev_release(struct device *dev) { struct mmc_host *host = cls_dev_to_mmc_host(dev); @@ -556,7 +558,6 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif mmc_host_clk_sysfs_init(host); - mmc_latency_hist_sysfs_init(host); mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) @@ -586,8 +587,6 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif - mmc_latency_hist_sysfs_exit(host); - device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 4dcf49937ecd42..f2ab9e5781265c 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,13 +12,8 @@ #define _MMC_CORE_HOST_H #include -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - int mmc_register_host_class(void); void mmc_unregister_host_class(void); -void mmc_latency_hist_sysfs_init(struct mmc_host *host); -void mmc_latency_hist_sysfs_exit(struct mmc_host *host); - #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 64b1b2ba3a445b..605ca60e8a10da 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,7 +39,6 @@ #include #include -#include #include "ufshcd.h" #include "unipro.h" @@ -1314,16 +1313,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } - /* IO svc time latency histogram */ - if (hba != NULL && cmd->request != NULL) { - if (hba->latency_hist_enabled && - (cmd->request->cmd_type == REQ_TYPE_FS)) { - cmd->request->lat_hist_io_start = ktime_get(); - cmd->request->lat_hist_enabled = 1; - } else - cmd->request->lat_hist_enabled = 0; - } - WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3062,7 +3051,6 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; - struct request *req; /* Resetting interrupt aggregation counters first and reading the * DOOR_BELL afterward allows us to handle all the completed requests. 
@@ -3086,22 +3074,6 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); - req = cmd->request; - if (req) { - /* Update IO svc time latency histogram */ - if (req->lat_hist_enabled) { - ktime_t completion; - u_int64_t delta_us; - - completion = ktime_get(); - delta_us = ktime_us_delta(completion, - req->lat_hist_io_start); - /* rq_data_dir() => true if WRITE */ - blk_update_latency_hist(&hba->io_lat_s, - (rq_data_dir(req) == READ), - delta_us); - } - } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5276,54 +5248,6 @@ int ufshcd_shutdown(struct ufs_hba *hba) } EXPORT_SYMBOL(ufshcd_shutdown); -/* - * Values permitted 0, 1, 2. - * 0 -> Disable IO latency histograms (default) - * 1 -> Enable IO latency histograms - * 2 -> Zero out IO latency histograms - */ -static ssize_t -latency_hist_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ufs_hba *hba = dev_get_drvdata(dev); - long value; - - if (kstrtol(buf, 0, &value)) - return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&hba->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || - value == BLK_IO_LAT_HIST_DISABLE) - hba->latency_hist_enabled = value; - return count; -} - -ssize_t -latency_hist_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct ufs_hba *hba = dev_get_drvdata(dev); - - return blk_latency_hist_show(&hba->io_lat_s, buf); -} - -static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, - latency_hist_show, latency_hist_store); - -static void -ufshcd_init_latency_hist(struct ufs_hba *hba) -{ - if (device_create_file(hba->dev, &dev_attr_latency_hist)) - dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); -} - -static void -ufshcd_exit_latency_hist(struct ufs_hba *hba) -{ - device_create_file(hba->dev, &dev_attr_latency_hist); -} - /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5339,7 +5263,6 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); - ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5629,8 +5552,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); - ufshcd_init_latency_hist(hba); - /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5645,7 +5566,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); - ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 241810c8309927..4a574aa458557a 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -473,9 +473,6 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; - - int latency_hist_enabled; - struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. 
Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a5ceda29ab1b46..aac0f9ea952ad5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -209,9 +209,6 @@ struct request { /* for bidi */ struct request *next_rq; - - ktime_t lat_hist_io_start; - int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1625,79 +1622,6 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); - -/* - * X-axis for IO latency histogram support. - */ -static const u_int64_t latency_x_axis_us[] = { - 100, - 200, - 300, - 400, - 500, - 600, - 700, - 800, - 900, - 1000, - 1200, - 1400, - 1600, - 1800, - 2000, - 2500, - 3000, - 4000, - 5000, - 6000, - 7000, - 9000, - 10000 -}; - -#define BLK_IO_LAT_HIST_DISABLE 0 -#define BLK_IO_LAT_HIST_ENABLE 1 -#define BLK_IO_LAT_HIST_ZERO 2 - -struct io_latency_state { - u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_reads_elems; - u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_writes_elems; -}; - -static inline void -blk_update_latency_hist(struct io_latency_state *s, - int read, - u_int64_t delta_us) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { - if (delta_us < (u_int64_t)latency_x_axis_us[i]) { - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; - break; - } - } - if (i == ARRAY_SIZE(latency_x_axis_us)) { - /* Overflowed the histogram */ - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; - } - if (read) - s->latency_reads_elems++; - else - s->latency_writes_elems++; -} - -void blk_zero_latency_hist(struct io_latency_state *s); -ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); - #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 649ec17d48ae62..f206e29f94d72c 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -135,8 +135,6 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; - ktime_t io_start; - int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index d456d524ce65e6..bf2252a6fe3afd 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -385,9 +384,6 @@ struct mmc_host { } embedded_sdio_data; #endif - int latency_hist_enabled; - struct io_latency_state io_lat_s; - unsigned long private[0] ____cacheline_aligned; }; From 6a8a0e19913298e23a004efca206447fe1119407 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 24 Aug 2016 11:52:17 +0530 Subject: [PATCH 296/420] sched/walt: use do_div instead of division operator Use do_div() instead of "/" operator to fix undefined references to "__aeabi_uldivmod" build error for ARCH=arm. Also in TP_fast_assign(), along with do_div() usage, replace "," with ";" which would have resulted in a syntax error (!), because '#define TP_fast_assign(args...) args' would have stripped off the "," and left white space between these two assignments after CPP phase. 
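For readers unfamiliar with do_div(): it divides a 64-bit dividend by a 32-bit divisor without pulling in the libgcc helpers, but its calling convention is unusual, the dividend is modified in place to hold the quotient and the macro evaluates to the remainder. A minimal sketch of the same kind of fixed-point scaling, not taken from this patch and with an invented function name:

#include <asm/div64.h>		/* do_div() */
#include <linux/types.h>	/* u32, u64 */

/*
 * Illustrative helper (name made up for this sketch): scale a demand
 * value into a 10-bit fixed-point average of the window.  do_div()
 * modifies its 64-bit first argument in place to the quotient and
 * evaluates to the 32-bit remainder, so no __aeabi_uldivmod call is
 * emitted on 32-bit ARM.
 */
static u64 walt_scaled_demand(u64 demand, u32 walt_ravg_window)
{
	u64 avg = demand << 10;

	do_div(avg, walt_ravg_window);	/* avg now holds (demand << 10) / window */
	return avg;
}
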
Change-Id: I095f9cfb4dd9d58ef20cbb9c58b0711be6df9da3 Signed-off-by: Amit Pundir --- include/trace/events/sched.h | 3 ++- kernel/sched/sched.h | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 6505343ba812da..dec3688c6ef996 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -1058,7 +1058,8 @@ TRACE_EVENT(walt_update_history, __entry->samples = samples; __entry->evt = evt; __entry->demand = p->ravg.demand; - __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window, + __entry->walt_avg = (__entry->demand << 10); + do_div(__entry->walt_avg, walt_ravg_window); __entry->pelt_avg = p->se.avg.util_avg; memcpy(__entry->hist, p->ravg.sum_history, RAVG_HIST_SIZE_MAX * sizeof(u32)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8e37ab8dbd09ef..fa0e65b00a5303 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1518,9 +1518,10 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long capacity = capacity_orig_of(cpu); #ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) - util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) / - walt_ravg_window; + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(util, walt_ravg_window); + } #endif delta += util; if (delta < 0) From e774b3b5ef9fe1c772eb2c23a69f5eef07383c51 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 25 Aug 2016 11:06:37 +0530 Subject: [PATCH 297/420] sched/walt: include missing header for arm_timer_read_counter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include clocksource/arm_arch_timer.h to fix implicit function declaration of ‘arch_timer_read_counter’ build error for ARCH=arm. Signed-off-by: Amit Pundir --- kernel/sched/walt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index d9d09914ce30f3..07b7f84b37e22d 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "sched.h" #include "walt.h" From 3ec17c6dacc04032d679d4314cc2b8ef93c4f48f Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 4 May 2016 18:56:45 -0700 Subject: [PATCH 298/420] arm: Fix build error "conflicting types for 'scale_cpu_capacity'" Commit "arm: Update arch_scale_cpu_capacity() to reflect change to define" introduced a dependency on struct sched_domain in arch/arm/include/asm/topologoy.h, but that structure is only currently defined if CONFIG_CPU_FREQ is enabled, which causes include/linux/cpufreq.h to get pulled in which defines it. Include regardless of CONFIG_CPU_FREQ so struct sched_domain is always defined. 
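The error message follows from a general C scoping rule: a struct tag that first appears inside a prototype's parameter list only has prototype scope, so a later definition compiled where the real struct is visible has a conflicting type. A minimal illustration of that rule (not kernel code; the patch fixes it by always including the header rather than by adding a forward declaration):

/*
 * Sketch only.  Without the forward declaration below (or an #include
 * that declares the type), gcc warns that 'struct sched_domain' is
 * declared inside the parameter list, and a definition that later sees
 * the real struct triggers "conflicting types for 'scale_cpu_capacity'".
 */
struct sched_domain;

unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
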
Fixes: Change-Id: I372bd5e4c1e203428d72b18c8a806b06f3567ef6 ("arm: Update arch_scale_cpu_capacity() to reflect change to define") Signed-off-by: Steve Muckle Signed-off-by: Amit Pundir --- arch/arm/include/asm/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 94d3265019cb5c..7c8fc955e0df64 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -3,6 +3,7 @@ #ifdef CONFIG_ARM_CPU_TOPOLOGY +#include #include struct cputopo_arm { @@ -25,7 +26,6 @@ void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef CONFIG_CPU_FREQ -#include #define arch_scale_freq_capacity cpufreq_scale_freq_capacity #endif #define arch_scale_cpu_capacity scale_cpu_capacity From 90aeadf8ca3d93bd2c7a6f3eecf9016c95141b29 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Thu, 2 Jun 2016 12:18:08 +0000 Subject: [PATCH 299/420] arm: Fix #if/#ifdef typo in topology.c Probably a typo in arch/arm/kernel/topology.c This patch fixes the warning... arch/arm/kernel/topology.c: In function 'scale_cpu_capacity': arch/arm/kernel/topology.c:47:5: warning: "CONFIG_CPU_FREQ" is not defined [-Wundef] Fixes: Change-Id: If5e9e0ba8ff5a5d3236b373dbce8c72ea71b5e18 ("arm: Enable max freq invariant scheduler load-tracking and capacity support") Signed-off-by: Jon Medhurst Signed-off-by: Amit Pundir --- arch/arm/kernel/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 18ea2fd925fa56..7ee74a02658295 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -44,7 +44,7 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale); unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { -#if CONFIG_CPU_FREQ +#ifdef CONFIG_CPU_FREQ unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; From 79becfb7dbd491ad4cfb01d471c9c821bdb5817d Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 29 Aug 2016 20:31:38 +0530 Subject: [PATCH 300/420] =?UTF-8?q?kernel:=20kcov:=20include=20missing=20h?= =?UTF-8?q?eader=20for=20=E2=80=98struct=20task=5Fstruct=E2=80=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include linux/sched.h to fix "dereferencing pointer to incomplete type ‘struct task_struct’" build error for ARCH=x86_64. Signed-off-by: Amit Pundir --- kernel/kcov.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kcov.c b/kernel/kcov.c index 3efbee0834a85d..e228cb1894d5f2 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * kcov descriptor (one per opened debugfs file). From 7ad4c18c51bc595adb17eb737e0250bc7ce2e15d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:33:55 -0700 Subject: [PATCH 301/420] UPSTREAM: USB: mct_u232: add sanity checking in probe commit 4e9a0b05257f29cf4b75f3209243ed71614d062e upstream. An attack using the lack of sanity checking in probe is known. This patch checks for the existence of a second port. 
CVE-2016-3136 BUG: 28242610 Signed-off-by: Oliver Neukum [johan: add error message ] Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I284ad648c2087c34a098d67e0cc6d948a568413c (cherry picked from commit 0cd0c38fa2cef1cad28a2a9ecf9a60cff4cd4038) --- drivers/usb/serial/mct_u232.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c index fd707d6a10e263..89726f702202f4 100644 --- a/drivers/usb/serial/mct_u232.c +++ b/drivers/usb/serial/mct_u232.c @@ -376,14 +376,21 @@ static void mct_u232_msr_to_state(struct usb_serial_port *port, static int mct_u232_port_probe(struct usb_serial_port *port) { + struct usb_serial *serial = port->serial; struct mct_u232_private *priv; + /* check first to simplify error handling */ + if (!serial->port[1] || !serial->port[1]->interrupt_in_urb) { + dev_err(&port->dev, "expected endpoint missing\n"); + return -ENODEV; + } + priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; /* Use second interrupt-in endpoint for reading. */ - priv->read_urb = port->serial->port[1]->interrupt_in_urb; + priv->read_urb = serial->port[1]->interrupt_in_urb; priv->read_urb->context = port; spin_lock_init(&priv->lock); From d6a9a74487e86b528c44965f871de75671b6adb0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 4 Sep 2015 15:42:45 -0700 Subject: [PATCH 302/420] UPSTREAM: capabilities: ambient capabilities Credit where credit is due: this idea comes from Christoph Lameter with a lot of valuable input from Serge Hallyn. This patch is heavily based on Christoph's patch. ===== The status quo ===== On Linux, there are a number of capabilities defined by the kernel. To perform various privileged tasks, processes can wield capabilities that they hold. Each task has four capability masks: effective (pE), permitted (pP), inheritable (pI), and a bounding set (X). When the kernel checks for a capability, it checks pE. The other capability masks serve to modify what capabilities can be in pE. Any task can remove capabilities from pE, pP, or pI at any time. If a task has a capability in pP, it can add that capability to pE and/or pI. If a task has CAP_SETPCAP, then it can add any capability to pI, and it can remove capabilities from X. Tasks are not the only things that can have capabilities; files can also have capabilities. A file can have no capabilty information at all [1]. If a file has capability information, then it has a permitted mask (fP) and an inheritable mask (fI) as well as a single effective bit (fE) [2]. File capabilities modify the capabilities of tasks that execve(2) them. A task that successfully calls execve has its capabilities modified for the file ultimately being excecuted (i.e. the binary itself if that binary is ELF or for the interpreter if the binary is a script.) [3] In the capability evolution rules, for each mask Z, pZ represents the old value and pZ' represents the new value. The rules are: pP' = (X & fP) | (pI & fI) pI' = pI pE' = (fE ? pP' : 0) X is unchanged For setuid binaries, fP, fI, and fE are modified by a moderately complicated set of rules that emulate POSIX behavior. Similarly, if euid == 0 or ruid == 0, then fP, fI, and fE are modified differently (primary, fP and fI usually end up being the full set). For nonroot users executing binaries with neither setuid nor file caps, fI and fP are empty and fE is false. 
As an extra complication, if you execute a process as nonroot and fE is set, then the "secure exec" rules are in effect: AT_SECURE gets set, LD_PRELOAD doesn't work, etc. This is rather messy. We've learned that making any changes is dangerous, though: if a new kernel version allows an unprivileged program to change its security state in a way that persists cross execution of a setuid program or a program with file caps, this persistent state is surprisingly likely to allow setuid or file-capped programs to be exploited for privilege escalation. ===== The problem ===== Capability inheritance is basically useless. If you aren't root and you execute an ordinary binary, fI is zero, so your capabilities have no effect whatsoever on pP'. This means that you can't usefully execute a helper process or a shell command with elevated capabilities if you aren't root. On current kernels, you can sort of work around this by setting fI to the full set for most or all non-setuid executable files. This causes pP' = pI for nonroot, and inheritance works. No one does this because it's a PITA and it isn't even supported on most filesystems. If you try this, you'll discover that every nonroot program ends up with secure exec rules, breaking many things. This is a problem that has bitten many people who have tried to use capabilities for anything useful. ===== The proposed change ===== This patch adds a fifth capability mask called the ambient mask (pA). pA does what most people expect pI to do. pA obeys the invariant that no bit can ever be set in pA if it is not set in both pP and pI. Dropping a bit from pP or pI drops that bit from pA. This ensures that existing programs that try to drop capabilities still do so, with a complication. Because capability inheritance is so broken, setting KEEPCAPS, using setresuid to switch to nonroot uids, and then calling execve effectively drops capabilities. Therefore, setresuid from root to nonroot conditionally clears pA unless SECBIT_NO_SETUID_FIXUP is set. Processes that don't like this can re-add bits to pA afterwards. The capability evolution rules are changed: pA' = (file caps or setuid or setgid ? 0 : pA) pP' = (X & fP) | (pI & fI) | pA' pI' = pI pE' = (fE ? pP' : pA') X is unchanged If you are nonroot but you have a capability, you can add it to pA. If you do so, your children get that capability in pA, pP, and pE. For example, you can set pA = CAP_NET_BIND_SERVICE, and your children can automatically bind low-numbered ports. Hallelujah! Unprivileged users can create user namespaces, map themselves to a nonzero uid, and create both privileged (relative to their namespace) and unprivileged process trees. This is currently more or less impossible. Hallelujah! You cannot use pA to try to subvert a setuid, setgid, or file-capped program: if you execute any such program, pA gets cleared and the resulting evolution rules are unchanged by this patch. Users with nonzero pA are unlikely to unintentionally leak that capability. If they run programs that try to drop privileges, dropping privileges will still work. It's worth noting that the degree of paranoia in this patch could possibly be reduced without causing serious problems. Specifically, if we allowed pA to persist across executing non-pA-aware setuid binaries and across setresuid, then, naively, the only capabilities that could leak as a result would be the capabilities in pA, and any attacker *already* has those capabilities. 
This would make me nervous, though -- setuid binaries that tried to privilege-separate might fail to do so, and putting CAP_DAC_READ_SEARCH or CAP_DAC_OVERRIDE into pA could have unexpected side effects. (Whether these unexpected side effects would be exploitable is an open question.) I've therefore taken the more paranoid route. We can revisit this later. An alternative would be to require PR_SET_NO_NEW_PRIVS before setting ambient capabilities. I think that this would be annoying and would make granting otherwise unprivileged users minor ambient capabilities (CAP_NET_BIND_SERVICE or CAP_NET_RAW for example) much less useful than it is with this patch. ===== Footnotes ===== [1] Files that are missing the "security.capability" xattr or that have unrecognized values for that xattr end up with has_cap set to false. The code that does that appears to be complicated for no good reason. [2] The libcap capability mask parsers and formatters are dangerously misleading and the documentation is flat-out wrong. fE is *not* a mask; it's a single bit. This has probably confused every single person who has tried to use file capabilities. [3] Linux very confusingly processes both the script and the interpreter if applicable, for reasons that elude me. The results from thinking about a script's file capabilities and/or setuid bits are mostly discarded. Preliminary userspace code is here, but it needs updating: https://git.kernel.org/cgit/linux/kernel/git/luto/util-linux-playground.git/commit/?h=cap_ambient&id=7f5afbd175d2 Here is a test program that can be used to verify the functionality (from Christoph): /* * Test program for the ambient capabilities. This program spawns a shell * that allows running processes with a defined set of capabilities. * * (C) 2015 Christoph Lameter * Released under: GPL v3 or later. * * * Compile using: * * gcc -o ambient_test ambient_test.o -lcap-ng * * This program must have the following capabilities to run properly: * Permissions for CAP_NET_RAW, CAP_NET_ADMIN, CAP_SYS_NICE * * A command to equip the binary with the right caps is: * * setcap cap_net_raw,cap_net_admin,cap_sys_nice+p ambient_test * * * To get a shell with additional caps that can be inherited by other processes: * * ./ambient_test /bin/bash * * * Verifying that it works: * * From the bash spawed by ambient_test run * * cat /proc/$$/status * * and have a look at the capabilities. */ /* * Definitions from the kernel header files. These are going to be removed * when the /usr/include files have these defined. */ static void set_ambient_cap(int cap) { int rc; capng_get_caps_process(); rc = capng_update(CAPNG_ADD, CAPNG_INHERITABLE, cap); if (rc) { printf("Cannot add inheritable cap\n"); exit(2); } capng_apply(CAPNG_SELECT_CAPS); /* Note the two 0s at the end. Kernel checks for these */ if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0)) { perror("Cannot set cap"); exit(1); } } int main(int argc, char **argv) { int rc; set_ambient_cap(CAP_NET_RAW); set_ambient_cap(CAP_NET_ADMIN); set_ambient_cap(CAP_SYS_NICE); printf("Ambient_test forking shell\n"); if (execv(argv[1], argv + 1)) perror("Cannot exec"); return 0; } Signed-off-by: Christoph Lameter # Original author Signed-off-by: Andy Lutomirski Acked-by: Serge E. Hallyn Acked-by: Kees Cook Cc: Jonathan Corbet Cc: Aaron Jones Cc: Ted Ts'o Cc: Andrew G. 
Morgan Cc: Mimi Zohar Cc: Austin S Hemmelgarn Cc: Markku Savela Cc: Jarkko Sakkinen Cc: Michael Kerrisk Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 58319057b7847667f0c9585b9de0e8932b0fdb08) Bug: 31038224 Change-Id: Idab768286dffe9873a8ab0934f3824fa5905a06d Signed-off-by: Jorge Lucangeli Obes --- fs/proc/array.c | 5 +- include/linux/cred.h | 8 +++ include/uapi/linux/prctl.h | 7 +++ kernel/user_namespace.c | 1 + security/commoncap.c | 102 +++++++++++++++++++++++++++++++---- security/keys/process_keys.c | 1 + 6 files changed, 113 insertions(+), 11 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index cd3653e4f35c66..1189ee06d35493 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -305,7 +305,8 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; - kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, + cap_bset, cap_ambient; rcu_read_lock(); cred = __task_cred(p); @@ -313,12 +314,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_permitted = cred->cap_permitted; cap_effective = cred->cap_effective; cap_bset = cred->cap_bset; + cap_ambient = cred->cap_ambient; rcu_read_unlock(); render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); + render_cap_t(m, "CapAmb:\t", &cap_ambient); } static inline void task_seccomp(struct seq_file *m, struct task_struct *p) diff --git a/include/linux/cred.h b/include/linux/cred.h index b2d0820837c425..f17c70fd47d835 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -121,6 +121,7 @@ struct cred { kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ + kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ @@ -196,6 +197,13 @@ static inline void validate_process_creds(void) } #endif +static inline bool cap_ambient_invariant_ok(const struct cred *cred) +{ + return cap_issubset(cred->cap_ambient, + cap_intersect(cred->cap_permitted, + cred->cap_inheritable)); +} + /** * get_new_cred - Get a reference on a new set of credentials * @cred: The new credentials to reference diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 933ff2a9a032d5..3fc82ff5d2ddfc 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -188,4 +188,11 @@ struct prctl_mm_map { #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 +/* Control the ambient capability set */ +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa312b0dc3ec25..f2d77a1944e553 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -38,6 +38,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; + cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS 
key_put(cred->request_key_auth); diff --git a/security/commoncap.c b/security/commoncap.c index 7ae25198d8efe5..1e5b94360bece3 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -283,6 +283,16 @@ int cap_capset(struct cred *new, new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; + + /* + * Mask off ambient bits that are no longer both permitted and + * inheritable. + */ + new->cap_ambient = cap_intersect(new->cap_ambient, + cap_intersect(*permitted, + *inheritable)); + if (WARN_ON(!cap_ambient_invariant_ok(new))) + return -EINVAL; return 0; } @@ -363,6 +373,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, /* * pP' = (X & fP) | (pI & fI) + * The addition of pA' is handled later. */ new->cap_permitted.cap[i] = (new->cap_bset.cap[i] & permitted) | @@ -494,10 +505,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; - bool effective, has_cap = false; + bool effective, has_cap = false, is_setid; int ret; kuid_t root_uid; + if (WARN_ON(!cap_ambient_invariant_ok(old))) + return -EPERM; + effective = false; ret = get_file_caps(bprm, &effective, &has_cap); if (ret < 0) @@ -542,8 +556,9 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ - if ((!uid_eq(new->euid, old->uid) || - !gid_eq(new->egid, old->gid) || + is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid); + + if ((is_setid || !cap_issubset(new->cap_permitted, old->cap_permitted)) && bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { /* downgrade; they get no more than they had, and maybe less */ @@ -559,10 +574,28 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; + /* File caps or setid cancels ambient. */ + if (has_cap || is_setid) + cap_clear(new->cap_ambient); + + /* + * Now that we've computed pA', update pP' to give: + * pP' = (X & fP) | (pI & fI) | pA' + */ + new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); + + /* + * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, + * this is the same as pE' = (fE ? pP' : 0) | pA'. + */ if (effective) new->cap_effective = new->cap_permitted; else - cap_clear(new->cap_effective); + new->cap_effective = new->cap_ambient; + + if (WARN_ON(!cap_ambient_invariant_ok(new))) + return -EPERM; + bprm->cap_effective = effective; /* @@ -577,7 +610,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. 
*/ - if (!cap_isclear(new->cap_effective)) { + if (!cap_issubset(new->cap_effective, new->cap_ambient)) { if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || issecure(SECURE_NOROOT)) { @@ -588,6 +621,10 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + + if (WARN_ON(!cap_ambient_invariant_ok(new))) + return -EPERM; + return 0; } @@ -609,7 +646,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm) if (!uid_eq(cred->uid, root_uid)) { if (bprm->cap_effective) return 1; - if (!cap_isclear(cred->cap_permitted)) + if (!cap_issubset(cred->cap_permitted, cred->cap_ambient)) return 1; } @@ -711,10 +748,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) uid_eq(old->suid, root_uid)) && (!uid_eq(new->uid, root_uid) && !uid_eq(new->euid, root_uid) && - !uid_eq(new->suid, root_uid)) && - !issecure(SECURE_KEEP_CAPS)) { - cap_clear(new->cap_permitted); - cap_clear(new->cap_effective); + !uid_eq(new->suid, root_uid))) { + if (!issecure(SECURE_KEEP_CAPS)) { + cap_clear(new->cap_permitted); + cap_clear(new->cap_effective); + } + + /* + * Pre-ambient programs expect setresuid to nonroot followed + * by exec to drop capabilities. We should make sure that + * this remains the case. + */ + cap_clear(new->cap_ambient); } if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) cap_clear(new->cap_effective); @@ -944,6 +989,43 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); return commit_creds(new); + case PR_CAP_AMBIENT: + if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { + if (arg3 | arg4 | arg5) + return -EINVAL; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + cap_clear(new->cap_ambient); + return commit_creds(new); + } + + if (((!cap_valid(arg3)) | arg4 | arg5)) + return -EINVAL; + + if (arg2 == PR_CAP_AMBIENT_IS_SET) { + return !!cap_raised(current_cred()->cap_ambient, arg3); + } else if (arg2 != PR_CAP_AMBIENT_RAISE && + arg2 != PR_CAP_AMBIENT_LOWER) { + return -EINVAL; + } else { + if (arg2 == PR_CAP_AMBIENT_RAISE && + (!cap_raised(current_cred()->cap_permitted, arg3) || + !cap_raised(current_cred()->cap_inheritable, + arg3))) + return -EPERM; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + if (arg2 == PR_CAP_AMBIENT_RAISE) + cap_raise(new->cap_ambient, arg3); + else + cap_lower(new->cap_ambient, arg3); + return commit_creds(new); + } + default: /* No functionality available - continue with default */ return -ENOSYS; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index db91639c81e3af..7877e5cd4e2347 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -849,6 +849,7 @@ void key_change_session_keyring(struct callback_head *twork) new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; + new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; From 58fac225391079746ebeecaee1ac2c771d97fb57 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 29 Aug 2016 19:48:17 +0530 Subject: [PATCH 303/420] DEBUG: cpufreq: fix cpu_capacity tracing build for non-smp systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpu curr capacity can only be traced for SMP systems. 
Non-SMP builds will fail with: drivers/cpufreq/cpufreq.c: In function ‘cpufreq_freq_transition_begin’: drivers/cpufreq/cpufreq.c:438:22: error: implicit declaration of function ‘capacity_curr_of’ [-Werror=implicit-function-declaration] trace_cpu_capacity(capacity_curr_of(cpu), cpu); ^ Fixes: Change-Id: Icd0930d11068fcb7d2b6a9a48e7ed974904e1081 ("DEBUG: sched,cpufreq: add cpu_capacity change tracepoint") Signed-off-by: Amit Pundir --- drivers/cpufreq/cpufreq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 19c816bb9bdef6..def4f84f92e530 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -28,7 +28,9 @@ #include #include #include +#ifdef CONFIG_SMP #include +#endif #include /** @@ -405,7 +407,9 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { +#ifdef CONFIG_SMP int cpu; +#endif /* * Catch double invocations of _begin() which lead to self-deadlock. @@ -434,8 +438,10 @@ void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); +#ifdef CONFIG_SMP for_each_cpu(cpu, policy->cpus) trace_cpu_capacity(capacity_curr_of(cpu), cpu); +#endif cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } From 2d7a720ca4c70eccf7ad405bffd3fb8efe05c551 Mon Sep 17 00:00:00 2001 From: Yongqin Liu Date: Thu, 1 Sep 2016 22:06:04 +0530 Subject: [PATCH 304/420] ANDROID: base-cfg: enable SECCOMP config Enable seccomp config CONFIG_SECCOMP=y Otherwise we will get mediacode error like this on Android N: E /system/bin/mediaextractor: libminijail: prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER): Invalid argument Change-Id: I2477b6a2cfdded5c0ebf6ffbb6150b0e5fe2ba12 Signed-off-by: Yongqin Liu Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index f820d56b39ccd2..708e529ec0e26f 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -141,6 +141,7 @@ CONFIG_QUOTA=y CONFIG_RESOURCE_COUNTERS=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y +CONFIG_SECCOMP=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y From 52a321dbb3a7814ba141b902594571dff05646ed Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Apr 2016 14:06:48 +0100 Subject: [PATCH 305/420] UPSTREAM: assoc_array: don't call compare_object() on a node (cherry picked from commit 8d4a2ec1e0b41b0cf9a0c5cd4511da7f8e4f3de2) Changes since V1: fixed the description and added KASan warning. In assoc_array_insert_into_terminal_node(), we call the compare_object() method on all non-empty slots, even when they're not leaves, passing a pointer to an unexpected structure to compare_object(). Currently it causes an out-of-bound read access in keyring_compare_object detected by KASan (see below). The issue is easily reproduced with keyutils testsuite. Only call compare_object() when the slot is a leave. 
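The fix, in the hunk at the end of this message after the KASan report, is to guard the type-specific compare with the existing leaf test. A self-contained sketch of the idea, using a simplified tagged-pointer scheme rather than assoc_array's exact encoding:

#include <stdbool.h>
#include <stdint.h>

#define ENTRY_IS_META	1UL	/* low pointer bit set => internal node */

/* Leaf entries point at caller-owned objects; meta entries are internal
 * nodes and must never be handed to the type-specific compare. */
static bool entry_is_leaf(const void *entry)
{
	return entry && !((uintptr_t)entry & ENTRY_IS_META);
}

static bool entry_matches(const void *entry, const void *index_key,
			  bool (*compare_object)(const void *object,
						 const void *index_key))
{
	return entry_is_leaf(entry) && compare_object(entry, index_key);
}
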
KASan warning: ================================================================== BUG: KASAN: slab-out-of-bounds in keyring_compare_object+0x213/0x240 at addr ffff880060a6f838 Read of size 8 by task keyctl/1655 ============================================================================= BUG kmalloc-192 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in assoc_array_insert+0xfd0/0x3a60 age=69 cpu=1 pid=1647 ___slab_alloc+0x563/0x5c0 __slab_alloc+0x51/0x90 kmem_cache_alloc_trace+0x263/0x300 assoc_array_insert+0xfd0/0x3a60 __key_link_begin+0xfc/0x270 key_create_or_update+0x459/0xaf0 SyS_add_key+0x1ba/0x350 entry_SYSCALL_64_fastpath+0x12/0x76 INFO: Slab 0xffffea0001829b80 objects=16 used=8 fp=0xffff880060a6f550 flags=0x3fff8000004080 INFO: Object 0xffff880060a6f740 @offset=5952 fp=0xffff880060a6e5d1 Bytes b4 ffff880060a6f730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f740: d1 e5 a6 60 00 88 ff ff 0e 00 00 00 00 00 00 00 ...`............ Object ffff880060a6f750: 02 cf 8e 60 00 88 ff ff 02 c0 8e 60 00 88 ff ff ...`.......`.... Object ffff880060a6f760: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f770: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f790: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7c0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7d0: 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ CPU: 0 PID: 1655 Comm: keyctl Tainted: G B 4.5.0-rc4-kasan+ #291 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000000 000000001b2800b4 ffff880060a179e0 ffffffff81b60491 ffff88006c802900 ffff880060a6f740 ffff880060a17a10 ffffffff815e2969 ffff88006c802900 ffffea0001829b80 ffff880060a6f740 ffff880060a6e650 Call Trace: [] dump_stack+0x85/0xc4 [] print_trailer+0xf9/0x150 [] object_err+0x34/0x40 [] kasan_report_error+0x230/0x550 [] ? keyring_get_key_chunk+0x13e/0x210 [] __asan_report_load_n_noabort+0x5d/0x70 [] ? keyring_compare_object+0x213/0x240 [] keyring_compare_object+0x213/0x240 [] assoc_array_insert+0x86c/0x3a60 [] ? assoc_array_cancel_edit+0x70/0x70 [] ? __key_link_begin+0x20d/0x270 [] __key_link_begin+0xfc/0x270 [] key_create_or_update+0x459/0xaf0 [] ? trace_hardirqs_on+0xd/0x10 [] ? key_type_lookup+0xc0/0xc0 [] ? lookup_user_key+0x13d/0xcd0 [] ? memdup_user+0x53/0x80 [] SyS_add_key+0x1ba/0x350 [] ? key_get_type_from_user.constprop.6+0xa0/0xa0 [] ? retint_user+0x18/0x23 [] ? trace_hardirqs_on_caller+0x3fe/0x580 [] ? 
trace_hardirqs_on_thunk+0x17/0x19 [] entry_SYSCALL_64_fastpath+0x12/0x76 Memory state around the buggy address: ffff880060a6f700: fc fc fc fc fc fc fc fc 00 00 00 00 00 00 00 00 ffff880060a6f780: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc >ffff880060a6f800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff880060a6f880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff880060a6f900: fc fc fc fc fc fc 00 00 00 00 00 00 00 00 00 00 ================================================================== Signed-off-by: Jerome Marchand Signed-off-by: David Howells cc: stable@vger.kernel.org Change-Id: I903935a221a5b9fb14cec14ef64bd2b6fa8eb222 Bug: 30513364 --- lib/assoc_array.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 2404d03e251a64..03a77f4740c124 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -523,7 +523,9 @@ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, free_slot = i; continue; } - if (ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { + if (assoc_array_ptr_is_leaf(ptr) && + ops->compare_object(assoc_array_ptr_to_leaf(ptr), + index_key)) { pr_devel("replace in slot %d\n", i); edit->leaf_p = &node->slots[i]; edit->dead_leaf = node->slots[i]; From 090b7c197f61b24910bf468c136e0edcecd8d2df Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 29 Jul 2016 10:40:31 +0200 Subject: [PATCH 306/420] UPSTREAM: block: fix use-after-free in seq file (cherry picked from commit 77da160530dd1dc94f6ae15a981f24e5f0021e84) I got a KASAN report of use-after-free: ================================================================== BUG: KASAN: use-after-free in klist_iter_exit+0x61/0x70 at addr ffff8800b6581508 Read of size 8 by task trinity-c1/315 ============================================================================= BUG kmalloc-32 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in disk_seqf_start+0x66/0x110 age=144 cpu=1 pid=315 ___slab_alloc+0x4f1/0x520 __slab_alloc.isra.58+0x56/0x80 kmem_cache_alloc_trace+0x260/0x2a0 disk_seqf_start+0x66/0x110 traverse+0x176/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a INFO: Freed in disk_seqf_stop+0x42/0x50 age=160 cpu=1 pid=315 __slab_free+0x17a/0x2c0 kfree+0x20a/0x220 disk_seqf_stop+0x42/0x50 traverse+0x3b5/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a CPU: 1 PID: 315 Comm: trinity-c1 Tainted: G B 4.7.0+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 ffffea0002d96000 ffff880119b9f918 ffffffff81d6ce81 ffff88011a804480 ffff8800b6581500 ffff880119b9f948 ffffffff8146c7bd ffff88011a804480 ffffea0002d96000 ffff8800b6581500 fffffffffffffff4 ffff880119b9f970 Call Trace: [] dump_stack+0x65/0x84 [] print_trailer+0x10d/0x1a0 [] object_err+0x2f/0x40 [] kasan_report_error+0x221/0x520 [] __asan_report_load8_noabort+0x3e/0x40 [] klist_iter_exit+0x61/0x70 [] class_dev_iter_exit+0x9/0x10 [] disk_seqf_stop+0x3a/0x50 [] seq_read+0x4b2/0x11a0 [] proc_reg_read+0xbc/0x180 [] 
do_loop_readv_writev+0x134/0x210 [] do_readv_writev+0x565/0x660 [] vfs_readv+0x67/0xa0 [] do_preadv+0x126/0x170 [] SyS_preadv+0xc/0x10 This problem can occur in the following situation: open() - pread() - .seq_start() - iter = kmalloc() // succeeds - seqf->private = iter - .seq_stop() - kfree(seqf->private) - pread() - .seq_start() - iter = kmalloc() // fails - .seq_stop() - class_dev_iter_exit(seqf->private) // boom! old pointer As the comment in disk_seqf_stop() says, stop is called even if start failed, so we need to reinitialise the private pointer to NULL when seq iteration stops. An alternative would be to set the private pointer to NULL when the kmalloc() in disk_seqf_start() fails. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Acked-by: Tejun Heo Signed-off-by: Jens Axboe Change-Id: I07b33f4b38341f60a37806cdd45b0a0c3ab4d84d Bug: 30942273 --- block/genhd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/genhd.c b/block/genhd.c index a60cfb2c946246..802f1daec9793f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -829,6 +829,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) if (iter) { class_dev_iter_exit(iter); kfree(iter); + seqf->private = NULL; } } From f077f3cde2c4649dd5b63bdd33ca7f7fd86aed2f Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Thu, 25 Aug 2016 17:36:46 -0700 Subject: [PATCH 307/420] Android: MMC/UFS IO Latency Histograms. This patch adds a new sysfs node (latency_hist) and reports IO (svc time) latency histograms. Disabled by default, can be enabled by echoing 0 into latency_hist, stats can be cleared by writing 2 into latency_hist. This commit fixes the 32 bit build breakage in the previous commit. Tested on both 32 bit and 64 bit arm devices. Bug: 30677035 Change-Id: I06ecca8fe0e784a35738ee19e5ee7d860659e7a9 Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 82 +++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.c | 66 ++++++++++++++++++++++++++++++- drivers/mmc/core/host.c | 5 ++- drivers/mmc/core/host.h | 5 +++ drivers/scsi/ufs/ufshcd.c | 80 ++++++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.h | 3 ++ include/linux/blkdev.h | 76 ++++++++++++++++++++++++++++++++++++ include/linux/mmc/core.h | 2 + include/linux/mmc/host.h | 4 ++ 9 files changed, 320 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 0421b53e6431fe..db991eda91b6e1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -40,6 +40,8 @@ #include "blk-cgroup.h" #include "blk-mq.h" +#include + EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -3321,3 +3323,83 @@ int __init blk_dev_init(void) return 0; } + +/* + * Blk IO latency support. We want this to be as cheap as possible, so doing + * this lockless (and avoiding atomics), a few off by a few errors in this + * code is not harmful, and we don't want to do anything that is + * perf-impactful. + * TODO : If necessary, we can make the histograms per-cpu and aggregate + * them when printing them out. 
+ */ +void +blk_zero_latency_hist(struct io_latency_state *s) +{ + memset(s->latency_y_axis_read, 0, + sizeof(s->latency_y_axis_read)); + memset(s->latency_y_axis_write, 0, + sizeof(s->latency_y_axis_write)); + s->latency_reads_elems = 0; + s->latency_writes_elems = 0; +} + +ssize_t +blk_latency_hist_show(struct io_latency_state *s, char *buf) +{ + int i; + int bytes_written = 0; + u_int64_t num_elem, elem; + int pct; + + num_elem = s->latency_reads_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Read Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + num_elem = s->latency_writes_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Write Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + return bytes_written; +} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index ee68bf50b90b53..4dd3ad85f5c2b0 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -160,6 +160,17 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); + if (mrq->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + mrq->io_start); + blk_update_latency_hist(&host->io_lat_s, + (mrq->data->flags & MMC_DATA_READ), + delta_us); + } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -546,6 +557,11 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { + if (host->latency_hist_enabled) { + areq->mrq->io_start = ktime_get(); + areq->mrq->lat_hist_enabled = 1; + } else + areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1782,7 +1798,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2740,6 +2756,54 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + + return blk_latency_hist_show(&host->io_lat_s, buf); +} + +/* + * Values permitted 0, 1, 2. 
+ * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&host->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + host->latency_hist_enabled = value; + return count; +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +void +mmc_latency_hist_sysfs_init(struct mmc_host *host) +{ + if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) + dev_err(&host->class_dev, + "Failed to create latency_hist sysfs entry\n"); +} + +void +mmc_latency_hist_sysfs_exit(struct mmc_host *host) +{ + device_remove_file(&host->class_dev, &dev_attr_latency_hist); +} + subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 69664ec2276960..f33b50d1d726b6 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -30,8 +30,6 @@ #include "core.h" #include "host.h" -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - static void mmc_host_classdev_release(struct device *dev) { struct mmc_host *host = cls_dev_to_mmc_host(dev); @@ -558,6 +556,7 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif mmc_host_clk_sysfs_init(host); + mmc_latency_hist_sysfs_init(host); mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) @@ -587,6 +586,8 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif + mmc_latency_hist_sysfs_exit(host); + device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index f2ab9e5781265c..4dcf49937ecd42 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,8 +12,13 @@ #define _MMC_CORE_HOST_H #include +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + int mmc_register_host_class(void); void mmc_unregister_host_class(void); +void mmc_latency_hist_sysfs_init(struct mmc_host *host); +void mmc_latency_hist_sysfs_exit(struct mmc_host *host); + #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 605ca60e8a10da..64b1b2ba3a445b 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,6 +39,7 @@ #include #include +#include #include "ufshcd.h" #include "unipro.h" @@ -1313,6 +1314,16 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } + /* IO svc time latency histogram */ + if (hba != NULL && cmd->request != NULL) { + if (hba->latency_hist_enabled && + (cmd->request->cmd_type == REQ_TYPE_FS)) { + cmd->request->lat_hist_io_start = ktime_get(); + cmd->request->lat_hist_enabled = 1; + } else + cmd->request->lat_hist_enabled = 0; + } + WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3051,6 +3062,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; + struct request *req; /* Resetting interrupt aggregation counters first and reading the * DOOR_BELL afterward allows us to handle all the completed requests. 
@@ -3074,6 +3086,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); + req = cmd->request; + if (req) { + /* Update IO svc time latency histogram */ + if (req->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + req->lat_hist_io_start); + /* rq_data_dir() => true if WRITE */ + blk_update_latency_hist(&hba->io_lat_s, + (rq_data_dir(req) == READ), + delta_us); + } + } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5248,6 +5276,54 @@ int ufshcd_shutdown(struct ufs_hba *hba) } EXPORT_SYMBOL(ufshcd_shutdown); +/* + * Values permitted 0, 1, 2. + * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&hba->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + hba->latency_hist_enabled = value; + return count; +} + +ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return blk_latency_hist_show(&hba->io_lat_s, buf); +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +static void +ufshcd_init_latency_hist(struct ufs_hba *hba) +{ + if (device_create_file(hba->dev, &dev_attr_latency_hist)) + dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); +} + +static void +ufshcd_exit_latency_hist(struct ufs_hba *hba) +{ + device_create_file(hba->dev, &dev_attr_latency_hist); +} + /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5263,6 +5339,7 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5552,6 +5629,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); + ufshcd_init_latency_hist(hba); + /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5566,6 +5645,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 4a574aa458557a..241810c8309927 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -473,6 +473,9 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; + + int latency_hist_enabled; + struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. 
Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aac0f9ea952ad5..a5ceda29ab1b46 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -209,6 +209,9 @@ struct request { /* for bidi */ struct request *next_rq; + + ktime_t lat_hist_io_start; + int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1622,6 +1625,79 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); + +/* + * X-axis for IO latency histogram support. + */ +static const u_int64_t latency_x_axis_us[] = { + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1200, + 1400, + 1600, + 1800, + 2000, + 2500, + 3000, + 4000, + 5000, + 6000, + 7000, + 9000, + 10000 +}; + +#define BLK_IO_LAT_HIST_DISABLE 0 +#define BLK_IO_LAT_HIST_ENABLE 1 +#define BLK_IO_LAT_HIST_ZERO 2 + +struct io_latency_state { + u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_reads_elems; + u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_writes_elems; +}; + +static inline void +blk_update_latency_hist(struct io_latency_state *s, + int read, + u_int64_t delta_us) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { + if (delta_us < (u_int64_t)latency_x_axis_us[i]) { + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + break; + } + } + if (i == ARRAY_SIZE(latency_x_axis_us)) { + /* Overflowed the histogram */ + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + } + if (read) + s->latency_reads_elems++; + else + s->latency_writes_elems++; +} + +void blk_zero_latency_hist(struct io_latency_state *s); +ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index f206e29f94d72c..649ec17d48ae62 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -135,6 +135,8 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; + ktime_t io_start; + int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index bf2252a6fe3afd..d456d524ce65e6 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -384,6 +385,9 @@ struct mmc_host { } embedded_sdio_data; #endif + int latency_hist_enabled; + struct io_latency_state io_lat_s; + unsigned long private[0] ____cacheline_aligned; }; From 4c87fe2c5664cd4bc1f8392b9e8e1994aec73d64 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 28 Jan 2016 09:22:44 -0200 Subject: [PATCH 308/420] UPSTREAM: [media] xc2028: avoid use after free (cherry picked from commit 8dfbcc4351a0b6d2f2d77f367552f48ffefafe18) If struct xc2028_config is passed without a firmware name, the following trouble may happen: [11009.907205] xc2028 5-0061: type set to XCeive xc2028/xc3028 tuner [11009.907491] ================================================================== [11009.907750] BUG: KASAN: use-after-free in strcmp+0x96/0xb0 at addr ffff8803bd78ab40 [11009.907992] Read of size 1 by task modprobe/28992 [11009.907994] 
============================================================================= [11009.907997] BUG kmalloc-16 (Tainted: G W ): kasan: bad access detected [11009.907999] ----------------------------------------------------------------------------- [11009.908008] INFO: Allocated in xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] age=0 cpu=3 pid=28992 [11009.908012] ___slab_alloc+0x581/0x5b0 [11009.908014] __slab_alloc+0x51/0x90 [11009.908017] __kmalloc+0x27b/0x350 [11009.908022] xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] [11009.908026] usb_hcd_submit_urb+0x1e8/0x1c60 [11009.908029] usb_submit_urb+0xb0e/0x1200 [11009.908032] usb_serial_generic_write_start+0xb6/0x4c0 [11009.908035] usb_serial_generic_write+0x92/0xc0 [11009.908039] usb_console_write+0x38a/0x560 [11009.908045] call_console_drivers.constprop.14+0x1ee/0x2c0 [11009.908051] console_unlock+0x40d/0x900 [11009.908056] vprintk_emit+0x4b4/0x830 [11009.908061] vprintk_default+0x1f/0x30 [11009.908064] printk+0x99/0xb5 [11009.908067] kasan_report_error+0x10a/0x550 [11009.908070] __asan_report_load1_noabort+0x43/0x50 [11009.908074] INFO: Freed in xc2028_set_config+0x90/0x630 [tuner_xc2028] age=1 cpu=3 pid=28992 [11009.908077] __slab_free+0x2ec/0x460 [11009.908080] kfree+0x266/0x280 [11009.908083] xc2028_set_config+0x90/0x630 [tuner_xc2028] [11009.908086] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908090] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908094] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908098] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908101] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908105] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908108] do_one_initcall+0x141/0x300 [11009.908111] do_init_module+0x1d0/0x5ad [11009.908114] load_module+0x6666/0x9ba0 [11009.908117] SyS_finit_module+0x108/0x130 [11009.908120] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908123] INFO: Slab 0xffffea000ef5e280 objects=25 used=25 fp=0x (null) flags=0x2ffff8000004080 [11009.908126] INFO: Object 0xffff8803bd78ab40 @offset=2880 fp=0x0000000000000001 [11009.908130] Bytes b4 ffff8803bd78ab30: 01 00 00 00 2a 07 00 00 9d 28 00 00 01 00 00 00 ....*....(...... [11009.908133] Object ffff8803bd78ab40: 01 00 00 00 00 00 00 00 b0 1d c3 6a 00 88 ff ff ...........j.... [11009.908137] CPU: 3 PID: 28992 Comm: modprobe Tainted: G B W 4.5.0-rc1+ #43 [11009.908140] Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0350.2015.0812.1722 08/12/2015 [11009.908142] ffff8803bd78a000 ffff8802c273f1b8 ffffffff81932007 ffff8803c6407a80 [11009.908148] ffff8802c273f1e8 ffffffff81556759 ffff8803c6407a80 ffffea000ef5e280 [11009.908153] ffff8803bd78ab40 dffffc0000000000 ffff8802c273f210 ffffffff8155ccb4 [11009.908158] Call Trace: [11009.908162] [] dump_stack+0x4b/0x64 [11009.908165] [] print_trailer+0xf9/0x150 [11009.908168] [] object_err+0x34/0x40 [11009.908171] [] kasan_report_error+0x230/0x550 [11009.908175] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908179] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908182] [] __asan_report_load1_noabort+0x43/0x50 [11009.908185] [] ? __asan_register_globals+0x50/0xa0 [11009.908189] [] ? strcmp+0x96/0xb0 [11009.908192] [] strcmp+0x96/0xb0 [11009.908196] [] xc2028_set_config+0x15c/0x630 [tuner_xc2028] [11009.908200] [] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908203] [] ? memset+0x28/0x30 [11009.908206] [] ? xc2028_set_config+0x630/0x630 [tuner_xc2028] [11009.908211] [] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908215] [] ? 
em28xx_dvb_init.part.3+0x37c/0x5cf4 [em28xx_dvb] [11009.908219] [] ? hauppauge_hvr930c_init+0x487/0x487 [em28xx_dvb] [11009.908222] [] ? lgdt330x_attach+0x1cc/0x370 [lgdt330x] [11009.908226] [] ? i2c_read_demod_bytes.isra.2+0x210/0x210 [lgdt330x] [11009.908230] [] ? ref_module.part.15+0x10/0x10 [11009.908233] [] ? module_assert_mutex_or_preempt+0x80/0x80 [11009.908238] [] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908242] [] ? em28xx_attach_xc3028.constprop.7+0x30d/0x30d [em28xx_dvb] [11009.908245] [] ? string+0x14d/0x1f0 [11009.908249] [] ? symbol_string+0xff/0x1a0 [11009.908253] [] ? uuid_string+0x6f0/0x6f0 [11009.908257] [] ? __kernel_text_address+0x7e/0xa0 [11009.908260] [] ? print_context_stack+0x7f/0xf0 [11009.908264] [] ? __module_address+0xb6/0x360 [11009.908268] [] ? is_ftrace_trampoline+0x99/0xe0 [11009.908271] [] ? __kernel_text_address+0x7e/0xa0 [11009.908275] [] ? debug_check_no_locks_freed+0x290/0x290 [11009.908278] [] ? dump_trace+0x11b/0x300 [11009.908282] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908285] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908289] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908292] [] ? trace_hardirqs_on+0xd/0x10 [11009.908296] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908299] [] ? mutex_trylock+0x400/0x400 [11009.908302] [] ? do_one_initcall+0x131/0x300 [11009.908306] [] ? call_rcu_sched+0x17/0x20 [11009.908309] [] ? put_object+0x48/0x70 [11009.908314] [] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908317] [] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908320] [] ? 0xffffffffa0150000 [11009.908324] [] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908327] [] do_one_initcall+0x141/0x300 [11009.908330] [] ? try_to_run_init_process+0x40/0x40 [11009.908333] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908337] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908340] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908343] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908346] [] ? __asan_register_globals+0x87/0xa0 [11009.908350] [] do_init_module+0x1d0/0x5ad [11009.908353] [] load_module+0x6666/0x9ba0 [11009.908356] [] ? symbol_put_addr+0x50/0x50 [11009.908361] [] ? em28xx_dvb_init.part.3+0x5989/0x5cf4 [em28xx_dvb] [11009.908366] [] ? module_frob_arch_sections+0x20/0x20 [11009.908369] [] ? open_exec+0x50/0x50 [11009.908374] [] ? ns_capable+0x5b/0xd0 [11009.908377] [] SyS_finit_module+0x108/0x130 [11009.908379] [] ? SyS_init_module+0x1f0/0x1f0 [11009.908383] [] ? lockdep_sys_exit_thunk+0x12/0x14 [11009.908394] [] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908396] Memory state around the buggy address: [11009.908398] ffff8803bd78aa00: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908401] ffff8803bd78aa80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908403] >ffff8803bd78ab00: fc fc fc fc fc fc fc fc 00 00 fc fc fc fc fc fc [11009.908405] ^ [11009.908407] ffff8803bd78ab80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908409] ffff8803bd78ac00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908411] ================================================================== In order to avoid it, let's set the cached value of the firmware name to NULL after freeing it. While here, return an error if the memory allocation fails. 
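The bug class being fixed reduces to a small, self-contained sketch (plain user-space C, illustrative only; the names here are hypothetical and not taken from the tuner driver):

    #include <stdlib.h>
    #include <string.h>

    struct cfg {
        char *fname;    /* cached firmware name, reused on later calls */
    };

    /* Reconfigure: free the old name, optionally cache a new one. */
    static int set_config(struct cfg *cfg, const char *new_fname)
    {
        free(cfg->fname);
        cfg->fname = NULL;    /* without this, cfg->fname dangles */

        if (new_fname) {
            cfg->fname = strdup(new_fname);
            if (!cfg->fname)
                return -1;    /* report the allocation failure */
        }
        return 0;
    }

    int main(void)
    {
        struct cfg cfg = { .fname = strdup("firmware-A.fw") };

        set_config(&cfg, "firmware-B.fw"); /* old name freed, new one cached */
        set_config(&cfg, NULL);            /* freed again, left as NULL */
        free(cfg.fname);
        return 0;
    }

The point is that no stale pointer survives the reconfiguration, so a later reader such as the strcmp() flagged by KASAN above sees either a valid copy or a NULL it can test for, never freed memory.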
Signed-off-by: Mauro Carvalho Chehab Change-Id: I945c841dcfb45de2056267e4aa50bbe176b527cf Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index d12f5e4ad8bf42..36c7f2a077b48f 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1403,11 +1403,12 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) * in order to avoid troubles during device release. */ kfree(priv->ctrl.fname); + priv->ctrl.fname = NULL; memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); if (priv->ctrl.fname == NULL) - rc = -ENOMEM; + return -ENOMEM; } /* From b2c41d4849a5befc69699db5ea6b924f54c46df2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Feb 2016 13:34:00 -0200 Subject: [PATCH 309/420] UPSTREAM: [media] xc2028: unlock on error in xc2028_set_config() (cherry picked from commit 210bd104c6acd31c3c6b8b075b3f12d4a9f6b60d) We have to unlock before returning -ENOMEM. Fixes: 8dfbcc4351a0 ('[media] xc2028: avoid use after free') Signed-off-by: Dan Carpenter Signed-off-by: Mauro Carvalho Chehab Change-Id: I7b6ba9fde5c6e29467e6de23d398af2fe56e2547 Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 36c7f2a077b48f..0b54ec2d6eeda6 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1407,8 +1407,10 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); - if (priv->ctrl.fname == NULL) - return -ENOMEM; + if (priv->ctrl.fname == NULL) { + rc = -ENOMEM; + goto unlock; + } } /* @@ -1440,6 +1442,7 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) } else priv->state = XC2028_WAITING_FIRMWARE; } +unlock: mutex_unlock(&priv->lock); return rc; From f8af3641cf7c22cd07771761f609dece21cf788d Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 5 May 2016 16:22:26 -0700 Subject: [PATCH 310/420] UPSTREAM: proc: prevent accessing /proc//environ until it's ready (cherry picked from commit 8148a73c9901a8794a50f950083c00ccf97d43b3) If /proc//environ gets read before the envp[] array is fully set up in create_{aout,elf,elf_fdpic,flat}_tables(), we might end up trying to read more bytes than are actually written, as env_start will already be set but env_end will still be zero, making the range calculation underflow, allowing to read beyond the end of what has been written. Fix this as it is done for /proc//cmdline by testing env_end for zero. It is, apparently, intentionally set last in create_*_tables(). This bug was found by the PaX size_overflow plugin that detected the arithmetic underflow of 'this_len = env_end - (env_start + src)' when env_end is still zero. The expected consequence is that userland trying to access /proc//environ of a not yet fully set up process may get inconsistent data as we're in the middle of copying in the environment variables. 
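The arithmetic hazard is easy to reproduce in isolation (a minimal user-space sketch of the underflow described above; the addresses are made up):

    #include <stdio.h>

    int main(void)
    {
        /* Mimic the race window: env_start already published, env_end not yet. */
        unsigned long env_start = 0x7ffc9000, env_end = 0, src = 0;

        /* The unsigned subtraction wraps instead of going negative ... */
        unsigned long this_len = env_end - (env_start + src);
        printf("this_len = %lu\n", this_len);  /* enormous bogus length */

        /* ... so the read must be refused until env_end is non-zero. */
        if (!env_end)
            return 0;
        return 0;
    }

Testing env_end for zero before doing any length arithmetic, as environ_read() now does, closes the window.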
Fixes: https://forums.grsecurity.net/viewtopic.php?f=3&t=4363 Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=116461 Signed-off-by: Mathias Krause Cc: Emese Revfy Cc: Pax Team Cc: Al Viro Cc: Mateusz Guzik Cc: Alexey Dobriyan Cc: Cyrill Gorcunov Cc: Jarod Wilson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: Ia2f58d48c15478ed4b6e237b63e704c70ff21e96 Bug: 30951939 --- fs/proc/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 1c952a094ccc09..9c9e25ae9702ff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -752,7 +752,8 @@ static ssize_t environ_read(struct file *file, char __user *buf, int ret = 0; struct mm_struct *mm = file->private_data; - if (!mm) + /* Ensure the process spawned far enough to have an environment. */ + if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_TEMPORARY); From 203aea27ebc9db14a7b2e984beb4406b62e77753 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 24 Aug 2016 11:02:29 +0100 Subject: [PATCH 311/420] FIXUP: sched/tune: add fixes missing from a previous patch The previous patch: e7ce26f - FIXUP: sched/tune: fix accounting for runnable tasks squashed together patches of a series to fix SchedTune's accounting issues. However, in the consolidation and cleanup of the series to merge in the Android Common Kernel, we somehow missed a couple of important changes: 1) the schedtune_exit function is no longer required, because e7ce26f fixes accounting of exiting tasks in a different way 2) the schedtune_initialized flag was not set at the end of schedtune_init_cgroups(), thus failing to enable SchedTune at boot. This patch is thus to be considered an integration of e7ce26f. Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index e795c828a72818..a41d32934fe5af 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -717,21 +717,9 @@ schedtune_css_free(struct cgroup_subsys_state *css) kfree(st); } -static void -schedtune_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *tsk) -{ - struct schedtune *old_st = css_st(old_css); - int cpu = task_cpu(tsk); - - schedtune_tasks_update(tsk, cpu, old_st->idx, -1); -} - struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, - .exit = schedtune_exit, .allow_attach = schedtune_allow_attach, .can_attach = schedtune_can_attach, .cancel_attach = schedtune_cancel_attach, @@ -753,6 +741,8 @@ schedtune_init_cgroups(void) pr_info("schedtune: configured to support %d boost groups\n", BOOSTGROUPS_COUNT); + + schedtune_initialized = true; } #else /* CONFIG_CGROUP_SCHEDTUNE */ From 65ae3094e1c33dc0764ac11953dfbcdcdfc622d4 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 24 Aug 2016 11:27:27 +0100 Subject: [PATCH 312/420] FIXUP: sched/tune: update accounting before CPU capacity The SchedTune tasks accounting is used to identify how many tasks are in a boostgroup and thus to bias the selection of an OPP based on the maximum boost value of the active boostgroups. The current implementation, however, updates the accounting after the CPU capacity has been updated.
This has two effects: a) when we enqueue a boosted task, we do not immediately boost its CPU; b) when we dequeue a boosted task, we can keep a CPU boosted even when that is no longer required. This patch changes the order of the SchedTune accounting and the SchedFreq updates to ensure that we always have an up-to-date view of which boosted tasks are runnable on a CPU before updating its capacity. Reported-by: Leo Yan Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 491ef61a6dd5d9..f739978a0e3ce5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4072,6 +4072,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); + if (!se) { walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && @@ -4091,9 +4110,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_capacity_of(cpu_of(rq)); } - /* Update SchedTune accouting */ - schedtune_enqueue_task(p, cpu_of(rq)); - #endif /* CONFIG_SMP */ hrtick_update(rq); } @@ -4159,6 +4175,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + */ + schedtune_dequeue_task(p, cpu_of(rq)); + if (!se) { walt_dec_cumulative_runnable_avg(rq, p); @@ -4178,9 +4203,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } } - /* Update SchedTune accouting */ - schedtune_dequeue_task(p, cpu_of(rq)); - #endif /* CONFIG_SMP */ hrtick_update(rq); From 48de8fab49b4dadd76e0954ed6bbab3bcb706a0e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 313/420] ext4 crypto: add ext4_mpage_readpages() This takes code from fs/mpage.c and optimizes it for ext4. Its primary purpose is to allow us to more easily add encryption to ext4's read path in an efficient manner.
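The page-to-block arithmetic the new function is built around can be sanity-checked in isolation (a throwaway user-space sketch; the 1 KiB block size is only an example, not a value from the patch):

    #include <stdio.h>

    int main(void)
    {
        const unsigned page_shift = 12;  /* 4 KiB pages (PAGE_CACHE_SHIFT) */
        const unsigned blkbits = 10;     /* example: 1 KiB filesystem blocks */
        const unsigned blocks_per_page = 1u << (page_shift - blkbits);
        unsigned long page_index = 3;    /* arbitrary page within the file */
        unsigned long block_in_file = page_index << (page_shift - blkbits);

        /* Page 3 covers file blocks 12..15 when blocks_per_page == 4. */
        printf("blocks_per_page = %u\n", blocks_per_page);
        printf("page %lu starts at block %lu\n", page_index, block_in_file);
        return 0;
    }

ext4_mpage_readpages() fills blocks[0..blocks_per_page-1] with the physical block numbers for each page and only falls back to block_read_full_page() when that run is not contiguous, when the page already has buffers, or when a hole is followed by mapped blocks.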
Change-Id: I1fd07e78fbbff50fd4028bbffbee73dbaec546a1 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 2 +- fs/ext4/ext4.h | 4 + fs/ext4/inode.c | 4 +- fs/ext4/readpage.c | 264 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 271 insertions(+), 3 deletions(-) create mode 100644 fs/ext4/readpage.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 0310fec2ee3dbd..cd6f50fce27817 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o + xattr_trusted.o inline.o readpage.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a2f44223d34dd1..8fe1b0f8836abd 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2689,6 +2689,10 @@ static inline void ext4_set_de_type(struct super_block *sb, de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } +/* readpages.c */ +extern int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3356ab5395f469..31f7db99ac9ef2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2821,7 +2821,7 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return mpage_readpage(page, ext4_get_block); + return ext4_mpage_readpages(page->mapping, NULL, page, 1); return ret; } @@ -2836,7 +2836,7 @@ ext4_readpages(struct file *file, struct address_space *mapping, if (ext4_has_inline_data(inode)) return 0; - return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); + return ext4_mpage_readpages(mapping, pages, NULL, nr_pages); } static void ext4_invalidatepage(struct page *page, unsigned int offset, diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c new file mode 100644 index 00000000000000..fff9fe6aacf85a --- /dev/null +++ b/fs/ext4/readpage.c @@ -0,0 +1,264 @@ +/* + * linux/fs/ext4/readpage.c + * + * Copyright (C) 2002, Linus Torvalds. + * Copyright (C) 2015, Google, Inc. + * + * This was originally taken from fs/mpage.c + * + * The intent is the ext4_mpage_readpages() function here is intended + * to replace mpage_readpages() in the general case, not just for + * encrypted files. It has some limitations (see below), where it + * will fall back to read_block_full_page(), but these limitations + * should only be hit when page_size != block_size. + * + * This will allow us to attach a callback function to support ext4 + * encryption. + * + * If anything unusual happens, such as: + * + * - encountering a page which has buffers + * - encountering a page which has a non-hole after a hole + * - encountering a page with non-contiguous blocks + * + * then this code just gives up and calls the buffer_head-based read function. + * It does handle a page which has holes at the end - that is a common case: + * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4.h" + +/* + * I/O completion handler for multipage BIOs. + * + * The mpage code never puts partial pages into a BIO (except for end-of-file). + * If a page does not map to a contiguous run of blocks then it simply falls + * back to block_read_full_page(). + * + * Why is this? If a page's completion depends on a number of different BIOs + * which can complete in any order (or at the same time) then determining the + * status of that page is hard. See end_buffer_async_read() for the details. + * There is no point in duplicating all that complexity. + */ +static void mpage_end_io(struct bio *bio, int err) +{ + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + + bio_put(bio); +} + +int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages) +{ + struct bio *bio = NULL; + unsigned page_idx; + sector_t last_block_in_bio = 0; + + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + struct block_device *bdev = inode->i_sb->s_bdev; + int length; + unsigned relative_block = 0; + struct ext4_map_blocks map; + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + + for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { + int fully_mapped = 1; + unsigned first_hole = blocks_per_page; + + prefetchw(&page->flags); + if (pages) { + page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) + goto next_page; + } + + if (page_has_buffers(page)) + goto confused; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = block_in_file + nr_pages * blocks_per_page; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + page_block = 0; + + /* + * Map blocks using the previous result first. + */ + if ((map.m_flags & EXT4_MAP_MAPPED) && + block_in_file > map.m_lblk && + block_in_file < (map.m_lblk + map.m_len)) { + unsigned map_offset = block_in_file - map.m_lblk; + unsigned last = map.m_len - map_offset; + + for (relative_block = 0; ; relative_block++) { + if (relative_block == last) { + /* needed? */ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } + if (page_block == blocks_per_page) + break; + blocks[page_block] = map.m_pblk + map_offset + + relative_block; + page_block++; + block_in_file++; + } + } + + /* + * Then do more ext4_map_blocks() calls until we are + * done with this page. 
+ */ + while (page_block < blocks_per_page) { + if (block_in_file < last_block) { + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; + + if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { + set_error_page: + SetPageError(page); + zero_user_segment(page, 0, + PAGE_CACHE_SIZE); + unlock_page(page); + goto next_page; + } + } + if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { + fully_mapped = 0; + if (first_hole == blocks_per_page) + first_hole = page_block; + page_block++; + block_in_file++; + continue; + } + if (first_hole != blocks_per_page) + goto confused; /* hole -> non-hole */ + + /* Contiguous blocks? */ + if (page_block && blocks[page_block-1] != map.m_pblk-1) + goto confused; + for (relative_block = 0; ; relative_block++) { + if (relative_block == map.m_len) { + /* needed? */ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } else if (page_block == blocks_per_page) + break; + blocks[page_block] = map.m_pblk+relative_block; + page_block++; + block_in_file++; + } + } + if (first_hole != blocks_per_page) { + zero_user_segment(page, first_hole << blkbits, + PAGE_CACHE_SIZE); + if (first_hole == 0) { + SetPageUptodate(page); + unlock_page(page); + goto next_page; + } + } else if (fully_mapped) { + SetPageMappedToDisk(page); + } + if (fully_mapped && blocks_per_page == 1 && + !PageUptodate(page) && cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (last_block_in_bio != blocks[0] - 1)) { + submit_and_realloc: + submit_bio(READ, bio); + bio = NULL; + } + if (bio == NULL) { + bio = bio_alloc(GFP_KERNEL, + min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + if (!bio) + goto set_error_page; + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); + bio->bi_end_io = mpage_end_io; + } + + length = first_hole << blkbits; + if (bio_add_page(bio, page, length, 0) < length) + goto submit_and_realloc; + + if (((map.m_flags & EXT4_MAP_BOUNDARY) && + (relative_block == map.m_len)) || + (first_hole != blocks_per_page)) { + submit_bio(READ, bio); + bio = NULL; + } else + last_block_in_bio = blocks[blocks_per_page - 1]; + goto next_page; + confused: + if (bio) { + submit_bio(READ, bio); + bio = NULL; + } + if (!PageUptodate(page)) + block_read_full_page(page, ext4_get_block); + else + unlock_page(page); + next_page: + if (pages) + page_cache_release(page); + } + BUG_ON(pages && !list_empty(pages)); + if (bio) + submit_bio(READ, bio); + return 0; +} From 31fe70ea8285e1db8bba79d9a9080c42b91e4b8b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 314/420] ext4 crypto: reserve codepoints used by the ext4 encryption feature Change-Id: I3cc7669ce5c2902bacf9ec365b1ba7049be781f0 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8fe1b0f8836abd..7a81310afddf8d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -373,7 +373,8 @@ struct flex_groups { #define EXT4_DIRTY_FL 0x00000100 #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ -#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ /* End compression flags --- maybe not all used */ #define EXT4_INDEX_FL 0x00001000 /* 
hash-indexed directory */ #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ @@ -426,11 +427,11 @@ enum { EXT4_INODE_APPEND = 5, /* writes to file may only append */ EXT4_INODE_NODUMP = 6, /* do not dump file */ EXT4_INODE_NOATIME = 7, /* do not update atime */ -/* Reserved for compression usage... */ +/* Reserved for compression usage, co-opted for encryption usage */ EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ - EXT4_INODE_ECOMPR = 11, /* Compression error */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ @@ -475,7 +476,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(DIRTY); CHECK_FLAG_VALUE(COMPRBLK); CHECK_FLAG_VALUE(NOCOMPR); - CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(ENCRYPT); CHECK_FLAG_VALUE(INDEX); CHECK_FLAG_VALUE(IMAGIC); CHECK_FLAG_VALUE(JOURNAL_DATA); @@ -592,6 +593,13 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +/* Encryption algorithms */ +#define EXT4_ENCRYPTION_MODE_INVALID 0 +#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 +#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 +#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 +#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4 + /* * ioctl commands */ @@ -1135,7 +1143,8 @@ struct ext4_super_block { __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_checksum_type; /* metadata checksum algorithm used */ - __le16 s_reserved_pad; + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ __le32 s_snapshot_id; /* sequential ID of active snapshot */ @@ -1161,7 +1170,10 @@ struct ext4_super_block { __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ - __le32 s_reserved[106]; /* Padding to the end of the block */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_reserved[100]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1541,6 +1553,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ From 6e8b0597dfc43b9ce60a93b38dcbc97bb1d1aa72 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 315/420] ext4 crypto: add ext4 encryption Kconfig Change-Id: I784c5f57f031981e5d28796921f5e587d4f72422 Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/Kconfig | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig 
index efea5d5c44ce4e..18228c201f7f4c 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -64,6 +64,23 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. +config EXT4_FS_ENCRYPTION + bool "Ext4 Encryption" + depends on EXT4_FS + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of ext4 files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. + config EXT4_DEBUG bool "EXT4 debugging support" depends on EXT4_FS From 0c6cc2577b2aac9d051a19e570a0f1630ee21203 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 316/420] ext4 crypto: export ext4_empty_dir() Required for future encryption xattr changes. Change-Id: Ieaff30ae755d76f562c6c4b110bc0c1c59ea4dfd Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/namei.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7a81310afddf8d..bc1ef0683ddddb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2192,6 +2192,7 @@ extern int ext4_generic_delete_entry(handle_t *handle, void *entry_buf, int buf_size, int csum_size); +extern int ext4_empty_dir(struct inode *inode); /* resize.c */ extern int ext4_group_add(struct super_block *sb, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 426211882f7274..55dc956fac0254 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2450,7 +2450,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) /* * routine to check that the specified directory is empty (for rmdir) */ -static int empty_dir(struct inode *inode) +int ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; @@ -2718,7 +2718,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; retval = -ENOTEMPTY; - if (!empty_dir(inode)) + if (!ext4_empty_dir(inode)) goto end_rmdir; handle = ext4_journal_start(dir, EXT4_HT_DIR, @@ -3273,7 +3273,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old.inode->i_mode)) { if (new.inode) { retval = -ENOTEMPTY; - if (!empty_dir(new.inode)) + if (!ext4_empty_dir(new.inode)) goto end_rename; } else { retval = -EMLINK; @@ -3347,8 +3347,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_dec_count(handle, old.dir); if (new.inode) { - /* checked empty_dir above, can't have another parent, - * ext4_dec_count() won't work for many-linked dirs */ + /* checked ext4_empty_dir above, can't have another + * parent, ext4_dec_count() won't work for many-linked + * dirs */ clear_nlink(new.inode); } else { ext4_inc_count(handle, new.dir); From 2310bed5931debb201b99bba4b08d4f4f9145a63 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 317/420] ext4 crypto: add encryption xattr support Change-Id: I6325cdbfb9666cca194b462878c157bd0449302e Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 29bedf5589f6b2..ddc0957760ba00 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h 
@@ -23,6 +23,7 @@ #define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SYSTEM 7 #define EXT4_XATTR_INDEX_RICHACL 8 +#define EXT4_XATTR_INDEX_ENCRYPTION 9 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ @@ -98,6 +99,8 @@ extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; +#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" + extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); From 797e2ec0d31073cc0afeb517844c9e2810ff0f24 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 318/420] ext4 crypto: add encryption policy and password salt support Change-Id: Iaabf029d28db2bd27e492e4e1bf7adc034b066e8 Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: Ildar Muslukhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 1 + fs/ext4/crypto_policy.c | 167 ++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 15 ++++ fs/ext4/ext4_crypto.h | 49 ++++++++++++ fs/ext4/ioctl.c | 85 ++++++++++++++++++++ 5 files changed, 317 insertions(+) create mode 100644 fs/ext4/crypto_policy.c create mode 100644 fs/ext4/ext4_crypto.h diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index cd6f50fce27817..3886ee45f556b0 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -12,3 +12,4 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c new file mode 100644 index 00000000000000..532b69c0afab51 --- /dev/null +++ b/fs/ext4/crypto_policy.c @@ -0,0 +1,167 @@ +/* + * linux/fs/ext4/crypto_policy.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption policy functions for ext4 + * + * Written by Michael Halcrow, 2015. 
+ */ + +#include +#include +#include + +#include "ext4.h" +#include "xattr.h" + +static int ext4_inode_has_encryption_context(struct inode *inode) +{ + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0); + return (res > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int ext4_is_encryption_context_consistent_with_policy( + struct inode *inode, const struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx)); + if (res != sizeof(ctx)) + return 0; + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int ext4_create_encryption_context_from_policy( + struct inode *inode, const struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + int res = 0; + + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE); + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + + res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), 0); + if (!res) + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + return res; +} + +int ext4_process_policy(const struct ext4_encryption_policy *policy, + struct inode *inode) +{ + if (policy->version != 0) + return -EINVAL; + + if (!ext4_inode_has_encryption_context(inode)) { + if (!ext4_empty_dir(inode)) + return -ENOTEMPTY; + return ext4_create_encryption_context_from_policy(inode, + policy); + } + + if (ext4_is_encryption_context_consistent_with_policy(inode, policy)) + return 0; + + printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", + __func__); + return -EINVAL; +} + +int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return -ENOENT; + if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE); + return 0; +} + +int ext4_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child) +{ + struct ext4_encryption_context parent_ctx, child_ctx; + int res; + + if ((parent == NULL) || (child == NULL)) { + pr_err("parent %p child %p\n", parent, child); + BUG_ON(1); + } + /* no restrictions if the parent directory is not encrypted */ + if (!ext4_encrypted_inode(parent)) + return 1; + res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) + return 0; + /* if the child directory is not 
encrypted, this is always a problem */ + if (!ext4_encrypted_inode(child)) + return 0; + res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + return (memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode)); +} + +/** + * ext4_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * + * Return: Zero on success, non-zero otherwise + */ +int ext4_inherit_context(struct inode *parent, struct inode *child) +{ + struct ext4_encryption_context ctx; + int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + + if (res != sizeof(ctx)) + return -ENOENT; + + get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), 0); + if (!res) + ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); + return res; +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bc1ef0683ddddb..865be1e013f452 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -600,6 +600,8 @@ enum { #define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 #define EXT4_ENCRYPTION_MODE_AES_256_CTS 4 +#include "ext4_crypto.h" + /* * ioctl commands */ @@ -621,6 +623,9 @@ enum { #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy) +#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -2003,6 +2008,16 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); +/* crypto_policy.c */ +int ext4_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child); +int ext4_inherit_context(struct inode *parent, struct inode *child); +void ext4_to_hex(char *dst, char *src, size_t src_size); +int ext4_process_policy(const struct ext4_encryption_policy *policy, + struct inode *inode); +int ext4_get_policy(struct inode *inode, + struct ext4_encryption_policy *policy); + /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h new file mode 100644 index 00000000000000..a69d2ba54bee7e --- /dev/null +++ b/fs/ext4/ext4_crypto.h @@ -0,0 +1,49 @@ +/* + * linux/fs/ext4/ext4_crypto.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption header content for ext4 + * + * Written by Michael Halcrow, 2015. 
+ */ + +#ifndef _EXT4_CRYPTO_H +#define _EXT4_CRYPTO_H + +#include + +#define EXT4_KEY_DESCRIPTOR_SIZE 8 + +/* Policy provided via an ioctl on the topmost directory */ +struct ext4_encryption_policy { + char version; + char contents_encryption_mode; + char filenames_encryption_mode; + char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; +} __attribute__((__packed__)); + +#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1 +#define EXT4_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Reserved + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct ext4_encryption_context { + char format; + char contents_encryption_mode; + char filenames_encryption_mode; + char reserved; + char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; + char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; +} __attribute__((__packed__)); + +#endif /* _EXT4_CRYPTO_H */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e75462a41d17c6..21d077c028fecc 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "ext4_jbd2.h" #include "ext4.h" @@ -198,6 +199,16 @@ static long swap_inode_boot_loader(struct super_block *sb, return err; } +static int uuid_is_zero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return 0; + return 1; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -621,7 +632,78 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); + case EXT4_IOC_SET_ENCRYPTION_POLICY: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_encryption_policy policy; + int err = 0; + + if (copy_from_user(&policy, + (struct ext4_encryption_policy __user *)arg, + sizeof(policy))) { + err = -EFAULT; + goto encryption_policy_out; + } + err = ext4_process_policy(&policy, inode); +encryption_policy_out: + return err; +#else + return -EOPNOTSUPP; +#endif + } + case EXT4_IOC_GET_ENCRYPTION_PWSALT: { + int err, err2; + struct ext4_sb_info *sbi = EXT4_SB(sb); + handle_t *handle; + + if (!ext4_sb_has_crypto(sb)) + return -EOPNOTSUPP; + if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { + err = mnt_want_write_file(filp); + if (err) + return err; + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto pwsalt_err_exit; + } + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto pwsalt_err_journal; + generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); + err = ext4_handle_dirty_metadata(handle, NULL, + sbi->s_sbh); + pwsalt_err_journal: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + pwsalt_err_exit: + mnt_drop_write_file(filp); + if (err) + return err; + } + if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt, + 16)) + return -EFAULT; + return 0; + } + case EXT4_IOC_GET_ENCRYPTION_POLICY: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_encryption_policy policy; + int err = 0; + + if (!ext4_encrypted_inode(inode)) + return -ENOENT; + err = ext4_get_policy(inode, &policy); + if (err) + return err; + if (copy_to_user((void *)arg, &policy, sizeof(policy))) + return -EFAULT; + return 0; +#else + return -EOPNOTSUPP; +#endif + } default: return -ENOTTY; } @@ -686,6 +768,9 @@ long 
ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FITRIM: case EXT4_IOC_RESIZE_FS: case EXT4_IOC_PRECACHE_EXTENTS: + case EXT4_IOC_SET_ENCRYPTION_POLICY: + case EXT4_IOC_GET_ENCRYPTION_PWSALT: + case EXT4_IOC_GET_ENCRYPTION_POLICY: break; default: return -ENOIOCTLCMD; From bd425361b418a9170ad276124506109f1117b01d Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 319/420] ext4 crypto: add ext4 encryption facilities On encrypt, we will re-assign the buffer_heads to point to a bounce page rather than the control_page (which is the original page to write that contains the plaintext). The block I/O occurs against the bounce page. On write completion, we re-assign the buffer_heads to the original plaintext page. On decrypt, we will attach a read completion callback to the bio struct. This read completion will decrypt the read contents in-place prior to setting the page up-to-date. The current encryption mode, AES-256-XTS, lacks cryptographic integrity. AES-256-GCM is in-plan, but we will need to devise a mechanism for handling the integrity data. Change-Id: Icf3814a88aed38f24bf615663f9921f5c390fb32 Signed-off-by: Michael Halcrow Signed-off-by: Ildar Muslukhov Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 2 +- fs/ext4/crypto.c | 558 ++++++++++++++++++++++++++++++++++++++++ fs/ext4/crypto_policy.c | 8 + fs/ext4/ext4.h | 52 ++++ fs/ext4/ext4_crypto.h | 55 ++++ fs/ext4/super.c | 8 + 6 files changed, 682 insertions(+), 1 deletion(-) create mode 100644 fs/ext4/crypto.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 3886ee45f556b0..1b1c5619523d3e 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -12,4 +12,4 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o +ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c new file mode 100644 index 00000000000000..8ff15273ab0cc0 --- /dev/null +++ b/fs/ext4/crypto.c @@ -0,0 +1,558 @@ +/* + * linux/fs/ext4/crypto.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption functions for ext4 + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_extents.h" +#include "xattr.h" + +/* Encryption added and removed here! 
(L: */ + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *ext4_bounce_page_pool; + +static LIST_HEAD(ext4_free_crypto_ctxs); +static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); + +/** + * ext4_release_crypto_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. + */ +void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) +{ + unsigned long flags; + + if (ctx->bounce_page) { + if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) + __free_page(ctx->bounce_page); + else + mempool_free(ctx->bounce_page, ext4_bounce_page_pool); + ctx->bounce_page = NULL; + } + ctx->control_page = NULL; + if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { + if (ctx->tfm) + crypto_free_tfm(ctx->tfm); + kfree(ctx); + } else { + spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); + list_add(&ctx->free_list, &ext4_free_crypto_ctxs); + spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); + } +} + +/** + * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context + * @mask: The allocation mask. + * + * Return: An allocated and initialized encryption context on success. An error + * value or NULL otherwise. + */ +static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask) +{ + struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx), + mask); + + if (!ctx) + return ERR_PTR(-ENOMEM); + return ctx; +} + +/** + * ext4_get_crypto_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) +{ + struct ext4_crypto_ctx *ctx = NULL; + int res = 0; + unsigned long flags; + struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key; + + if (!ext4_read_workqueue) + ext4_init_crypto(); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. + */ + spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); + ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs, + struct ext4_crypto_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); + if (!ctx) { + ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + + /* Allocate a new Crypto API context if we don't already have + * one or if it isn't the right mode. 
*/ + BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID); + if (ctx->tfm && (ctx->mode != key->mode)) { + crypto_free_tfm(ctx->tfm); + ctx->tfm = NULL; + ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; + } + if (!ctx->tfm) { + switch (key->mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + ctx->tfm = crypto_ablkcipher_tfm( + crypto_alloc_ablkcipher("xts(aes)", 0, 0)); + break; + case EXT4_ENCRYPTION_MODE_AES_256_GCM: + /* TODO(mhalcrow): AEAD w/ gcm(aes); + * crypto_aead_setauthsize() */ + ctx->tfm = ERR_PTR(-ENOTSUPP); + break; + default: + BUG(); + } + if (IS_ERR_OR_NULL(ctx->tfm)) { + res = PTR_ERR(ctx->tfm); + ctx->tfm = NULL; + goto out; + } + ctx->mode = key->mode; + } + BUG_ON(key->size != ext4_encryption_key_size(key->mode)); + + /* There shouldn't be a bounce page attached to the crypto + * context at this point. */ + BUG_ON(ctx->bounce_page); + +out: + if (res) { + if (!IS_ERR_OR_NULL(ctx)) + ext4_release_crypto_ctx(ctx); + ctx = ERR_PTR(res); + } + return ctx; +} + +struct workqueue_struct *ext4_read_workqueue; +static DEFINE_MUTEX(crypto_init); + +/** + * ext4_exit_crypto() - Shutdown the ext4 encryption system + */ +void ext4_exit_crypto(void) +{ + struct ext4_crypto_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) { + if (pos->bounce_page) { + if (pos->flags & + EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) { + __free_page(pos->bounce_page); + } else { + mempool_free(pos->bounce_page, + ext4_bounce_page_pool); + } + } + if (pos->tfm) + crypto_free_tfm(pos->tfm); + kfree(pos); + } + INIT_LIST_HEAD(&ext4_free_crypto_ctxs); + if (ext4_bounce_page_pool) + mempool_destroy(ext4_bounce_page_pool); + ext4_bounce_page_pool = NULL; + if (ext4_read_workqueue) + destroy_workqueue(ext4_read_workqueue); + ext4_read_workqueue = NULL; +} + +/** + * ext4_init_crypto() - Set up for ext4 encryption. + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. 
+ */ +int ext4_init_crypto(void) +{ + int i, res; + + mutex_lock(&crypto_init); + if (ext4_read_workqueue) + goto already_initialized; + ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); + if (!ext4_read_workqueue) { + res = -ENOMEM; + goto fail; + } + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct ext4_crypto_ctx *ctx; + + ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto fail; + } + list_add(&ctx->free_list, &ext4_free_crypto_ctxs); + } + + ext4_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!ext4_bounce_page_pool) { + res = -ENOMEM; + goto fail; + } +already_initialized: + mutex_unlock(&crypto_init); + return 0; +fail: + ext4_exit_crypto(); + mutex_unlock(&crypto_init); + return res; +} + +void ext4_restore_control_page(struct page *data_page) +{ + struct ext4_crypto_ctx *ctx = + (struct ext4_crypto_ctx *)page_private(data_page); + + set_page_private(data_page, (unsigned long)NULL); + ClearPagePrivate(data_page); + unlock_page(data_page); + ext4_release_crypto_ctx(ctx); +} + +/** + * ext4_crypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void ext4_crypt_complete(struct crypto_async_request *req, int res) +{ + struct ext4_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + EXT4_DECRYPT = 0, + EXT4_ENCRYPT, +} ext4_direction_t; + +static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, + struct inode *inode, + ext4_direction_t rw, + pgoff_t index, + struct page *src_page, + struct page *dest_page) + +{ + u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct ext4_inode_info *ei = EXT4_I(inode); + struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm); + int res = 0; + + BUG_ON(!ctx->tfm); + BUG_ON(ctx->mode != ei->i_encryption_key.mode); + + if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { + printk_ratelimited(KERN_ERR + "%s: unsupported crypto algorithm: %d\n", + __func__, ctx->mode); + return -ENOTSUPP; + } + + crypto_ablkcipher_clear_flags(atfm, ~0); + crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); + + res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw, + ei->i_encryption_key.size); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_ablkcipher_setkey() failed\n", + __func__); + return res; + } + req = ablkcipher_request_alloc(atfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_crypt_complete, &ecr); + + BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + EXT4_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); + ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + xts_tweak); + if (rw == EXT4_DECRYPT) + res = crypto_ablkcipher_decrypt(req); + else + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + 
wait_for_completion(&ecr.completion); + res = ecr.res; + } + ablkcipher_request_free(req); + if (res) { + printk_ratelimited( + KERN_ERR + "%s: crypto_ablkcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +/** + * ext4_encrypt() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * ext4_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. + */ +struct page *ext4_encrypt(struct inode *inode, + struct page *plaintext_page) +{ + struct ext4_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return (struct page *) ctx; + + /* The encryption operation will require a bounce page. */ + ciphertext_page = alloc_page(GFP_NOFS); + if (!ciphertext_page) { + /* This is a potential bottleneck, but at least we'll have + * forward progress. */ + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS); + if (WARN_ON_ONCE(!ciphertext_page)) { + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS | __GFP_WAIT); + } + ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->bounce_page = ciphertext_page; + ctx->control_page = plaintext_page; + err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page); + if (err) { + ext4_release_crypto_ctx(ctx); + return ERR_PTR(err); + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; +} + +/** + * ext4_decrypt() - Decrypts a page in-place + * @ctx: The encryption context. + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return ext4_page_crypto(ctx, page->mapping->host, + EXT4_DECRYPT, page->index, page, page); +} + +/* + * Convenience function which takes care of allocating and + * deallocating the encryption context + */ +int ext4_decrypt_one(struct inode *inode, struct page *page) +{ + int ret; + + struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); + + if (!ctx) + return -ENOMEM; + ret = ext4_decrypt(ctx, page); + ext4_release_crypto_ctx(ctx); + return ret; +} + +int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + struct ext4_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + ext4_lblk_t lblk = ex->ee_block; + ext4_fsblk_t pblk = ext4_ext_pblock(ex); + unsigned int len = ext4_ext_get_actual_len(ex); + int err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); + + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = alloc_page(GFP_NOFS); + if (!ciphertext_page) { + /* This is a potential bottleneck, but at least we'll have + * forward progress. 
*/ + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS); + if (WARN_ON_ONCE(!ciphertext_page)) { + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS | __GFP_WAIT); + } + ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->bounce_page = ciphertext_page; + + while (len--) { + err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page); + if (err) + goto errout; + + bio = bio_alloc(GFP_KERNEL, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = pblk; + err = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (err) { + bio_put(bio); + goto errout; + } + err = submit_bio_wait(WRITE, bio); + if (err) + goto errout; + } + err = 0; +errout: + ext4_release_crypto_ctx(ctx); + return err; +} + +bool ext4_valid_contents_enc_mode(uint32_t mode) +{ + return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS); +} + +/** + * ext4_validate_encryption_key_size() - Validate the encryption key size + * @mode: The key mode. + * @size: The key size to validate. + * + * Return: The validated key size for @mode. Zero if invalid. + */ +uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) +{ + if (size == ext4_encryption_key_size(mode)) + return size; + return 0; +} diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 532b69c0afab51..a4bf762b3ba946 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -52,6 +52,13 @@ static int ext4_create_encryption_context_from_policy( ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE); + if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + res = -EINVAL; + goto out; + } ctx.contents_encryption_mode = policy->contents_encryption_mode; ctx.filenames_encryption_mode = policy->filenames_encryption_mode; BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); @@ -60,6 +67,7 @@ static int ext4_create_encryption_context_from_policy( res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); +out: if (!res) ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); return res; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 865be1e013f452..f6d85272288893 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -956,6 +956,11 @@ struct ext4_inode_info { /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Encryption params */ + struct ext4_encryption_key i_encryption_key; +#endif }; /* @@ -1359,6 +1364,12 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Encryption */ + uint32_t s_file_encryption_mode; + uint32_t s_dir_encryption_mode; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1474,6 +1485,18 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_SB(sb) (sb) #endif +/* + * Returns true if the inode is inode is encrypted + */ +static inline int ext4_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + return 
ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); +#else + return 0; +#endif +} + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* @@ -2018,6 +2041,35 @@ int ext4_process_policy(const struct ext4_encryption_policy *policy, int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy); +/* crypto.c */ +bool ext4_valid_contents_enc_mode(uint32_t mode); +uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); +extern struct workqueue_struct *ext4_read_workqueue; +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode); +void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); +void ext4_restore_control_page(struct page *data_page); +struct page *ext4_encrypt(struct inode *inode, + struct page *plaintext_page); +int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page); +int ext4_decrypt_one(struct inode *inode, struct page *page); +int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +int ext4_init_crypto(void); +void ext4_exit_crypto(void); +static inline int ext4_sb_has_crypto(struct super_block *sb) +{ + return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); +} +#else +static inline int ext4_init_crypto(void) { return 0; } +static inline void ext4_exit_crypto(void) { } +static inline int ext4_sb_has_crypto(struct super_block *sb) +{ + return 0; +} +#endif + /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index a69d2ba54bee7e..9d5d2e56cc4696 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -46,4 +46,59 @@ struct ext4_encryption_context { char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; } __attribute__((__packed__)); +/* Encryption parameters */ +#define EXT4_XTS_TWEAK_SIZE 16 +#define EXT4_AES_128_ECB_KEY_SIZE 16 +#define EXT4_AES_256_GCM_KEY_SIZE 32 +#define EXT4_AES_256_CBC_KEY_SIZE 32 +#define EXT4_AES_256_CTS_KEY_SIZE 32 +#define EXT4_AES_256_XTS_KEY_SIZE 64 +#define EXT4_MAX_KEY_SIZE 64 + +struct ext4_encryption_key { + uint32_t mode; + char raw[EXT4_MAX_KEY_SIZE]; + uint32_t size; +}; + +#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002 + +struct ext4_crypto_ctx { + struct crypto_tfm *tfm; /* Crypto API context */ + struct page *bounce_page; /* Ciphertext page on write path */ + struct page *control_page; /* Original page on write path */ + struct bio *bio; /* The bio for this context */ + struct work_struct work; /* Work queue for read complete path */ + struct list_head free_list; /* Free list */ + int flags; /* Flags */ + int mode; /* Encryption mode for tfm */ +}; + +struct ext4_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \ + struct ext4_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int ext4_encryption_key_size(int mode) +{ + switch (mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + return EXT4_AES_256_XTS_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_GCM: + return EXT4_AES_256_GCM_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_CBC: + return EXT4_AES_256_CBC_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_CTS: + return EXT4_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + #endif /* _EXT4_CRYPTO_H */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 03e0b5c8ec14f4..66757f0193e437 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ 
-903,6 +903,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID; +#endif return &ei->vfs_inode; } @@ -3435,6 +3438,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_bdev->bd_part) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, sectors[1]); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Modes of operations for file and directory encryption. */ + sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID; +#endif /* Cleanup superblock name */ for (cp = sb->s_id; (cp = strchr(cp, '/'));) From 7da3c79bfa9afb1b987f9ac1bb9b3cfcb228e244 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 320/420] ext4 crypto: add encryption key management facilities Change-Id: I4914284877331994b0d1f701bcbbcf820116e8ee Signed-off-by: Michael Halcrow Signed-off-by: Ildar Muslukhov Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 2 +- fs/ext4/crypto_key.c | 162 ++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 13 ++++ fs/ext4/ext4_crypto.h | 3 + 4 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 fs/ext4/crypto_key.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 1b1c5619523d3e..4e5af21f105065 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -12,4 +12,4 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o +ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o crypto_key.o diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c new file mode 100644 index 00000000000000..572bd97f58dd60 --- /dev/null +++ b/fs/ext4/crypto_key.c @@ -0,0 +1,162 @@ +/* + * linux/fs/ext4/crypto_key.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions for ext4 + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "xattr.h" + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct ext4_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * ext4_derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivatio. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. 
+ */ +static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], + char source_key[EXT4_AES_256_XTS_KEY_SIZE], + char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, + 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_ablkcipher_setkey(tfm, deriving_key, + EXT4_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, + EXT4_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + +out: + if (req) + ablkcipher_request_free(req); + if (tfm) + crypto_free_ablkcipher(tfm); + return res; +} + +/** + * ext4_generate_encryption_key() - generates an encryption key + * @inode: The inode to generate the encryption key for. + */ +int ext4_generate_encryption_key(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; + struct key *keyring_key = NULL; + struct ext4_encryption_key *master_key; + struct ext4_encryption_context ctx; + struct user_key_payload *ukp; + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + + if (res != sizeof(ctx)) { + if (res > 0) + res = -EINVAL; + goto out; + } + res = 0; + + memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, + EXT4_KEY_DESC_PREFIX_SIZE); + sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE, + "%*phN", EXT4_KEY_DESCRIPTOR_SIZE, + ctx.master_key_descriptor); + full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + + (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + if (IS_ERR(keyring_key)) { + res = PTR_ERR(keyring_key); + keyring_key = NULL; + goto out; + } + BUG_ON(keyring_key->type != &key_type_logon); + ukp = ((struct user_key_payload *)keyring_key->payload.data); + if (ukp->datalen != sizeof(struct ext4_encryption_key)) { + res = -EINVAL; + goto out; + } + master_key = (struct ext4_encryption_key *)ukp->data; + + if (S_ISREG(inode->i_mode)) + crypt_key->mode = ctx.contents_encryption_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + crypt_key->mode = ctx.filenames_encryption_mode; + else { + printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); + BUG(); + } + crypt_key->size = ext4_encryption_key_size(crypt_key->mode); + BUG_ON(!crypt_key->size); + BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != + EXT4_KEY_DERIVATION_NONCE_SIZE); + BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); + BUG_ON(crypt_key->size < EXT4_AES_256_CBC_KEY_SIZE); + res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw); +out: + if (keyring_key) + key_put(keyring_key); + if (res < 0) + 
crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID; + return res; +} + +int ext4_has_encryption_key(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + + return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID); +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f6d85272288893..61eb6b533c1c4e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2070,6 +2070,19 @@ static inline int ext4_sb_has_crypto(struct super_block *sb) } #endif +/* crypto_key.c */ +int ext4_generate_encryption_key(struct inode *inode); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +int ext4_has_encryption_key(struct inode *inode); +#else +static inline int ext4_has_encryption_key(struct inode *inode) +{ + return 0; +} +#endif + + /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 9d5d2e56cc4696..6a7c0c06b2be65 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -55,6 +55,9 @@ struct ext4_encryption_context { #define EXT4_AES_256_XTS_KEY_SIZE 64 #define EXT4_MAX_KEY_SIZE 64 +#define EXT4_KEY_DESC_PREFIX "ext4:" +#define EXT4_KEY_DESC_PREFIX_SIZE 5 + struct ext4_encryption_key { uint32_t mode; char raw[EXT4_MAX_KEY_SIZE]; From bc3365adb72d76416d227171d632deb2126e5e9b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 321/420] ext4 crypto: enforce context consistency Enforce the following inheritance policy: 1) An unencrypted directory may contain encrypted or unencrypted files or directories. 2) All files or directories in a directory must be protected using the same key as their containing directory. As a result, assuming the following setup: mke2fs -t ext4 -Fq -O encrypt /dev/vdc mount -t ext4 /dev/vdc /vdc mkdir /vdc/a /vdc/b /vdc/c echo foo | e4crypt add_key /vdc/a echo bar | e4crypt add_key /vdc/b for i in a b c ; do cp /etc/motd /vdc/$i/motd-$i ; done Then we will see the following results: cd /vdc mv a b # will fail; /vdc/a and /vdc/b have different keys mv b/motd-b a # will fail, see above ln a/motd-a b # will fail, see above mv c a # will fail; all inodes in an encrypted directory # must be encrypted ln c/motd-c b # will fail, see above mv a/motd-a c # will succeed mv c/motd-a a # will succeed Change-Id: I0eb702a4e5c426dfd38863ada7bdec3741e1ee8b Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 55dc956fac0254..0257713685a509 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1417,6 +1417,18 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi ino); return ERR_PTR(-EIO); } + if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) && + !ext4_is_child_context_consistent_with_parent(dir, + inode)) { + iput(inode); + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu\n", + (unsigned long) dir->i_ino, + (unsigned long) inode->i_ino); + return ERR_PTR(-EPERM); + } } return d_splice_alias(inode, dentry); } @@ -2938,7 +2950,9 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; - + if (ext4_encrypted_inode(dir) && + !ext4_is_child_context_consistent_with_parent(dir, inode)) + return -EPERM; 
dquot_initialize(dir); retry: @@ -3239,6 +3253,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; + if ((old.dir != new.dir) && + ext4_encrypted_inode(new.dir) && + !ext4_is_child_context_consistent_with_parent(new.dir, + old.inode)) { + retval = -EPERM; + goto end_rename; + } + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { From 40d39f5cb13460581164e3653ff36faefeb6699e Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 322/420] ext4 crypto: inherit encryption policies on inode and directory create Change-Id: Ic06753bdc015fc12b4f5620dfc15955765b1f117 Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0257713685a509..f2fab7a01dd5a3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2249,7 +2249,19 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = ext4_add_nondir(handle, dentry, inode); + err = 0; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (!err && ext4_encrypted_inode(dir)) { + err = ext4_inherit_context(dir, inode); + if (err) { + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + } + } +#endif + if (!err) + err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) ext4_handle_sync(handle); } @@ -2430,6 +2442,13 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_encrypted_inode(dir)) { + err = ext4_inherit_context(dir, inode); + if (err) + goto out_clear_inode; + } +#endif err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); From 14d7c22120f906861387fd86d19e87a720340c73 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 323/420] ext4 crypto: implement the ext4 encryption write path Pulls block_write_begin() into fs/ext4/inode.c because it might need to do a low-level read of the existing data, in which case we need to decrypt it. 
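In rough terms, the new ext4_block_write_begin() in the diff below keeps the
read-side logic of block_write_begin() and only adds a decryption step at the
end. The following is an illustrative sketch of that step, not the exact patch
(the helper name range_partially_covers() is hypothetical; buffer-head walking
and error handling are elided):

	/* Sketch: when write_begin must first read old blocks from disk,
	 * the data arrives as ciphertext and has to be decrypted in place
	 * before the caller copies new plaintext into the page. */
	if (!buffer_uptodate(bh) && range_partially_covers(bh, from, to)) {
		ll_rw_block(READ, 1, &bh);		/* low-level read */
		decrypt = ext4_encrypted_inode(inode) &&
			  S_ISREG(inode->i_mode);
	}
	wait_on_buffer(bh);
	if (!err && decrypt)
		err = ext4_decrypt_one(inode, page);	/* back to plaintext */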
Change-Id: Ib2067d50cb80e9017ccf7016b2e72683ebd4c74a Signed-off-by: Michael Halcrow Signed-off-by: Ildar Muslukhov Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 17 +++++++ fs/ext4/ialloc.c | 5 +++ fs/ext4/inode.c | 112 +++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/page-io.c | 45 ++++++++++++++++--- 4 files changed, 173 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0b16fb4c06d3e1..e45b650d70753d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3111,6 +3111,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, ex); + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); if (ret > 0) ret = 0; @@ -4911,6 +4914,20 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; + /* + * Encrypted inodes can't handle collapse range or insert + * range since we would need to re-encrypt blocks with a + * different IV or XTS tweak (which are based on the logical + * block number). + * + * XXX It's not clear why zero range isn't working, but we'll + * leave it disabled for encrypted inodes for now. This is a + * bug we should fix.... + */ + if (ext4_encrypted_inode(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))) + return -EOPNOTSUPP; + /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ac644c31ca6747..e554ca344047c3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -997,6 +997,11 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ei->i_block_group = group; ei->i_last_alloc_group = ~0; + /* If the directory encrypted, then we should encrypt the inode. 
*/ + if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) && + ext4_encrypted_inode(dir)) + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 31f7db99ac9ef2..ac2f0f4970a4ea 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -877,6 +877,95 @@ int do_journal_get_write_access(handle_t *handle, static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct inode *inode = page->mapping->host; + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + bool decrypt = false; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + bbits = ilog2(blocksize); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for (bh = head, block_start = 0; bh != head || !block_start; + block++, block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + break; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + clear_buffer_new(bh); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + continue; + } + if (block_end > to || block_start < from) + zero_user_segments(page, to, block_end, + block_start, from); + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++ = bh; + decrypt = ext4_encrypted_inode(inode) && + S_ISREG(inode->i_mode); + } + } + /* + * If we issued read requests, let them complete. 
+ */ + while (wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + err = -EIO; + } + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + else if (decrypt) + err = ext4_decrypt_one(inode, page); + return err; +} +#endif + static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -939,11 +1028,19 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_should_dioread_nolock(inode)) + ret = ext4_block_write_begin(page, pos, len, + ext4_get_block_write); + else + ret = ext4_block_write_begin(page, pos, len, + ext4_get_block); +#else if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); else ret = __block_write_begin(page, pos, len, ext4_get_block); - +#endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, page_buffers(page), from, to, NULL, @@ -2575,7 +2672,12 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ret = ext4_block_write_begin(page, pos, len, + ext4_da_get_block_prep); +#else ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); +#endif if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); @@ -3033,6 +3135,9 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); +#endif ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iter, offset, @@ -3096,6 +3201,11 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return 0; +#endif + /* * If we are doing data journalling we don't support O_DIRECT */ diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b24a2541a9baaa..da3e4dff941744 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -69,6 +69,10 @@ static void ext4_finish_bio(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct page *data_page = NULL; + struct ext4_crypto_ctx *ctx = NULL; +#endif struct buffer_head *bh, *head; unsigned bio_start = bvec->bv_offset; unsigned bio_end = bio_start + bvec->bv_len; @@ -78,6 +82,15 @@ static void ext4_finish_bio(struct bio *bio) if (!page) continue; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (!page->mapping) { + /* The bounce data pages are unmapped. 
*/ + data_page = page; + ctx = (struct ext4_crypto_ctx *)page_private(data_page); + page = ctx->control_page; + } +#endif + if (error) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); @@ -102,8 +115,13 @@ static void ext4_finish_bio(struct bio *bio) } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); local_irq_restore(flags); - if (!under_io) + if (!under_io) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ctx) + ext4_restore_control_page(data_page); +#endif end_page_writeback(page); + } } } @@ -378,6 +396,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, static int io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, + struct page *page, struct buffer_head *bh) { int ret; @@ -391,7 +410,7 @@ static int io_submit_add_bh(struct ext4_io_submit *io, if (ret) return ret; } - ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; io->io_next_block++; @@ -404,6 +423,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc, bool keep_towrite) { + struct page *data_page = NULL; struct inode *inode = page->mapping->host; unsigned block_start, blocksize; struct buffer_head *bh, *head; @@ -463,19 +483,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, set_buffer_async_write(bh); } while ((bh = bh->b_this_page) != head); - /* Now submit buffers to write */ bh = head = page_buffers(page); + + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + data_page = ext4_encrypt(inode, page); + if (IS_ERR(data_page)) { + ret = PTR_ERR(data_page); + data_page = NULL; + goto out; + } + } + + /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, bh); + ret = io_submit_add_bh(io, inode, + data_page ? data_page : page, bh); if (ret) { /* * We only get here on ENOMEM. Not much else * we can do but mark the page as dirty, and * better luck next time. */ - redirty_page_for_writepage(wbc, page); break; } nr_submitted++; @@ -484,6 +514,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* Error stopped previous loop? Clean up buffers... 
*/ if (ret) { + out: + if (data_page) + ext4_restore_control_page(data_page); + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + redirty_page_for_writepage(wbc, page); do { clear_buffer_async_write(bh); bh = bh->b_this_page; From 5288dc73a32103877a1da873db60b37babe2104d Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 324/420] ext4 crypto: implement the ext4 decryption read path Change-Id: I69043c9b36be0f8db1e80dfba54382d5328d9d4b Signed-off-by: Michael Halcrow Signed-off-by: Ildar Muslukhov Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 18 +++++++++++-- fs/ext4/inode.c | 7 +++++ fs/ext4/readpage.c | 66 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8c0af316..009fc128172dfa 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -200,6 +200,13 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct inode *inode = file->f_mapping->host; + + if (ext4_encrypted_inode(inode)) { + int err = ext4_generate_encryption_key(inode); + if (err) + return 0; + } file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; return 0; @@ -212,6 +219,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; + int ret; if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && !(sb->s_flags & MS_RDONLY))) { @@ -250,11 +258,17 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * writing and the journal is present */ if (filp->f_mode & FMODE_WRITE) { - int ret = ext4_inode_attach_jinode(inode); + ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } - return dquot_file_open(inode, filp); + ret = dquot_file_open(inode, filp); + if (!ret && ext4_encrypted_inode(inode)) { + ret = ext4_generate_encryption_key(inode); + if (ret) + ret = -EACCES; + } + return ret; } /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ac2f0f4970a4ea..13a923f9d67676 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3385,6 +3385,13 @@ static int ext4_block_zero_page_range(handle_t *handle, /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) goto unlock; + if (S_ISREG(inode->i_mode) && + ext4_encrypted_inode(inode)) { + /* We expect the key to be set. */ + BUG_ON(!ext4_has_encryption_key(inode)); + BUG_ON(blocksize != PAGE_CACHE_SIZE); + WARN_ON_ONCE(ext4_decrypt_one(inode, page)); + } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index fff9fe6aacf85a..171b9ac4b45e94 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -46,6 +46,46 @@ #include "ext4.h" +/* + * Call ext4_decrypt on every single page, reusing the encryption + * context. 
+ */ +static void completion_pages(struct work_struct *work) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_crypto_ctx *ctx = + container_of(work, struct ext4_crypto_ctx, work); + struct bio *bio = ctx->bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + int ret = ext4_decrypt(ctx, page); + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else + SetPageUptodate(page); + unlock_page(page); + } + ext4_release_crypto_ctx(ctx); + bio_put(bio); +#else + BUG(); +#endif +} + +static inline bool ext4_bio_encrypted(struct bio *bio) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + return unlikely(bio->bi_private != NULL); +#else + return false; +#endif +} + /* * I/O completion handler for multipage BIOs. * @@ -63,6 +103,18 @@ static void mpage_end_io(struct bio *bio, int err) struct bio_vec *bv; int i; + if (ext4_bio_encrypted(bio)) { + struct ext4_crypto_ctx *ctx = bio->bi_private; + + if (err) { + ext4_release_crypto_ctx(ctx); + } else { + INIT_WORK(&ctx->work, completion_pages); + ctx->bio = bio; + queue_work(ext4_read_workqueue, &ctx->work); + return; + } + } bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; @@ -223,13 +275,25 @@ int ext4_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { + struct ext4_crypto_ctx *ctx = NULL; + + if (ext4_encrypted_inode(inode) && + S_ISREG(inode->i_mode)) { + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + goto set_error_page; + } bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, bio_get_nr_vecs(bdev))); - if (!bio) + if (!bio) { + if (ctx) + ext4_release_crypto_ctx(ctx); goto set_error_page; + } bio->bi_bdev = bdev; bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; + bio->bi_private = ctx; } length = first_hole << blkbits; From 771f3ff8f2bd3f641bdba670f7f6d21b8fea37b3 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 325/420] ext4 crypto: filename encryption facilities Change-Id: I52b2ad72b4599a720f6f7db27acb6a39fa2265c9 Signed-off-by: Uday Savagaonkar Signed-off-by: Ildar Muslukhov Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 3 +- fs/ext4/crypto_fname.c | 709 ++++++++++++++++++++++++++++++++++++++++ fs/ext4/crypto_policy.c | 7 + fs/ext4/ext4.h | 41 +++ fs/ext4/ext4_crypto.h | 20 ++ 5 files changed, 779 insertions(+), 1 deletion(-) create mode 100644 fs/ext4/crypto_fname.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 4e5af21f105065..75285ea9aa05a6 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -12,4 +12,5 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o crypto_key.o +ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \ + crypto_key.o crypto_fname.o diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c new file mode 100644 index 00000000000000..ca2f5948c1ac52 --- /dev/null +++ b/fs/ext4/crypto_fname.c @@ -0,0 +1,709 @@ +/* + * linux/fs/ext4/crypto_fname.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains functions for filename crypto management in ext4 + * + * Written by Uday Savagaonkar, 2014. + * + * This has not yet undergone a rigorous security audit. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_crypto.h" +#include "xattr.h" + +/** + * ext4_dir_crypt_complete() - + */ +static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res) +{ + struct ext4_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +bool ext4_valid_filenames_enc_mode(uint32_t mode) +{ + return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); +} + +/** + * ext4_fname_encrypt() - + * + * This function encrypts the input filename, and returns the length of the + * ciphertext. Errors are returned as negative numbers. We trust the caller to + * allocate sufficient memory to oname string. + */ +static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct ext4_str *oname) +{ + u32 ciphertext_len; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct crypto_ablkcipher *tfm = ctx->ctfm; + int res = 0; + char iv[EXT4_CRYPTO_BLOCK_SIZE]; + struct scatterlist sg[1]; + char *workbuf; + + if (iname->len <= 0 || iname->len > ctx->lim) + return -EIO; + + ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? + EXT4_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = (ciphertext_len > ctx->lim) + ? ctx->lim : ciphertext_len; + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited( + KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_dir_crypt_complete, &ecr); + + /* Map the workpage */ + workbuf = kmap(ctx->workpage); + + /* Copy the input */ + memcpy(workbuf, iname->name, iname->len); + if (iname->len < ciphertext_len) + memset(workbuf + iname->len, 0, ciphertext_len - iname->len); + + /* Initialize IV */ + memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); + + /* Create encryption request */ + sg_init_table(sg, 1); + sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); + ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + if (res >= 0) { + /* Copy the result to output */ + memcpy(oname->name, workbuf, ciphertext_len); + res = ciphertext_len; + } + kunmap(ctx->workpage); + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited( + KERN_ERR "%s: Error (error code %d)\n", __func__, res); + } + oname->len = ciphertext_len; + return res; +} + +/* + * ext4_fname_decrypt() + * This function decrypts the input filename, and returns + * the length of the plaintext. + * Errors are returned as negative numbers. + * We trust the caller to allocate sufficient memory to oname string. 
+ */ +static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, + const struct ext4_str *iname, + struct ext4_str *oname) +{ + struct ext4_str tmp_in[2], tmp_out[1]; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist sg[1]; + struct crypto_ablkcipher *tfm = ctx->ctfm; + int res = 0; + char iv[EXT4_CRYPTO_BLOCK_SIZE]; + char *workbuf; + + if (iname->len <= 0 || iname->len > ctx->lim) + return -EIO; + + tmp_in[0].name = iname->name; + tmp_in[0].len = iname->len; + tmp_out[0].name = oname->name; + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited( + KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_dir_crypt_complete, &ecr); + + /* Map the workpage */ + workbuf = kmap(ctx->workpage); + + /* Copy the input */ + memcpy(workbuf, iname->name, iname->len); + + /* Initialize IV */ + memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); + + /* Create encryption request */ + sg_init_table(sg, 1); + sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); + ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + res = crypto_ablkcipher_decrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + if (res >= 0) { + /* Copy the result to output */ + memcpy(oname->name, workbuf, iname->len); + res = iname->len; + } + kunmap(ctx->workpage); + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited( + KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n", + __func__, res); + return res; + } + + oname->len = strnlen(oname->name, iname->len); + return oname->len; +} + +/** + * ext4_fname_encode_digest() - + * + * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. + * The encoded string is roughly 4/3 times the size of the input string. + */ +int ext4_fname_encode_digest(char *dst, char *src, u32 len) +{ + static const char *lookup_table = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_+"; + u32 current_chunk, num_chunks, i; + char tmp_buf[3]; + u32 c0, c1, c2, c3; + + current_chunk = 0; + num_chunks = len/3; + for (i = 0; i < num_chunks; i++) { + c0 = src[3*i] & 0x3f; + c1 = (((src[3*i]>>6)&0x3) | ((src[3*i+1] & 0xf)<<2)) & 0x3f; + c2 = (((src[3*i+1]>>4)&0xf) | ((src[3*i+2] & 0x3)<<4)) & 0x3f; + c3 = (src[3*i+2]>>2) & 0x3f; + dst[4*i] = lookup_table[c0]; + dst[4*i+1] = lookup_table[c1]; + dst[4*i+2] = lookup_table[c2]; + dst[4*i+3] = lookup_table[c3]; + } + if (i*3 < len) { + memset(tmp_buf, 0, 3); + memcpy(tmp_buf, &src[3*i], len-3*i); + c0 = tmp_buf[0] & 0x3f; + c1 = (((tmp_buf[0]>>6)&0x3) | ((tmp_buf[1] & 0xf)<<2)) & 0x3f; + c2 = (((tmp_buf[1]>>4)&0xf) | ((tmp_buf[2] & 0x3)<<4)) & 0x3f; + c3 = (tmp_buf[2]>>2) & 0x3f; + dst[4*i] = lookup_table[c0]; + dst[4*i+1] = lookup_table[c1]; + dst[4*i+2] = lookup_table[c2]; + dst[4*i+3] = lookup_table[c3]; + i++; + } + return (i * 4); +} + +/** + * ext4_fname_hash() - + * + * This function computes the hash of the input filename, and sets the output + * buffer to the *encoded* digest. It returns the length of the digest as its + * return value. Errors are returned as negative numbers. We trust the caller + * to allocate sufficient memory to oname string. 
+ */ +static int ext4_fname_hash(struct ext4_fname_crypto_ctx *ctx, + const struct ext4_str *iname, + struct ext4_str *oname) +{ + struct scatterlist sg; + struct hash_desc desc = { + .tfm = (struct crypto_hash *)ctx->htfm, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int res = 0; + + if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { + res = ext4_fname_encode_digest(oname->name, iname->name, + iname->len); + oname->len = res; + return res; + } + + sg_init_one(&sg, iname->name, iname->len); + res = crypto_hash_init(&desc); + if (res) { + printk(KERN_ERR + "%s: Error initializing crypto hash; res = [%d]\n", + __func__, res); + goto out; + } + res = crypto_hash_update(&desc, &sg, iname->len); + if (res) { + printk(KERN_ERR + "%s: Error updating crypto hash; res = [%d]\n", + __func__, res); + goto out; + } + res = crypto_hash_final(&desc, + &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE]); + if (res) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; res = [%d]\n", + __func__, res); + goto out; + } + /* Encode the digest as a printable string--this will increase the + * size of the digest */ + oname->name[0] = 'I'; + res = ext4_fname_encode_digest(oname->name+1, + &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE], + EXT4_FNAME_CRYPTO_DIGEST_SIZE) + 1; + oname->len = res; +out: + return res; +} + +/** + * ext4_free_fname_crypto_ctx() - + * + * Frees up a crypto context. + */ +void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx) +{ + if (ctx == NULL || IS_ERR(ctx)) + return; + + if (ctx->ctfm && !IS_ERR(ctx->ctfm)) + crypto_free_ablkcipher(ctx->ctfm); + if (ctx->htfm && !IS_ERR(ctx->htfm)) + crypto_free_hash(ctx->htfm); + if (ctx->workpage && !IS_ERR(ctx->workpage)) + __free_page(ctx->workpage); + kfree(ctx); +} + +/** + * ext4_put_fname_crypto_ctx() - + * + * Return: The crypto context onto free list. If the free list is above a + * threshold, completely frees up the context, and returns the memory. + * + * TODO: Currently we directly free the crypto context. Eventually we should + * add code it to return to free list. Such an approach will increase + * efficiency of directory lookup. + */ +void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) +{ + if (*ctx == NULL || IS_ERR(*ctx)) + return; + ext4_free_fname_crypto_ctx(*ctx); + *ctx = NULL; +} + +/** + * ext4_search_fname_crypto_ctx() - + */ +static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx( + const struct ext4_encryption_key *key) +{ + return NULL; +} + +/** + * ext4_alloc_fname_crypto_ctx() - + */ +struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( + const struct ext4_encryption_key *key) +{ + struct ext4_fname_crypto_ctx *ctx; + + ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS); + if (ctx == NULL) + return ERR_PTR(-ENOMEM); + if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) { + /* This will automatically set key mode to invalid + * As enum for ENCRYPTION_MODE_INVALID is zero */ + memset(&ctx->key, 0, sizeof(ctx->key)); + } else { + memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key)); + } + ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode) + ? 0 : 1; + ctx->ctfm_key_is_ready = 0; + ctx->ctfm = NULL; + ctx->htfm = NULL; + ctx->workpage = NULL; + return ctx; +} + +/** + * ext4_get_fname_crypto_ctx() - + * + * Allocates a free crypto context and initializes it to hold + * the crypto material for the inode. + * + * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise. 
+ */ +struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( + struct inode *inode, u32 max_ciphertext_len) +{ + struct ext4_fname_crypto_ctx *ctx; + struct ext4_inode_info *ei = EXT4_I(inode); + int res; + + /* Check if the crypto policy is set on the inode */ + res = ext4_encrypted_inode(inode); + if (res == 0) + return NULL; + + if (!ext4_has_encryption_key(inode)) + ext4_generate_encryption_key(inode); + + /* Get a crypto context based on the key. + * A new context is allocated if no context matches the requested key. + */ + ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key)); + if (ctx == NULL) + ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key)); + if (IS_ERR(ctx)) + return ctx; + + if (ctx->has_valid_key) { + if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { + printk_once(KERN_WARNING + "ext4: unsupported key mode %d\n", + ctx->key.mode); + return ERR_PTR(-ENOKEY); + } + + /* As a first cut, we will allocate new tfm in every call. + * later, we will keep the tfm around, in case the key gets + * re-used */ + if (ctx->ctfm == NULL) { + ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", + 0, 0); + } + if (IS_ERR(ctx->ctfm)) { + res = PTR_ERR(ctx->ctfm); + printk( + KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", + __func__, res); + ctx->ctfm = NULL; + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(res); + } + if (ctx->ctfm == NULL) { + printk( + KERN_DEBUG "%s: could not allocate crypto tfm\n", + __func__); + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-ENOMEM); + } + if (ctx->workpage == NULL) + ctx->workpage = alloc_page(GFP_NOFS); + if (IS_ERR(ctx->workpage)) { + res = PTR_ERR(ctx->workpage); + printk( + KERN_DEBUG "%s: error (%d) allocating work page\n", + __func__, res); + ctx->workpage = NULL; + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(res); + } + if (ctx->workpage == NULL) { + printk( + KERN_DEBUG "%s: could not allocate work page\n", + __func__); + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-ENOMEM); + } + ctx->lim = max_ciphertext_len; + crypto_ablkcipher_clear_flags(ctx->ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + + /* If we are lucky, we will get a context that is already + * set up with the right key. Else, we will have to + * set the key */ + if (!ctx->ctfm_key_is_ready) { + /* Since our crypto objectives for filename encryption + * are pretty weak, + * we directly use the inode master key */ + res = crypto_ablkcipher_setkey(ctx->ctfm, + ctx->key.raw, ctx->key.size); + if (res) { + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-EIO); + } + ctx->ctfm_key_is_ready = 1; + } else { + /* In the current implementation, key should never be + * marked "ready" for a context that has just been + * allocated. 
So we should never reach here */ + BUG(); + } + } + if (ctx->htfm == NULL) + ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ctx->htfm)) { + res = PTR_ERR(ctx->htfm); + printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n", + __func__, res); + ctx->htfm = NULL; + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(res); + } + if (ctx->htfm == NULL) { + printk(KERN_DEBUG "%s: could not allocate hash tfm\n", + __func__); + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-ENOMEM); + } + + return ctx; +} + +/** + * ext4_fname_crypto_round_up() - + * + * Return: The next multiple of block size + */ +u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) +{ + return ((size+blksize-1)/blksize)*blksize; +} + +/** + * ext4_fname_crypto_namelen_on_disk() - + */ +int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, + u32 namelen) +{ + u32 ciphertext_len; + + if (ctx == NULL) + return -EIO; + if (!(ctx->has_valid_key)) + return -EACCES; + ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? + EXT4_CRYPTO_BLOCK_SIZE : namelen; + ciphertext_len = (ciphertext_len > ctx->lim) + ? ctx->lim : ciphertext_len; + return (int) ciphertext_len; +} + +/** + * ext4_fname_crypto_alloc_obuff() - + * + * Allocates an output buffer that is sufficient for the crypto operation + * specified by the context and the direction. + */ +int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, + u32 ilen, struct ext4_str *crypto_str) +{ + unsigned int olen; + + if (!ctx) + return -EIO; + olen = ext4_fname_crypto_round_up(ilen, EXT4_CRYPTO_BLOCK_SIZE); + crypto_str->len = olen; + if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) + olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; + /* Allocated buffer can hold one more character to null-terminate the + * string */ + crypto_str->name = kmalloc(olen+1, GFP_NOFS); + if (!(crypto_str->name)) + return -ENOMEM; + return 0; +} + +/** + * ext4_fname_crypto_free_buffer() - + * + * Frees the buffer allocated for crypto operation. + */ +void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) +{ + if (!crypto_str) + return; + kfree(crypto_str->name); + crypto_str->name = NULL; +} + +/** + * ext4_fname_disk_to_usr() - converts a filename from disk space to user space + */ +int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + const struct ext4_str *iname, + struct ext4_str *oname) +{ + if (ctx == NULL) + return -EIO; + if (iname->len < 3) { + /*Check for . and .. */ + if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') { + oname->name[0] = '.'; + oname->name[iname->len-1] = '.'; + oname->len = iname->len; + return oname->len; + } + } + if (ctx->has_valid_key) + return ext4_fname_decrypt(ctx, iname, oname); + else + return ext4_fname_hash(ctx, iname, oname); +} + +int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + const struct ext4_dir_entry_2 *de, + struct ext4_str *oname) +{ + struct ext4_str iname = {.name = (unsigned char *) de->name, + .len = de->name_len }; + + return _ext4_fname_disk_to_usr(ctx, &iname, oname); +} + + +/** + * ext4_fname_usr_to_disk() - converts a filename from user space to disk space + */ +int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct ext4_str *oname) +{ + int res; + + if (ctx == NULL) + return -EIO; + if (iname->len < 3) { + /*Check for . and .. */ + if (iname->name[0] == '.' 
&& + iname->name[iname->len-1] == '.') { + oname->name[0] = '.'; + oname->name[iname->len-1] = '.'; + oname->len = iname->len; + return oname->len; + } + } + if (ctx->has_valid_key) { + res = ext4_fname_encrypt(ctx, iname, oname); + return res; + } + /* Without a proper key, a user is not allowed to modify the filenames + * in a directory. Consequently, a user space name cannot be mapped to + * a disk-space name */ + return -EACCES; +} + +/* + * Calculate the htree hash from a filename from user space + */ +int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct dx_hash_info *hinfo) +{ + struct ext4_str tmp, tmp2; + int ret = 0; + + if (!ctx || !ctx->has_valid_key || + ((iname->name[0] == '.') && + ((iname->len == 1) || + ((iname->name[1] == '.') && (iname->len == 2))))) { + ext4fs_dirhash(iname->name, iname->len, hinfo); + return 0; + } + + /* First encrypt the plaintext name */ + ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); + if (ret < 0) + return ret; + + ret = ext4_fname_encrypt(ctx, iname, &tmp); + if (ret < 0) + goto out; + + tmp2.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1; + tmp2.name = kmalloc(tmp2.len + 1, GFP_KERNEL); + if (tmp2.name == NULL) { + ret = -ENOMEM; + goto out; + } + + ret = ext4_fname_hash(ctx, &tmp, &tmp2); + if (ret > 0) + ext4fs_dirhash(tmp2.name, tmp2.len, hinfo); + ext4_fname_crypto_free_buffer(&tmp2); +out: + ext4_fname_crypto_free_buffer(&tmp); + return ret; +} + +/** + * ext4_fname_disk_to_htree() - converts a filename from disk space to htree-access string + */ +int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx, + const struct ext4_dir_entry_2 *de, + struct dx_hash_info *hinfo) +{ + struct ext4_str iname = {.name = (unsigned char *) de->name, + .len = de->name_len}; + struct ext4_str tmp; + int ret; + + if (!ctx || + ((iname.name[0] == '.') && + ((iname.len == 1) || + ((iname.name[1] == '.') && (iname.len == 2))))) { + ext4fs_dirhash(iname.name, iname.len, hinfo); + return 0; + } + + tmp.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1; + tmp.name = kmalloc(tmp.len + 1, GFP_KERNEL); + if (tmp.name == NULL) + return -ENOMEM; + + ret = ext4_fname_hash(ctx, &iname, &tmp); + if (ret > 0) + ext4fs_dirhash(tmp.name, tmp.len, hinfo); + ext4_fname_crypto_free_buffer(&tmp); + return ret; +} diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a4bf762b3ba946..749ed6e91e50b6 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -59,6 +59,13 @@ static int ext4_create_encryption_context_from_policy( res = -EINVAL; goto out; } + if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + res = -EINVAL; + goto out; + } ctx.contents_encryption_mode = policy->contents_encryption_mode; ctx.filenames_encryption_mode = policy->filenames_encryption_mode; BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 61eb6b533c1c4e..4eb0d6dcbe8ea7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2070,6 +2070,47 @@ static inline int ext4_sb_has_crypto(struct super_block *sb) } #endif +/* crypto_fname.c */ +bool ext4_valid_filenames_enc_mode(uint32_t mode); +u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); +int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, + u32 ilen, struct ext4_str *crypto_str); +int _ext4_fname_disk_to_usr(struct 
ext4_fname_crypto_ctx *ctx, + const struct ext4_str *iname, + struct ext4_str *oname); +int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + const struct ext4_dir_entry_2 *de, + struct ext4_str *oname); +int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct ext4_str *oname); +int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct dx_hash_info *hinfo); +int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx, + const struct ext4_dir_entry_2 *de, + struct dx_hash_info *hinfo); +int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, + u32 namelen); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); +struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, + u32 max_len); +void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); +#else +static inline +void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { } +static inline +struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, + u32 max_len) +{ + return NULL; +} +static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } +#endif + + /* crypto_key.c */ int ext4_generate_encryption_key(struct inode *inode); diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 6a7c0c06b2be65..f7d46e8dc9d30c 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -104,4 +104,24 @@ static inline int ext4_encryption_key_size(int mode) return 0; } +#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4 +#define EXT4_CRYPTO_BLOCK_SIZE 16 +#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32 + +struct ext4_str { + unsigned char *name; + u32 len; +}; + +struct ext4_fname_crypto_ctx { + u32 lim; + char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE]; + struct crypto_ablkcipher *ctfm; + struct crypto_hash *htfm; + struct page *workpage; + struct ext4_encryption_key key; + unsigned has_valid_key : 1; + unsigned ctfm_key_is_ready : 1; +}; + #endif /* _EXT4_CRYPTO_H */ From 42025fdf92927b988fd4add24efd50737c95d8fd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 326/420] ext4 crypto: teach ext4_htree_store_dirent() to store decrypted filenames For encrypted directories, we need to pass in a separate parameter for the decrypted filename, since the directory entry contains the encrypted filename. Change-Id: Ie28cbb198c41daa743a3a18ab25ff2e4d016c275 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 15 ++++++++++----- fs/ext4/ext4.h | 5 +++-- fs/ext4/inline.c | 7 +++++-- fs/ext4/namei.c | 21 +++++++++++++++++---- 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index c24143ea9c08e4..f67f9559160d94 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -384,10 +384,15 @@ void ext4_htree_free_dir_info(struct dir_private_info *p) /* * Given a directory entry, enter it into the fname rb tree. + * + * When filename encryption is enabled, the dirent will hold the + * encrypted filename, while the htree will hold decrypted filename. + * The decrypted filename is passed in via ent_name. parameter. 
*/ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, - struct ext4_dir_entry_2 *dirent) + struct ext4_dir_entry_2 *dirent, + struct ext4_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; @@ -398,17 +403,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, p = &info->root.rb_node; /* Create and allocate the fname structure */ - len = sizeof(struct fname) + dirent->name_len + 1; + len = sizeof(struct fname) + ent_name->len + 1; new_fn = kzalloc(len, GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); - new_fn->name_len = dirent->name_len; + new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; - memcpy(new_fn->name, dirent->name, dirent->name_len); - new_fn->name[dirent->name_len] = 0; + memcpy(new_fn->name, ent_name->name, ent_name->len); + new_fn->name[ent_name->len] = 0; while (*p) { parent = *p; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4eb0d6dcbe8ea7..f613337a0d7cef 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2134,8 +2134,9 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, - __u32 minor_hash, - struct ext4_dir_entry_2 *dirent); + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct ext4_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3ea62695abce7b..956a32badc0123 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1323,6 +1323,7 @@ int htree_inlinedir_to_tree(struct file *dir_file, struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; + struct ext4_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1394,8 +1395,10 @@ int htree_inlinedir_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - err = ext4_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de); + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, hinfo->hash, + hinfo->minor_hash, de, &tmp_str); if (err) { count = err; goto out; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f2fab7a01dd5a3..c6333c3683aef2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -878,6 +878,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; + struct ext4_str tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -904,8 +905,11 @@ static int htree_dirblock_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - if ((err = ext4_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de)) != 0) { + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, &tmp_str); + if (err != 0) { brelse(bh); return err; } @@ -935,6 +939,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int count = 0; int ret, err; __u32 hashval; + struct ext4_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); @@ -970,14 +975,22 @@ int 
ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 0, 0, + de, &tmp_str); + if (err != 0) goto errout; count++; } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; de = ext4_next_entry(de, dir->i_sb->s_blocksize); - if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 2, 0, + de, &tmp_str); + if (err != 0) goto errout; count++; } From a96a4e88f53bec9363b8bd1a902eb5258ca42685 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 327/420] ext4 crypto: insert encrypted filenames into a leaf directory block Change-Id: I7c222d4b7ba8be500f228d24598ec4f402baffd3 Signed-off-by: Uday Savagaonkar Signed-off-by: Ildar Muslukhov Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ++- fs/ext4/inline.c | 7 +++-- fs/ext4/namei.c | 81 ++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 79 insertions(+), 13 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f613337a0d7cef..0c4e5a70dda436 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2143,9 +2143,11 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, void *buf, int buf_size, const char *name, int namelen, struct ext4_dir_entry_2 **dest_de); -void ext4_insert_dentry(struct inode *inode, +int ext4_insert_dentry(struct inode *dir, + struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, + const struct qstr *iname, const char *name, int namelen); static inline void ext4_update_dx_flag(struct inode *inode) { diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 956a32badc0123..1805b51e2a0d58 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -11,11 +11,13 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ + +#include + #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "truncate.h" -#include #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -1010,7 +1012,8 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(inode, de, inline_size, name, namelen); + ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name, + name, namelen); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c6333c3683aef2..02f959bb5a3728 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1666,19 +1666,49 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return 0; } -void ext4_insert_dentry(struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const char *name, int namelen) +int ext4_insert_dentry(struct inode *dir, + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const struct qstr *iname, + const char *name, int namelen) { int nlen, rlen; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + struct ext4_str tmp_str; + int res; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return -EIO; + /* By default, the input name would be written to the disk */ + tmp_str.name = (unsigned char *)name; + tmp_str.len = namelen; + if (ctx != NULL) { + /* Directory is encrypted */ + res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + return -ENOMEM; + } + res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return res; + } + tmp_str.name = fname_crypto_str.name; + tmp_str.len = fname_crypto_str.len; + } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = - (struct ext4_dir_entry_2 *)((char *)de + nlen); + (struct ext4_dir_entry_2 *)((char *)de + nlen); de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); de = de1; @@ -1686,9 +1716,14 @@ void ext4_insert_dentry(struct inode *inode, de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); - de->name_len = namelen; - memcpy(de->name, name, namelen); + de->name_len = tmp_str.len; + + memcpy(de->name, tmp_str.name, tmp_str.len); + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return 0; } + /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1725,8 +1760,12 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, return err; } - /* By now the buffer is marked for journaling */ - ext4_insert_dentry(inode, de, blocksize, name, namelen); + /* By now the buffer is marked for journaling. 
Due to crypto operations, + * the following function call may fail */ + err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name, + name, namelen); + if (err < 0) + return err; /* * XXX shouldn't update any times until successful @@ -1758,8 +1797,13 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { struct inode *dir = dentry->d_parent->d_inode; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_fname_crypto_ctx *ctx = NULL; + int res; +#else const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; +#endif struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -1773,7 +1817,13 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; - int csum_size = 0; + int csum_size = 0; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); +#endif if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -1840,7 +1890,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + ext4_mark_inode_dirty(handle, dir); + brelse(bh); + return res; + } + ext4_put_fname_crypto_ctx(&ctx); +#else ext4fs_dirhash(name, namelen, &hinfo); +#endif memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; From 1a038321d4016684efd2d3052b2cb53a8ab6e4a2 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 328/420] ext4 crypto: partial update to namei.c for fname crypto Modifies dx_show_leaf and dx_probe to support fname encryption. Filename encryption not yet enabled. 
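For an encrypted directory the htree hash has to be computed over the name as it is stored on disk: dx_probe() now calls ext4_fname_usr_to_hash(), which encrypts the user-supplied name before hashing it, so a lookup lands in the same hash bucket as the entry that was written at create time (entries read from disk are hashed as-is via ext4_fname_disk_to_hash()). The userspace sketch below only illustrates that invariant; the XOR "cipher" and the djb2-style hash are stand-ins for the real CTS-AES filename encryption and ext4fs_dirhash(), not the kernel interfaces.

    /* Toy model: hashing the ciphertext on both the create and lookup
     * paths yields the same hash, so the htree stays consistent even
     * though the plaintext name is never hashed. */
    #include <stdio.h>
    #include <string.h>

    static void toy_encrypt(const char *in, size_t len, char *out)
    {
        /* stand-in for ext4_fname_usr_to_disk(): any deterministic
         * per-directory transform works for the illustration */
        for (size_t i = 0; i < len; i++)
            out[i] = in[i] ^ 0x5a;
    }

    static unsigned int toy_hash(const char *s, size_t len)
    {
        unsigned int h = 5381;          /* stand-in for ext4fs_dirhash() */

        for (size_t i = 0; i < len; i++)
            h = h * 33 + (unsigned char) s[i];
        return h;
    }

    int main(void)
    {
        const char *name = "secret.txt";
        size_t len = strlen(name);
        char disk_name[64], probe[64];

        /* create path: hash the on-disk (encrypted) name */
        toy_encrypt(name, len, disk_name);
        unsigned int stored = toy_hash(disk_name, len);

        /* lookup path (dx_probe): encrypt the user name, then hash */
        toy_encrypt(name, len, probe);
        unsigned int lookup = toy_hash(probe, len);

        printf("stored=%08x lookup=%08x match=%d\n",
               stored, lookup, stored == lookup);
        return 0;
    }
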
Change-Id: Ib95c5819180bc67f8904f677224e1b7e02019aaf Signed-off-by: Uday Savagaonkar Signed-off-by: Ildar Muslukhov Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 109 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 8 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 02f959bb5a3728..346f02a09cf312 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -586,8 +586,10 @@ struct stats unsigned bcount; }; -static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, - int size, int show_names) +static struct stats dx_show_leaf(struct inode *dir, + struct dx_hash_info *hinfo, + struct ext4_dir_entry_2 *de, + int size, int show_names) { unsigned names = 0, space = 0; char *base = (char *) de; @@ -600,12 +602,80 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent { if (show_names) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + int len; + char *name; + struct ext4_str fname_crypto_str + = {.name = NULL, .len = 0}; + struct ext4_fname_crypto_ctx *ctx = NULL; + int res; + + name = de->name; + len = de->name_len; + ctx = ext4_get_fname_crypto_ctx(dir, + EXT4_NAME_LEN); + if (IS_ERR(ctx)) { + printk(KERN_WARNING "Error acquiring" + " crypto ctxt--skipping crypto\n"); + ctx = NULL; + } + if (ctx == NULL) { + /* Directory is not encrypted */ + ext4fs_dirhash(de->name, + de->name_len, &h); + printk("%*.s:(U)%x.%u ", len, + name, h.hash, + (unsigned) ((char *) de + - base)); + } else { + /* Directory is encrypted */ + res = ext4_fname_crypto_alloc_buffer( + ctx, de->name_len, + &fname_crypto_str); + if (res < 0) { + printk(KERN_WARNING "Error " + "allocating crypto " + "buffer--skipping " + "crypto\n"); + ext4_put_fname_crypto_ctx(&ctx); + ctx = NULL; + } + res = ext4_fname_disk_to_usr(ctx, de, + &fname_crypto_str); + if (res < 0) { + printk(KERN_WARNING "Error " + "converting filename " + "from disk to usr" + "\n"); + name = "??"; + len = 2; + } else { + name = fname_crypto_str.name; + len = fname_crypto_str.len; + } + res = ext4_fname_disk_to_hash(ctx, de, + &h); + if (res < 0) { + printk(KERN_WARNING "Error " + "converting filename " + "from disk to htree" + "\n"); + h.hash = 0xDEADBEEF; + } + printk("%*.s:(E)%x.%u ", len, name, + h.hash, (unsigned) ((char *) de + - base)); + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer( + &fname_crypto_str); + } +#else int len = de->name_len; char *name = de->name; - while (len--) printk("%c", *name++); ext4fs_dirhash(de->name, de->name_len, &h); - printk(":%x.%u ", h.hash, + printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); +#endif } space += EXT4_DIR_REC_LEN(de->name_len); names++; @@ -623,7 +693,6 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; - int err; printk("%i indexed blocks...\n", count); for (i = 0; i < count; i++, entries++) { @@ -637,7 +706,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, continue; stats = levels? 
dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): - dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); + dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) + bh->b_data, blocksize, 0); names += stats.names; space += stats.space; bcount += stats.bcount; @@ -687,8 +757,28 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (d_name) { + struct ext4_fname_crypto_ctx *ctx = NULL; + int res; + + /* Check if the directory is encrypted */ + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) { + ret_err = ERR_PTR(PTR_ERR(ctx)); + goto fail; + } + res = ext4_fname_usr_to_hash(ctx, d_name, hinfo); + if (res < 0) { + ret_err = ERR_PTR(res); + goto fail; + } + ext4_put_fname_crypto_ctx(&ctx); + } +#else if (d_name) ext4fs_dirhash(d_name->name, d_name->len, hinfo); +#endif hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -773,6 +863,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, brelse(frame->bh); frame--; } + if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning(dir->i_sb, "Corrupt dir inode %lu, running e2fsck is " @@ -1605,8 +1696,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, initialize_dirent_tail(t, blocksize); } - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1, + blocksize, 1)); + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); /* Which block gets the new entry? */ if (hinfo->hash >= hash2) { From b317d500bca8ba86a69874dab90f80324253c640 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 329/420] ext4 crypto: filename encryption modifications Modifies htree_dirblock_to_tree, dx_make_map, ext4_match search_dir, and ext4_find_dest_de to support fname crypto. Filename encryption feature is not yet enabled at this patch. 
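The common pattern in these helpers is that a lookup can no longer compare the caller's name against de->name directly: when the directory has a crypto context, ext4_match() first decrypts the candidate entry into a scratch buffer and compares the plaintext, and only falls back to the old length-plus-memcmp path for unencrypted directories. A minimal userspace sketch of that two-branch shape follows; the dentry_disk struct, the XOR "decryption" and the fixed scratch buffer are illustrative stand-ins, not the kernel API, and error handling is omitted.

    #include <stdio.h>
    #include <string.h>

    struct dentry_disk {            /* stand-in for struct ext4_dir_entry_2 */
        const char *name;           /* possibly encrypted bytes */
        size_t name_len;
    };

    /* stand-in for ext4_fname_disk_to_usr(): toy deterministic decryption */
    static size_t toy_decrypt(const struct dentry_disk *de, char *out)
    {
        for (size_t i = 0; i < de->name_len; i++)
            out[i] = de->name[i] ^ 0x5a;
        return de->name_len;
    }

    /* the shape of the new ext4_match(): 1 = match, 0 = no match */
    static int match(int encrypted, const char *name, size_t len,
                     const struct dentry_disk *de)
    {
        char buf[256];

        if (encrypted) {
            size_t plen = toy_decrypt(de, buf);
            return plen == len && memcmp(name, buf, len) == 0;
        }
        return de->name_len == len && memcmp(name, de->name, len) == 0;
    }

    int main(void)
    {
        char cipher[16];
        size_t n = strlen("foo.conf");

        memcpy(cipher, "foo.conf", n);
        for (size_t i = 0; i < n; i++)
            cipher[i] ^= 0x5a;          /* "encrypted" on-disk name */

        struct dentry_disk de = { cipher, n };
        printf("%d %d\n", match(1, "foo.conf", n, &de),
               match(1, "bar.conf", n, &de));
        return 0;
    }
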
Change-Id: Iace4aac9931d7ddeafe494c8d5ab09c6d356f0b1 Signed-off-by: Uday Savagaonkar Signed-off-by: Ildar Muslukhov Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 248 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 204 insertions(+), 44 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 346f02a09cf312..a2c3a13e696f64 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -254,8 +254,9 @@ static struct dx_frame *dx_probe(const struct qstr *d_name, struct dx_hash_info *hinfo, struct dx_frame *frame); static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, + struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, struct dx_map_entry *offsets, int count, unsigned blocksize); @@ -969,7 +970,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; - struct ext4_str tmp_str; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -981,6 +983,24 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Check if the directory is encrypted */ + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + brelse(bh); + return err; + } + if (ctx != NULL) { + err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (err < 0) { + ext4_put_fname_crypto_ctx(&ctx); + brelse(bh); + return err; + } + } +#endif for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, @@ -989,24 +1009,52 @@ static int htree_dirblock_to_tree(struct file *dir_file, /* silently ignore the rest of the block */ break; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + err = ext4_fname_disk_to_hash(ctx, de, hinfo); + if (err < 0) { + count = err; + goto errout; + } +#else ext4fs_dirhash(de->name, de->name_len, hinfo); +#endif if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; - tmp_str.name = de->name; - tmp_str.len = de->name_len; - err = ext4_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de, &tmp_str); + if (ctx == NULL) { + /* Directory is not encrypted */ + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &tmp_str); + } else { + /* Directory is encrypted */ + err = ext4_fname_disk_to_usr(ctx, de, + &fname_crypto_str); + if (err < 0) { + count = err; + goto errout; + } + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &fname_crypto_str); + } if (err != 0) { - brelse(bh); - return err; + count = err; + goto errout; } count++; } +errout: brelse(bh); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ext4_put_fname_crypto_ctx(&ctx); + 
ext4_fname_crypto_free_buffer(&fname_crypto_str); +#endif return count; } @@ -1139,17 +1187,33 @@ static inline int search_dirblock(struct buffer_head *bh, * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, +static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_fname_crypto_ctx *ctx = NULL; + int err; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); +#endif while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + err = ext4_fname_disk_to_hash(ctx, de, &h); + if (err < 0) { + ext4_put_fname_crypto_ctx(&ctx); + return err; + } +#else ext4fs_dirhash(de->name, de->name_len, &h); +#endif map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1160,6 +1224,9 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, /* XXX: do we need to check rec_len == 0 case? -Chris */ de = ext4_next_entry(de, blocksize); } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ext4_put_fname_crypto_ctx(&ctx); +#endif return count; } @@ -1210,57 +1277,107 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * `len <= EXT4_NAME_LEN' is guaranteed by caller. * `de != NULL' is guaranteed by caller. */ -static inline int ext4_match (int len, const char * const name, - struct ext4_dir_entry_2 * de) +static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, + struct ext4_str *fname_crypto_str, + int len, const char * const name, + struct ext4_dir_entry_2 *de) { - if (len != de->name_len) - return 0; + int res; + if (!de->inode) return 0; - return !memcmp(name, de->name, len); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ctx) { + /* Directory is encrypted */ + res = ext4_fname_disk_to_usr(ctx, de, fname_crypto_str); + if (res < 0) + return res; + if (len != res) + return 0; + res = memcmp(name, fname_crypto_str->name, len); + return (res == 0) ? 1 : 0; + } +#endif + if (len != de->name_len) + return 0; + res = memcmp(name, de->name, len); + return (res == 0) ? 
1 : 0; } /* * Returns 0 if not found, -1 on failure, and 1 on success */ -int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir) +int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + struct inode *dir, const struct qstr *d_name, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; const char *name = d_name->name; int namelen = d_name->len; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + int res; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return -1; + + if (ctx != NULL) { + /* Allocate buffer to hold maximum name length */ + res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + return -1; + } + } de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ + if ((char *) de + de->name_len <= dlimit) { + res = ext4_match(ctx, &fname_crypto_str, namelen, + name, de); + if (res < 0) { + res = -1; + goto return_result; + } + if (res > 0) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, + bh->b_size, offset)) { + res = -1; + goto return_result; + } + *res_dir = de; + res = 1; + goto return_result; + } - if ((char *) de + namelen <= dlimit && - ext4_match (namelen, name, de)) { - /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) - return -1; - *res_dir = de; - return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (de_len <= 0) - return -1; + if (de_len <= 0) { + res = -1; + goto return_result; + } offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } - return 0; + + res = 0; +return_result: + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return res; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, @@ -1449,6 +1566,9 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q ext4_lblk_t block; int retval; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + *res_dir = NULL; +#endif frame = dx_probe(d_name, dir, &hinfo, frames); if (IS_ERR(frame)) return (struct buffer_head *) frame; @@ -1657,7 +1777,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map((struct ext4_dir_entry_2 *) data1, + count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1, blocksize, hinfo, map); map -= count; dx_sort_map(map, count); @@ -1680,7 +1800,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); + de2 = dx_move_dirents(data1, data2, map + split, count - split, + blocksize); de = dx_pack_dirents(data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, @@ -1736,15 +1857,48 @@ int ext4_find_dest_de(struct inode *dir, struct 
inode *inode, int nlen, rlen; unsigned int offset = 0; char *top; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + int res; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return -1; + + if (ctx != NULL) { + /* Calculate record length needed to store the entry */ + res = ext4_fname_crypto_namelen_on_disk(ctx, namelen); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + return res; + } + reclen = EXT4_DIR_REC_LEN(res); + + /* Allocate buffer to hold maximum name length */ + res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + return -1; + } + } de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, - buf, buf_size, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; + buf, buf_size, offset)) { + res = -EIO; + goto return_result; + } + /* Provide crypto context and crypto buffer to ext4 match */ + res = ext4_match(ctx, &fname_crypto_str, namelen, name, de); + if (res < 0) + goto return_result; + if (res > 0) { + res = -EEXIST; + goto return_result; + } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) @@ -1752,11 +1906,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } - if ((char *) de > top) - return -ENOSPC; - *dest_de = de; - return 0; + if ((char *) de > top) + res = -ENOSPC; + else { + *dest_de = de; + res = 0; + } +return_result: + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return res; } int ext4_insert_dentry(struct inode *dir, From 4d7ab2a5e19801287ec1a0a68224ed505840ce69 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 330/420] ext4 crypto: enable filename encryption Change-Id: I25e65769ac98e5c0aa0d1a8b9ad765fcd300e8eb Signed-off-by: Uday Savagaonkar Signed-off-by: Ildar Muslukhov Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 64 ++++++++++++++++++++++++++++++++++++------------ fs/ext4/ialloc.c | 21 ++++++++++++++-- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f67f9559160d94..2b6e0c84a91194 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -110,7 +110,10 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int err; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL; int dir_has_error = 0; + struct ext4_fname_crypto_ctx *enc_ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); @@ -127,17 +130,28 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) if (ext4_has_inline_data(inode)) { int has_inline_data = 1; - int ret = ext4_read_inline_dir(file, ctx, + err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) - return ret; + return err; + } + + enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN); + if (IS_ERR(enc_ctx)) + return PTR_ERR(enc_ctx); + if (enc_ctx) { + err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (err < 0) { + ext4_put_fname_crypto_ctx(&enc_ctx); + return err; + } } 
offset = ctx->pos & (sb->s_blocksize - 1); while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; - struct buffer_head *bh = NULL; map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; @@ -180,6 +194,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); + bh = NULL; continue; } set_buffer_verified(bh); @@ -226,25 +241,44 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - if (!dir_emit(ctx, de->name, - de->name_len, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type))) { - brelse(bh); - return 0; + if (enc_ctx == NULL) { + /* Directory is not encrypted */ + if (!dir_emit(ctx, de->name, + de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; + } else { + /* Directory is encrypted */ + err = ext4_fname_disk_to_usr(enc_ctx, + de, &fname_crypto_str); + if (err < 0) + goto errout; + if (!dir_emit(ctx, + fname_crypto_str.name, err, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } - offset = 0; + if ((ctx->pos < inode->i_size) && !dir_relax(inode)) + goto done; brelse(bh); - if (ctx->pos < inode->i_size) { - if (!dir_relax(inode)) - return 0; - } + bh = NULL; + offset = 0; } - return 0; +done: + err = 0; +errout: +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ext4_put_fname_crypto_ctx(&enc_ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); +#endif + brelse(bh); + return err; } static inline int is_32bit_api(void) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e554ca344047c3..8f37c9ea6d5537 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1034,11 +1034,28 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; - +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) && + (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) { + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + } else { + /* Inline data and encryption are incompatible + * We turn off inline data since encryption is enabled */ + ei->i_inline_off = 1; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + } +#else ei->i_inline_off = 0; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - +#endif ret = inode; err = dquot_alloc_inode(inode); if (err) From 587acfc52cd5125c356188efea66c7aced06c12c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 331/420] ext4 crypto: Add symlink encryption Change-Id: Ia3a2f08b64023cdaaa8bed57a8ae19913ea5729a Signed-off-by: Uday Savagaonkar Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/ext4_crypto.h | 20 +++++++++ fs/ext4/inode.c | 5 ++- fs/ext4/namei.c | 85 +++++++++++++++++++++++++++++--------- fs/ext4/symlink.c | 96 ++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 184 insertions(+), 23 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0c4e5a70dda436..aa589414a1946e 100644 --- 
a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2226,6 +2226,7 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *, unsigned long blkdev_flags); /* inode.c */ +int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index f7d46e8dc9d30c..c2ba35a914b65f 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -124,4 +124,24 @@ struct ext4_fname_crypto_ctx { unsigned ctfm_key_is_ready : 1; }; +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct ext4_encrypted_symlink_data { + __le16 len; + char encrypted_path[1]; +} __attribute__((__packed__)); + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. + */ +static inline u32 encrypted_symlink_data_len(u32 l) +{ + if (l < EXT4_CRYPTO_BLOCK_SIZE) + l = EXT4_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct ext4_encrypted_symlink_data) - 1); +} + #endif /* _EXT4_CRYPTO_H */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 13a923f9d67676..93250f40c77c64 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -141,7 +141,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, /* * Test whether an inode is a fast symlink. */ -static int ext4_inode_is_fast_symlink(struct inode *inode) +int ext4_inode_is_fast_symlink(struct inode *inode) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; @@ -4198,7 +4198,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext4_inode_is_fast_symlink(inode)) { + if (ext4_inode_is_fast_symlink(inode) && + !ext4_encrypted_inode(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a2c3a13e696f64..180231c9ba5f4d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3187,16 +3187,24 @@ static int ext4_symlink(struct inode *dir, { handle_t *handle; struct inode *inode; - int l, err, retries = 0; + int err, len = strlen(symname); int credits; + bool encryption_required; + struct ext4_str disk_link; + struct ext4_encrypted_symlink_data *sd = NULL; - l = strlen(symname)+1; - if (l > dir->i_sb->s_blocksize) + disk_link.len = len + 1; + disk_link.name = (char *) symname; + + encryption_required = ext4_encrypted_inode(dir); + if (encryption_required) + disk_link.len = encrypted_symlink_data_len(len) + 1; + if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; dquot_initialize(dir); - if (l > EXT4_N_BLOCKS * 4) { + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, @@ -3215,16 +3223,49 @@ static int ext4_symlink(struct inode *dir, credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } -retry: + inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; + if 
(IS_ERR(inode)) { + if (handle) + ext4_journal_stop(handle); + return PTR_ERR(inode); + } - if (l > EXT4_N_BLOCKS * 4) { + if (encryption_required) { + struct ext4_fname_crypto_ctx *ctx = NULL; + struct qstr istr; + struct ext4_str ostr; + + sd = kzalloc(disk_link.len, GFP_NOFS); + if (!sd) { + err = -ENOMEM; + goto err_drop_inode; + } + err = ext4_inherit_context(dir, inode); + if (err) + goto err_drop_inode; + ctx = ext4_get_fname_crypto_ctx(inode, + inode->i_sb->s_blocksize); + if (IS_ERR_OR_NULL(ctx)) { + /* We just set the policy, so ctx should not be NULL */ + err = (ctx == NULL) ? -EIO : PTR_ERR(ctx); + goto err_drop_inode; + } + istr.name = (const unsigned char *) symname; + istr.len = len; + ostr.name = sd->encrypted_path; + err = ext4_fname_usr_to_disk(ctx, &istr, &ostr); + ext4_put_fname_crypto_ctx(&ctx); + if (err < 0) + goto err_drop_inode; + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *) sd; + } + + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* @@ -3240,9 +3281,10 @@ static int ext4_symlink(struct inode *dir, drop_nlink(inode); err = ext4_orphan_add(handle, inode); ext4_journal_stop(handle); + handle = NULL; if (err) goto err_drop_inode; - err = __page_symlink(inode, symname, l, 1); + err = __page_symlink(inode, disk_link.name, disk_link.len, 1); if (err) goto err_drop_inode; /* @@ -3254,34 +3296,37 @@ static int ext4_symlink(struct inode *dir, EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); + handle = NULL; goto err_drop_inode; } set_nlink(inode, 1); err = ext4_orphan_del(handle, inode); - if (err) { - ext4_journal_stop(handle); - clear_nlink(inode); + if (err) goto err_drop_inode; - } } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - inode->i_op = &ext4_fast_symlink_inode_operations; - memcpy((char *)&EXT4_I(inode)->i_data, symname, l); - inode->i_size = l-1; + inode->i_op = encryption_required ? 
+ &ext4_symlink_inode_operations : + &ext4_fast_symlink_inode_operations; + memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, + disk_link.len); + inode->i_size = disk_link.len - 1; } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) ext4_handle_sync(handle); -out_stop: if (handle) ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) - goto retry; + kfree(sd); return err; err_drop_inode: + if (handle) + ext4_journal_stop(handle); + kfree(sd); + clear_nlink(inode); unlock_new_inode(inode); iput(inode); return err; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index ff371193201841..3f7227498f0a29 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,7 +23,96 @@ #include "ext4.h" #include "xattr.h" +#ifdef CONFIG_EXT4_FS_ENCRYPTION static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct page *cpage = NULL; + char *caddr, *paddr = NULL; + struct ext4_str cstr, pstr; + struct inode *inode = dentry->d_inode; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_encrypted_symlink_data *sd; + loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); + int res; + u32 plen, max_size = inode->i_sb->s_blocksize; + + if (!ext4_encrypted_inode(inode)) + return page_follow_link_light(dentry, nd); + + ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize); + if (IS_ERR(ctx)) + return ctx; + + if (ext4_inode_is_fast_symlink(inode)) { + caddr = (char *) EXT4_I(dentry->d_inode)->i_data; + max_size = sizeof(EXT4_I(dentry->d_inode)->i_data); + } else { + cpage = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(cpage)) { + ext4_put_fname_crypto_ctx(&ctx); + return cpage; + } + caddr = kmap(cpage); + caddr[size] = 0; + } + + /* Symlink is encrypted */ + sd = (struct ext4_encrypted_symlink_data *)caddr; + cstr.name = sd->encrypted_path; + cstr.len = le32_to_cpu(sd->len); + if ((cstr.len + + sizeof(struct ext4_encrypted_symlink_data) - 1) > + max_size) { + /* Symlink data on the disk is corrupted */ + res = -EIO; + goto errout; + } + plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ? 
+ EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len; + paddr = kmalloc(plen + 1, GFP_NOFS); + if (!paddr) { + res = -ENOMEM; + goto errout; + } + pstr.name = paddr; + res = _ext4_fname_disk_to_usr(ctx, &cstr, &pstr); + if (res < 0) + goto errout; + /* Null-terminate the name */ + if (res <= plen) + paddr[res] = '\0'; + nd_set_link(nd, paddr); + ext4_put_fname_crypto_ctx(&ctx); + if (cpage) { + kunmap(cpage); + page_cache_release(cpage); + } + return NULL; +errout: + ext4_put_fname_crypto_ctx(&ctx); + if (cpage) { + kunmap(cpage); + page_cache_release(cpage); + } + kfree(paddr); + return ERR_PTR(res); +} + +static void ext4_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + struct page *page = cookie; + + if (!page) { + kfree(nd_get_link(nd)); + } else { + kunmap(page); + page_cache_release(page); + } +} +#endif + +static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd) { struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); nd_set_link(nd, (char *) ei->i_data); @@ -32,8 +121,13 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, +#ifdef CONFIG_EXT4_FS_ENCRYPTION + .follow_link = ext4_follow_link, + .put_link = ext4_put_link, +#else .follow_link = page_follow_link_light, .put_link = page_put_link, +#endif .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -43,7 +137,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext4_follow_link, + .follow_link = ext4_follow_fast_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, From dbaee6b1ec00f3055748d3c095999b701515233d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 332/420] ext4 crypto: enable encryption feature flag Also add the test dummy encryption mode flag so we can more easily test the encryption patches using xfstests. 
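With the test_dummy_encryption mount option added below, per-inode key derivation is short-circuited: instead of looking up a master key in the session keyring and deriving the inode key from it, the key is filled with a constant 0x42 pattern (see the crypto_key.c hunk), so xfstests can exercise the encryption paths without provisioning a real key. A rough userspace sketch of that gate; the 64-byte size is assumed to mirror EXT4_AES_256_XTS_KEY_SIZE, and derive_real_key() is only a placeholder for the keyring lookup and AES-based derivation.

    #include <stdio.h>
    #include <string.h>

    #define AES_256_XTS_KEY_SIZE 64 /* assumed, mirrors EXT4_AES_256_XTS_KEY_SIZE */

    struct toy_key {
        unsigned char raw[AES_256_XTS_KEY_SIZE];
        unsigned int size;
    };

    /* placeholder for the real path: keyring lookup + AES key derivation */
    static int derive_real_key(struct toy_key *key)
    {
        (void) key;
        return -1;                  /* pretend no key is provisioned */
    }

    static int generate_key(struct toy_key *key, int test_dummy_encryption)
    {
        key->size = AES_256_XTS_KEY_SIZE;
        if (test_dummy_encryption) {
            /* same idea as the crypto_key.c hunk: constant test key */
            memset(key->raw, 0x42, sizeof(key->raw));
            return 0;
        }
        return derive_real_key(key);
    }

    int main(void)
    {
        struct toy_key key;

        printf("dummy: %d, real: %d\n",
               generate_key(&key, 1), generate_key(&key, 0));
        return 0;
    }
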
Change-Id: I30b08de00e5fbf2efcc13d2e8ffb0f625a21e4ce Signed-off-by: Michael Halcrow Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_key.c | 27 +++++++++++++++------------ fs/ext4/crypto_policy.c | 18 +++++++++++++++--- fs/ext4/ext4.h | 17 +++++++++++++---- fs/ext4/ialloc.c | 3 ++- fs/ext4/namei.c | 9 ++++++--- fs/ext4/super.c | 29 ++++++++++++++++++++++++++++- 6 files changed, 79 insertions(+), 24 deletions(-) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 572bd97f58dd60..c8392af8abbbbd 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -98,6 +98,7 @@ int ext4_generate_encryption_key(struct inode *inode) struct ext4_encryption_key *master_key; struct ext4_encryption_context ctx; struct user_key_payload *ukp; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx)); @@ -109,6 +110,20 @@ int ext4_generate_encryption_key(struct inode *inode) } res = 0; + if (S_ISREG(inode->i_mode)) + crypt_key->mode = ctx.contents_encryption_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + crypt_key->mode = ctx.filenames_encryption_mode; + else { + printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); + BUG(); + } + crypt_key->size = ext4_encryption_key_size(crypt_key->mode); + BUG_ON(!crypt_key->size); + if (DUMMY_ENCRYPTION_ENABLED(sbi)) { + memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + goto out; + } memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, EXT4_KEY_DESC_PREFIX_SIZE); sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE, @@ -129,21 +144,9 @@ int ext4_generate_encryption_key(struct inode *inode) goto out; } master_key = (struct ext4_encryption_key *)ukp->data; - - if (S_ISREG(inode->i_mode)) - crypt_key->mode = ctx.contents_encryption_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - crypt_key->mode = ctx.filenames_encryption_mode; - else { - printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); - BUG(); - } - crypt_key->size = ext4_encryption_key_size(crypt_key->mode); - BUG_ON(!crypt_key->size); BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); - BUG_ON(crypt_key->size < EXT4_AES_256_CBC_KEY_SIZE); res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw); out: if (keyring_key) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 749ed6e91e50b6..30eaf9e9864a96 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -169,13 +169,25 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx)); - if (res != sizeof(ctx)) - return -ENOENT; - + if (res != sizeof(ctx)) { + if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + ctx.contents_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + memset(ctx.master_key_descriptor, 0x42, + EXT4_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + goto out; + } + } get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); +out: if (!res) ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); return res; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index aa589414a1946e..8275763b6a266e 100644 --- a/fs/ext4/ext4.h +++ 
b/fs/ext4/ext4.h @@ -1194,8 +1194,16 @@ struct ext4_super_block { /* * run-time mount flags */ -#define EXT4_MF_MNTDIR_SAMPLED 0x0001 -#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ + EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif /* Number of quota types we support */ #define EXT4_MAXQUOTAS 2 @@ -1605,8 +1613,9 @@ static inline int ext4_encrypted_inode(struct inode *inode) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP | \ - EXT4_FEATURE_INCOMPAT_INLINE_DATA) + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8f37c9ea6d5537..12571b47b75342 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -999,7 +999,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, /* If the directory encrypted, then we should encrypt the inode. */ if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) && - ext4_encrypted_inode(dir)) + (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(sbi))) ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_set_inode_flags(inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 180231c9ba5f4d..68df8aa1b2ae1c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2578,7 +2578,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, ext4_set_aops(inode); err = 0; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (!err && ext4_encrypted_inode(dir)) { + if (!err && (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) { err = ext4_inherit_context(dir, inode); if (err) { clear_nlink(inode); @@ -2770,7 +2771,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) goto out_clear_inode; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ext4_encrypted_inode(dir)) { + if (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) { err = ext4_inherit_context(dir, inode); if (err) goto out_clear_inode; @@ -3196,7 +3198,8 @@ static int ext4_symlink(struct inode *dir, disk_link.len = len + 1; disk_link.name = (char *) symname; - encryption_required = ext4_encrypted_inode(dir); + encryption_required = (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); if (encryption_required) disk_link.len = encrypted_symlink_data_len(len) + 1; if (disk_link.len > dir->i_sb->s_blocksize) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 66757f0193e437..a1490673e46b54 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1141,7 +1141,7 @@ enum { Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, + Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, 
Opt_err, @@ -1227,6 +1227,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1425,6 +1426,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, + {Opt_test_dummy_encryption, 0, MOPT_GTE0}, {Opt_err, 0, 0} }; @@ -1595,6 +1597,15 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + } else if (token == Opt_test_dummy_encryption) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mode enabled"); +#else + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mount option ignored"); +#endif } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) @@ -2681,11 +2692,13 @@ static struct attribute *ext4_attrs[] = { EXT4_INFO_ATTR(lazy_itable_init); EXT4_INFO_ATTR(batched_discard); EXT4_INFO_ATTR(meta_bg_resize); +EXT4_INFO_ATTR(encryption); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), + ATTR_LIST(encryption), NULL, }; @@ -3669,6 +3682,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) && + es->s_encryption_level) { + ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", + es->s_encryption_level); + goto failed_mount; + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -4031,6 +4051,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } + if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) && + !(sb->s_flags & MS_RDONLY) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); + ext4_commit_super(sb, 1); + } + /* * Get the # of file system overhead blocks from the * superblock if present. From df0e54e777750b83713317655887d463112c414f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 333/420] ext4 crypto: simplify and speed up filename encryption Avoid using SHA-1 when calculating the user-visible filename when the encryption key is available, and avoid decrypting lots of filenames when searching for a directory entry in a directory block. 
Change-Id: Iaa566fdbb274e6def7c70282b737a07ad5c2c50c Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_fname.c | 268 +++++++++++++++++++++-------------------- fs/ext4/dir.c | 2 +- fs/ext4/ext4.h | 9 +- fs/ext4/namei.c | 72 +---------- fs/ext4/symlink.c | 2 +- 5 files changed, 149 insertions(+), 204 deletions(-) diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index ca2f5948c1ac52..7a877e609e5fba 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -198,106 +198,57 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, return oname->len; } +static const char *lookup_table = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + /** * ext4_fname_encode_digest() - * * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. * The encoded string is roughly 4/3 times the size of the input string. */ -int ext4_fname_encode_digest(char *dst, char *src, u32 len) +static int digest_encode(const char *src, int len, char *dst) { - static const char *lookup_table = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_+"; - u32 current_chunk, num_chunks, i; - char tmp_buf[3]; - u32 c0, c1, c2, c3; - - current_chunk = 0; - num_chunks = len/3; - for (i = 0; i < num_chunks; i++) { - c0 = src[3*i] & 0x3f; - c1 = (((src[3*i]>>6)&0x3) | ((src[3*i+1] & 0xf)<<2)) & 0x3f; - c2 = (((src[3*i+1]>>4)&0xf) | ((src[3*i+2] & 0x3)<<4)) & 0x3f; - c3 = (src[3*i+2]>>2) & 0x3f; - dst[4*i] = lookup_table[c0]; - dst[4*i+1] = lookup_table[c1]; - dst[4*i+2] = lookup_table[c2]; - dst[4*i+3] = lookup_table[c3]; - } - if (i*3 < len) { - memset(tmp_buf, 0, 3); - memcpy(tmp_buf, &src[3*i], len-3*i); - c0 = tmp_buf[0] & 0x3f; - c1 = (((tmp_buf[0]>>6)&0x3) | ((tmp_buf[1] & 0xf)<<2)) & 0x3f; - c2 = (((tmp_buf[1]>>4)&0xf) | ((tmp_buf[2] & 0x3)<<4)) & 0x3f; - c3 = (tmp_buf[2]>>2) & 0x3f; - dst[4*i] = lookup_table[c0]; - dst[4*i+1] = lookup_table[c1]; - dst[4*i+2] = lookup_table[c2]; - dst[4*i+3] = lookup_table[c3]; + int i = 0, bits = 0, ac = 0; + char *cp = dst; + + while (i < len) { + ac += (((unsigned char) src[i]) << bits); + bits += 8; + do { + *cp++ = lookup_table[ac & 0x3f]; + ac >>= 6; + bits -= 6; + } while (bits >= 6); i++; } - return (i * 4); + if (bits) + *cp++ = lookup_table[ac & 0x3f]; + return cp - dst; } -/** - * ext4_fname_hash() - - * - * This function computes the hash of the input filename, and sets the output - * buffer to the *encoded* digest. It returns the length of the digest as its - * return value. Errors are returned as negative numbers. We trust the caller - * to allocate sufficient memory to oname string. 
- */ -static int ext4_fname_hash(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_str *iname, - struct ext4_str *oname) +static int digest_decode(const char *src, int len, char *dst) { - struct scatterlist sg; - struct hash_desc desc = { - .tfm = (struct crypto_hash *)ctx->htfm, - .flags = CRYPTO_TFM_REQ_MAY_SLEEP - }; - int res = 0; - - if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { - res = ext4_fname_encode_digest(oname->name, iname->name, - iname->len); - oname->len = res; - return res; - } - - sg_init_one(&sg, iname->name, iname->len); - res = crypto_hash_init(&desc); - if (res) { - printk(KERN_ERR - "%s: Error initializing crypto hash; res = [%d]\n", - __func__, res); - goto out; - } - res = crypto_hash_update(&desc, &sg, iname->len); - if (res) { - printk(KERN_ERR - "%s: Error updating crypto hash; res = [%d]\n", - __func__, res); - goto out; - } - res = crypto_hash_final(&desc, - &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE]); - if (res) { - printk(KERN_ERR - "%s: Error finalizing crypto hash; res = [%d]\n", - __func__, res); - goto out; + int i = 0, bits = 0, ac = 0; + const char *p; + char *cp = dst; + + while (i < len) { + p = strchr(lookup_table, src[i]); + if (p == NULL || src[i] == 0) + return -2; + ac += (p - lookup_table) << bits; + bits += 6; + if (bits >= 8) { + *cp++ = ac & 0xff; + ac >>= 8; + bits -= 8; + } + i++; } - /* Encode the digest as a printable string--this will increase the - * size of the digest */ - oname->name[0] = 'I'; - res = ext4_fname_encode_digest(oname->name+1, - &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE], - EXT4_FNAME_CRYPTO_DIGEST_SIZE) + 1; - oname->len = res; -out: - return res; + if (ac) + return -1; + return cp - dst; } /** @@ -571,9 +522,13 @@ void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) * ext4_fname_disk_to_usr() - converts a filename from disk space to user space */ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_str *iname, - struct ext4_str *oname) + struct dx_hash_info *hinfo, + const struct ext4_str *iname, + struct ext4_str *oname) { + char buf[24]; + int ret; + if (ctx == NULL) return -EIO; if (iname->len < 3) { @@ -587,18 +542,33 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, } if (ctx->has_valid_key) return ext4_fname_decrypt(ctx, iname, oname); - else - return ext4_fname_hash(ctx, iname, oname); + + if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { + ret = digest_encode(iname->name, iname->len, oname->name); + oname->len = ret; + return ret; + } + if (hinfo) { + memcpy(buf, &hinfo->hash, 4); + memcpy(buf+4, &hinfo->minor_hash, 4); + } else + memset(buf, 0, 8); + memcpy(buf + 8, iname->name + iname->len - 16, 16); + oname->name[0] = '_'; + ret = digest_encode(buf, 24, oname->name+1); + oname->len = ret + 1; + return ret + 1; } int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname) { struct ext4_str iname = {.name = (unsigned char *) de->name, .len = de->name_len }; - return _ext4_fname_disk_to_usr(ctx, &iname, oname); + return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname); } @@ -640,10 +610,11 @@ int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, const struct qstr *iname, struct dx_hash_info *hinfo) { - struct ext4_str tmp, tmp2; + struct ext4_str tmp; int ret = 0; + char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1]; - if (!ctx || !ctx->has_valid_key || + if (!ctx || ((iname->name[0] == '.') && ((iname->len == 1) || ((iname->name[1] == '.') && 
(iname->len == 2))))) { @@ -651,59 +622,90 @@ int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, return 0; } + if (!ctx->has_valid_key && iname->name[0] == '_') { + if (iname->len != 33) + return -ENOENT; + ret = digest_decode(iname->name+1, iname->len, buf); + if (ret != 24) + return -ENOENT; + memcpy(&hinfo->hash, buf, 4); + memcpy(&hinfo->minor_hash, buf + 4, 4); + return 0; + } + + if (!ctx->has_valid_key && iname->name[0] != '_') { + if (iname->len > 43) + return -ENOENT; + ret = digest_decode(iname->name, iname->len, buf); + ext4fs_dirhash(buf, ret, hinfo); + return 0; + } + /* First encrypt the plaintext name */ ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); if (ret < 0) return ret; ret = ext4_fname_encrypt(ctx, iname, &tmp); - if (ret < 0) - goto out; - - tmp2.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1; - tmp2.name = kmalloc(tmp2.len + 1, GFP_KERNEL); - if (tmp2.name == NULL) { - ret = -ENOMEM; - goto out; + if (ret >= 0) { + ext4fs_dirhash(tmp.name, tmp.len, hinfo); + ret = 0; } - ret = ext4_fname_hash(ctx, &tmp, &tmp2); - if (ret > 0) - ext4fs_dirhash(tmp2.name, tmp2.len, hinfo); - ext4_fname_crypto_free_buffer(&tmp2); -out: ext4_fname_crypto_free_buffer(&tmp); return ret; } -/** - * ext4_fname_disk_to_htree() - converts a filename from disk space to htree-access string - */ -int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_dir_entry_2 *de, - struct dx_hash_info *hinfo) +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de) { - struct ext4_str iname = {.name = (unsigned char *) de->name, - .len = de->name_len}; - struct ext4_str tmp; - int ret; + int ret = -ENOENT; + int bigname = (*name == '_'); - if (!ctx || - ((iname.name[0] == '.') && - ((iname.len == 1) || - ((iname.name[1] == '.') && (iname.len == 2))))) { - ext4fs_dirhash(iname.name, iname.len, hinfo); - return 0; + if (ctx->has_valid_key) { + if (cstr->name == NULL) { + struct qstr istr; + + ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr); + if (ret < 0) + goto errout; + istr.name = name; + istr.len = len; + ret = ext4_fname_encrypt(ctx, &istr, cstr); + if (ret < 0) + goto errout; + } + } else { + if (cstr->name == NULL) { + cstr->name = kmalloc(32, GFP_KERNEL); + if (cstr->name == NULL) + return -ENOMEM; + if ((bigname && (len != 33)) || + (!bigname && (len > 43))) + goto errout; + ret = digest_decode(name+bigname, len-bigname, + cstr->name); + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + cstr->len = ret; + } + if (bigname) { + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + cstr->name + 8, 16); + return (ret == 0) ? 1 : 0; + } } - - tmp.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1; - tmp.name = kmalloc(tmp.len + 1, GFP_KERNEL); - if (tmp.name == NULL) - return -ENOMEM; - - ret = ext4_fname_hash(ctx, &iname, &tmp); - if (ret > 0) - ext4fs_dirhash(tmp.name, tmp.len, hinfo); - ext4_fname_crypto_free_buffer(&tmp); + if (de->name_len != cstr->len) + return 0; + ret = memcmp(de->name, cstr->name, cstr->len); + return (ret == 0) ? 
1 : 0; +errout: + kfree(cstr->name); + cstr->name = NULL; return ret; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2b6e0c84a91194..7551f84f4e53cf 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -251,7 +251,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } else { /* Directory is encrypted */ err = ext4_fname_disk_to_usr(enc_ctx, - de, &fname_crypto_str); + NULL, de, &fname_crypto_str); if (err < 0) goto errout; if (!dir_emit(ctx, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8275763b6a266e..908efb8c6789d5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2085,9 +2085,11 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, u32 ilen, struct ext4_str *crypto_str); int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname); int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname); int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, @@ -2096,11 +2098,12 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, const struct qstr *iname, struct dx_hash_info *hinfo); -int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_dir_entry_2 *de, - struct dx_hash_info *hinfo); int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, u32 namelen); +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de); + #ifdef CONFIG_EXT4_FS_ENCRYPTION void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 68df8aa1b2ae1c..ca56346f004e21 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -641,7 +641,7 @@ static struct stats dx_show_leaf(struct inode *dir, ext4_put_fname_crypto_ctx(&ctx); ctx = NULL; } - res = ext4_fname_disk_to_usr(ctx, de, + res = ext4_fname_disk_to_usr(ctx, NULL, de, &fname_crypto_str); if (res < 0) { printk(KERN_WARNING "Error " @@ -654,15 +654,8 @@ static struct stats dx_show_leaf(struct inode *dir, name = fname_crypto_str.name; len = fname_crypto_str.len; } - res = ext4_fname_disk_to_hash(ctx, de, - &h); - if (res < 0) { - printk(KERN_WARNING "Error " - "converting filename " - "from disk to htree" - "\n"); - h.hash = 0xDEADBEEF; - } + ext4fs_dirhash(de->name, de->name_len, + &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); @@ -1009,15 +1002,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, /* silently ignore the rest of the block */ break; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - err = ext4_fname_disk_to_hash(ctx, de, hinfo); - if (err < 0) { - count = err; - goto errout; - } -#else ext4fs_dirhash(de->name, de->name_len, hinfo); -#endif if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) @@ -1033,7 +1018,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, &tmp_str); } else { /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(ctx, de, + err = ext4_fname_disk_to_usr(ctx, hinfo, de, &fname_crypto_str); if (err < 0) { count = err; @@ -1194,26 +1179,10 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; -#ifdef 
CONFIG_EXT4_FS_ENCRYPTION - struct ext4_fname_crypto_ctx *ctx = NULL; - int err; - - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); -#endif while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - err = ext4_fname_disk_to_hash(ctx, de, &h); - if (err < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return err; - } -#else ext4fs_dirhash(de->name, de->name_len, &h); -#endif map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1224,9 +1193,6 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, /* XXX: do we need to check rec_len == 0 case? -Chris */ de = ext4_next_entry(de, blocksize); } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&ctx); -#endif return count; } @@ -1288,16 +1254,8 @@ static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, return 0; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ctx) { - /* Directory is encrypted */ - res = ext4_fname_disk_to_usr(ctx, de, fname_crypto_str); - if (res < 0) - return res; - if (len != res) - return 0; - res = memcmp(name, fname_crypto_str->name, len); - return (res == 0) ? 1 : 0; - } + if (ctx) + return ext4_fname_match(ctx, fname_crypto_str, len, name, de); #endif if (len != de->name_len) return 0; @@ -1325,16 +1283,6 @@ int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, if (IS_ERR(ctx)) return -1; - if (ctx != NULL) { - /* Allocate buffer to hold maximum name length */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -1; - } - } - de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { @@ -1873,14 +1821,6 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return res; } reclen = EXT4_DIR_REC_LEN(res); - - /* Allocate buffer to hold maximum name length */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -1; - } } de = (struct ext4_dir_entry_2 *)buf; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 3f7227498f0a29..482b244d45464a 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -75,7 +75,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) goto errout; } pstr.name = paddr; - res = _ext4_fname_disk_to_usr(ctx, &cstr, &pstr); + res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); if (res < 0) goto errout; /* Null-terminate the name */ From 63db658a03d99e5eadd386c0c54e7c3ac988bd4e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 334/420] ext4 crypto: add padding to filenames before encrypting This obscures the length of the filenames, to decrease the amount of information leakage. By default, we pad the filenames to the next 4 byte boundaries. This costs nothing, since the directory entries are aligned to 4 byte boundaries anyway. Filenames can also be padded to 8, 16, or 32 bytes, which will consume more directory space. 
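In the code the chosen policy lives in the low two bits of the per-inode flags: the ciphertext length is first raised to EXT4_CRYPTO_BLOCK_SIZE and then rounded up to 4 << (flags & EXT4_POLICY_FLAGS_PAD_MASK), i.e. 4, 8, 16 or 32 bytes, before being capped at the context's maximum length. A minimal userspace sketch of that length calculation follows; round_up_to() is a stand-in for ext4_fname_crypto_round_up(), which is assumed to round its argument up to the next multiple of the padding, and the cipher block size is assumed to be 16.

/* Sketch of the padded on-disk name length calculation added by this
 * patch.  round_up_to() stands in for ext4_fname_crypto_round_up(); the
 * cap at the context's maximum length (ctx->lim) is omitted here. */
#include <stdio.h>

#define EXT4_CRYPTO_BLOCK_SIZE		16
#define EXT4_POLICY_FLAGS_PAD_MASK	0x03

static unsigned int round_up_to(unsigned int size, unsigned int pad)
{
	return (size + pad - 1) / pad * pad;
}

static unsigned int padded_name_len(unsigned int name_len, unsigned int flags)
{
	unsigned int padding = 4U << (flags & EXT4_POLICY_FLAGS_PAD_MASK);
	unsigned int len = name_len;

	if (len < EXT4_CRYPTO_BLOCK_SIZE)	/* CTS needs one full cipher block */
		len = EXT4_CRYPTO_BLOCK_SIZE;
	return round_up_to(len, padding);
}

int main(void)
{
	unsigned int flags;

	/* A 21-byte filename under the four defined padding policies. */
	for (flags = 0; flags <= 3; flags++)
		printf("pad %2u bytes -> on-disk name length %u\n",
		       4U << flags, padded_name_len(21, flags));
	return 0;
}

With the default policy a 21-byte name grows to a 24-byte ciphertext; the 16- and 32-byte policies trade additional directory space for leaking less length information.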
Change-Id: I20625aa7b79bbe2cd511933655aac4755fd5273d Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_fname.c | 12 ++++++++++-- fs/ext4/crypto_key.c | 1 + fs/ext4/crypto_policy.c | 14 +++++++++----- fs/ext4/ext4.h | 1 + fs/ext4/ext4_crypto.h | 11 ++++++++++- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 7a877e609e5fba..fded02f7229921 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -66,6 +66,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; struct scatterlist sg[1]; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); char *workbuf; if (iname->len <= 0 || iname->len > ctx->lim) @@ -73,6 +74,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > ctx->lim) ? ctx->lim : ciphertext_len; @@ -101,7 +103,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, /* Create encryption request */ sg_init_table(sg, 1); sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); @@ -356,6 +358,7 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( if (IS_ERR(ctx)) return ctx; + ctx->flags = ei->i_crypt_policy_flags; if (ctx->has_valid_key) { if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { printk_once(KERN_WARNING @@ -468,6 +471,7 @@ int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, u32 namelen) { u32 ciphertext_len; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); if (ctx == NULL) return -EIO; @@ -475,6 +479,7 @@ int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, return -EACCES; ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : namelen; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > ctx->lim) ? 
ctx->lim : ciphertext_len; return (int) ciphertext_len; @@ -490,10 +495,13 @@ int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, u32 ilen, struct ext4_str *crypto_str) { unsigned int olen; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); if (!ctx) return -EIO; - olen = ext4_fname_crypto_round_up(ilen, EXT4_CRYPTO_BLOCK_SIZE); + if (padding < EXT4_CRYPTO_BLOCK_SIZE) + padding = EXT4_CRYPTO_BLOCK_SIZE; + olen = ext4_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c8392af8abbbbd..52170d0b7c4036 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -110,6 +110,7 @@ int ext4_generate_encryption_key(struct inode *inode) } res = 0; + ei->i_crypt_policy_flags = ctx.flags; if (S_ISREG(inode->i_mode)) crypt_key->mode = ctx.contents_encryption_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 30eaf9e9864a96..a6d6291aea163e 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -37,6 +37,8 @@ static int ext4_is_encryption_context_consistent_with_policy( return 0; return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == + policy->flags) && (ctx.contents_encryption_mode == policy->contents_encryption_mode) && (ctx.filenames_encryption_mode == @@ -56,25 +58,25 @@ static int ext4_create_encryption_context_from_policy( printk(KERN_WARNING "%s: Invalid contents encryption mode %d\n", __func__, policy->contents_encryption_mode); - res = -EINVAL; - goto out; + return -EINVAL; } if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { printk(KERN_WARNING "%s: Invalid filenames encryption mode %d\n", __func__, policy->filenames_encryption_mode); - res = -EINVAL; - goto out; + return -EINVAL; } + if (policy->flags & ~EXT4_POLICY_FLAGS_VALID) + return -EINVAL; ctx.contents_encryption_mode = policy->contents_encryption_mode; ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); -out: if (!res) ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); return res; @@ -115,6 +117,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) policy->version = 0; policy->contents_encryption_mode = ctx.contents_encryption_mode; policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE); return 0; @@ -176,6 +179,7 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) EXT4_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; memset(ctx.master_key_descriptor, 0x42, EXT4_KEY_DESCRIPTOR_SIZE); res = 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 908efb8c6789d5..72abe2956464f9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -920,6 +920,7 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; + char i_crypt_policy_flags; /* Indicate the inline data space. 
*/ u16 i_inline_off; diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index c2ba35a914b65f..d75159c101ce33 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -20,12 +20,20 @@ struct ext4_encryption_policy { char version; char contents_encryption_mode; char filenames_encryption_mode; + char flags; char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; } __attribute__((__packed__)); #define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1 #define EXT4_KEY_DERIVATION_NONCE_SIZE 16 +#define EXT4_POLICY_FLAGS_PAD_4 0x00 +#define EXT4_POLICY_FLAGS_PAD_8 0x01 +#define EXT4_POLICY_FLAGS_PAD_16 0x02 +#define EXT4_POLICY_FLAGS_PAD_32 0x03 +#define EXT4_POLICY_FLAGS_PAD_MASK 0x03 +#define EXT4_POLICY_FLAGS_VALID 0x03 + /** * Encryption context for inode * @@ -41,7 +49,7 @@ struct ext4_encryption_context { char format; char contents_encryption_mode; char filenames_encryption_mode; - char reserved; + char flags; char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; } __attribute__((__packed__)); @@ -120,6 +128,7 @@ struct ext4_fname_crypto_ctx { struct crypto_hash *htfm; struct page *workpage; struct ext4_encryption_key key; + unsigned flags : 8; unsigned has_valid_key : 1; unsigned ctfm_key_is_ready : 1; }; From 4ceb8fc2d74995246a1f507827f37427fa097f90 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 17 Sep 2015 13:34:36 -0400 Subject: [PATCH 335/420] ext4 crypto: Do not select from EXT4_FS_ENCRYPTION This patch adds a tristate EXT4_ENCRYPTION to do the selections for EXT4_FS_ENCRYPTION because selecting from a bool causes all the selected options to be built-in, even if EXT4 itself is a module. Change-Id: I431434e48f96728efa64f324974825848f34718a Signed-off-by: Herbert Xu Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/Kconfig | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 18228c201f7f4c..024f2284d3f6c0 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -64,8 +64,8 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. -config EXT4_FS_ENCRYPTION - bool "Ext4 Encryption" +config EXT4_ENCRYPTION + tristate "Ext4 Encryption" depends on EXT4_FS select CRYPTO_AES select CRYPTO_CBC @@ -81,6 +81,11 @@ config EXT4_FS_ENCRYPTION efficient since it avoids caching the encrypted and decrypted pages in the page cache. +config EXT4_FS_ENCRYPTION + bool + default y + depends on EXT4_ENCRYPTION + config EXT4_DEBUG bool "EXT4 debugging support" depends on EXT4_FS From 3ee27f97050c9013fe771b1730a335296c41f03e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:30 -0400 Subject: [PATCH 336/420] ext4 crypto: optimize filename encryption Encrypt the filename as soon as it is passed in by the user. This avoids our needing to encrypt the filename 2 or 3 times while in the process of creating a filename. Similarly, when looking up a directory entry, encrypt the filename early, or if the encryption key is not available, base-64 decode the file name so that the hash value and the last 16 bytes of the encrypted filename are available in the new struct ext4_filename data structure.
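The new struct ext4_filename gathers the user-supplied name, the on-disk (possibly encrypted) name and the precomputed hash in one place, so each directory operation performs the encryption or base-64 decoding exactly once. The fragment below sketches the calling convention only; it is not compilable on its own, example_lookup() is not part of the patch, and the real callers are ext4_find_entry() and ext4_add_entry() in the diff that follows.

/* Fragment-style sketch of the struct ext4_filename calling convention:
 * set the name up once per operation, consume it through fname_name()
 * and fname_len(), and release any crypto buffer afterwards.
 * example_lookup() is illustrative only. */
static int example_lookup(struct inode *dir, const struct qstr *d_name)
{
	struct ext4_filename fname;
	int retval;

	/* lookup == 1: if no key is available, fall back to base-64
	 * decoding the user-supplied name instead of failing. */
	retval = ext4_fname_setup_filename(dir, d_name, 1, &fname);
	if (retval)
		return retval;

	if (fname_name(&fname))
		pr_debug("search for %u on-disk name bytes\n",
			 fname_len(&fname));
	else
		pr_debug("no key: match by hash %u and ciphertext tail\n",
			 fname.hinfo.hash);

	/* ... walk the directory blocks, testing each entry with
	 * ext4_match(&fname, de) ... */

	ext4_fname_free_filename(&fname);
	return 0;
}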
Change-Id: I6b77f282b78d53431a1bf5659cd079a52961940f Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_fname.c | 155 +++++++++------------ fs/ext4/ext4.h | 63 ++++++--- fs/ext4/inline.c | 31 ++--- fs/ext4/namei.c | 305 ++++++++++++++++------------------------- 4 files changed, 237 insertions(+), 317 deletions(-) diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index fded02f7229921..ad5e32867de051 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -611,109 +611,82 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, return -EACCES; } -/* - * Calculate the htree hash from a filename from user space - */ -int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct qstr *iname, - struct dx_hash_info *hinfo) +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname) { - struct ext4_str tmp; - int ret = 0; - char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1]; + struct ext4_fname_crypto_ctx *ctx; + int ret = 0, bigname = 0; + + memset(fname, 0, sizeof(struct ext4_filename)); + fname->usr_fname = iname; - if (!ctx || + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + if ((ctx == NULL) || ((iname->name[0] == '.') && ((iname->len == 1) || ((iname->name[1] == '.') && (iname->len == 2))))) { - ext4fs_dirhash(iname->name, iname->len, hinfo); - return 0; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + goto out; } - - if (!ctx->has_valid_key && iname->name[0] == '_') { - if (iname->len != 33) - return -ENOENT; - ret = digest_decode(iname->name+1, iname->len, buf); - if (ret != 24) - return -ENOENT; - memcpy(&hinfo->hash, buf, 4); - memcpy(&hinfo->minor_hash, buf + 4, 4); - return 0; + if (ctx->has_valid_key) { + ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, + &fname->crypto_buf); + if (ret < 0) + goto out; + ret = ext4_fname_encrypt(ctx, iname, &fname->crypto_buf); + if (ret < 0) + goto out; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + ret = 0; + goto out; } - - if (!ctx->has_valid_key && iname->name[0] != '_') { - if (iname->len > 43) - return -ENOENT; - ret = digest_decode(iname->name, iname->len, buf); - ext4fs_dirhash(buf, ret, hinfo); - return 0; + if (!lookup) { + ret = -EACCES; + goto out; } - /* First encrypt the plaintext name */ - ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); - if (ret < 0) - return ret; - - ret = ext4_fname_encrypt(ctx, iname, &tmp); - if (ret >= 0) { - ext4fs_dirhash(tmp.name, tmp.len, hinfo); - ret = 0; + /* We don't have the key and we are doing a lookup; decode the + * user-supplied name + */ + if (iname->name[0] == '_') + bigname = 1; + if ((bigname && (iname->len != 33)) || + (!bigname && (iname->len > 43))) { + ret = -ENOENT; } - - ext4_fname_crypto_free_buffer(&tmp); + fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + if (fname->crypto_buf.name == NULL) { + ret = -ENOMEM; + goto out; + } + ret = digest_decode(iname->name + bigname, iname->len - bigname, + fname->crypto_buf.name); + if (ret < 0) { + ret = -ENOENT; + goto out; + } + fname->crypto_buf.len = ret; + if (bigname) { + memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4); + memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4); + } else { + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + } + ret = 0; +out: + ext4_put_fname_crypto_ctx(&ctx); 
return ret; } -int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, - int len, const char * const name, - struct ext4_dir_entry_2 *de) +void ext4_fname_free_filename(struct ext4_filename *fname) { - int ret = -ENOENT; - int bigname = (*name == '_'); - - if (ctx->has_valid_key) { - if (cstr->name == NULL) { - struct qstr istr; - - ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr); - if (ret < 0) - goto errout; - istr.name = name; - istr.len = len; - ret = ext4_fname_encrypt(ctx, &istr, cstr); - if (ret < 0) - goto errout; - } - } else { - if (cstr->name == NULL) { - cstr->name = kmalloc(32, GFP_KERNEL); - if (cstr->name == NULL) - return -ENOMEM; - if ((bigname && (len != 33)) || - (!bigname && (len > 43))) - goto errout; - ret = digest_decode(name+bigname, len-bigname, - cstr->name); - if (ret < 0) { - ret = -ENOENT; - goto errout; - } - cstr->len = ret; - } - if (bigname) { - if (de->name_len < 16) - return 0; - ret = memcmp(de->name + de->name_len - 16, - cstr->name + 8, 16); - return (ret == 0) ? 1 : 0; - } - } - if (de->name_len != cstr->len) - return 0; - ret = memcmp(de->name, cstr->name, cstr->len); - return (ret == 0) ? 1 : 0; -errout: - kfree(cstr->name); - cstr->name = NULL; - return ret; + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 72abe2956464f9..84f5b53889fb2f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1836,6 +1836,17 @@ struct dx_hash_info */ #define HASH_NB_ALWAYS 1 +struct ext4_filename { + const struct qstr *usr_fname; + struct ext4_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_str crypto_buf; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory @@ -2096,21 +2107,16 @@ int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, const struct qstr *iname, struct ext4_str *oname); -int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct qstr *iname, - struct dx_hash_info *hinfo); int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, u32 namelen); -int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, - int len, const char * const name, - struct ext4_dir_entry_2 *de); - - #ifdef CONFIG_EXT4_FS_ENCRYPTION void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, u32 max_len); void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); +void ext4_fname_free_filename(struct ext4_filename *fname); #else static inline void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { } @@ -2121,6 +2127,16 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, return NULL; } static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + return 0; +} +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } #endif 
@@ -2154,14 +2170,13 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const struct qstr *iname, - const char *name, int namelen); + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, @@ -2316,13 +2331,14 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); -extern int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, @@ -2768,7 +2784,9 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); -extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, @@ -2782,6 +2800,7 @@ extern int htree_inlinedir_to_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1805b51e2a0d58..6990a6fb42ddfc 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -991,20 +991,18 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, * and -EEXIST if directory entry already exists. 
*/ static int ext4_add_dirent_to_inline(handle_t *handle, + struct ext4_filename *fname, struct dentry *dentry, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, - inline_start, inline_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + inline_size, fname, &de); if (err) return err; @@ -1012,8 +1010,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name, - name, namelen); + ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); @@ -1244,8 +1241,8 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. */ -int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { int ret, inline_size; void *inline_start; @@ -1264,7 +1261,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1285,8 +1282,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, - inline_start, inline_size); + ret = ext4_add_dirent_to_inline(handle, fname, dentry, + inode, &iloc, inline_start, + inline_size); if (ret != -ENOSPC) goto out; @@ -1607,6 +1605,7 @@ int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, } struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) @@ -1628,8 +1627,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) @@ -1641,8 +1640,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ca56346f004e21..6ac22f774eb3d5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -249,7 +249,7 @@ static void dx_set_count(struct dx_entry *entries, unsigned value); static void dx_set_limit(struct dx_entry 
*entries, unsigned value); static unsigned dx_root_limit(struct inode *dir, unsigned infosize); static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(const struct qstr *d_name, +static struct dx_frame *dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame); @@ -268,10 +268,10 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, - const struct qstr *d_name, + struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -725,7 +725,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. */ static struct dx_frame * -dx_probe(const struct qstr *d_name, struct inode *dir, +dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect; @@ -747,32 +747,14 @@ dx_probe(const struct qstr *d_name, struct inode *dir, root->info.hash_version); goto fail; } + if (fname) + hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (d_name) { - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; - - /* Check if the directory is encrypted */ - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - ret_err = ERR_PTR(PTR_ERR(ctx)); - goto fail; - } - res = ext4_fname_usr_to_hash(ctx, d_name, hinfo); - if (res < 0) { - ret_err = ERR_PTR(res); - goto fail; - } - ext4_put_fname_crypto_ctx(&ctx); - } -#else - if (d_name) - ext4fs_dirhash(d_name->name, d_name->len, hinfo); -#endif + if (fname && fname_name(fname)) + ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -1156,12 +1138,13 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { - return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - d_name, offset, res_dir); + return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + fname, d_name, offset, res_dir); } /* @@ -1243,54 +1226,54 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * `len <= EXT4_NAME_LEN' is guaranteed by caller. * `de != NULL' is guaranteed by caller. 
*/ -static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, - struct ext4_str *fname_crypto_str, - int len, const char * const name, +static inline int ext4_match(struct ext4_filename *fname, struct ext4_dir_entry_2 *de) { - int res; + const void *name = fname_name(fname); + u32 len = fname_len(fname); if (!de->inode) return 0; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ctx) - return ext4_fname_match(ctx, fname_crypto_str, len, name, de); + if (unlikely(!name)) { + if (fname->usr_fname->name[0] == '_') { + int ret; + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + fname->crypto_buf.name + 8, 16); + return (ret == 0) ? 1 : 0; + } + name = fname->crypto_buf.name; + len = fname->crypto_buf.len; + } #endif - if (len != de->name_len) + if (de->name_len != len) return 0; - res = memcmp(name, de->name, len); - return (res == 0) ? 1 : 0; + return (memcmp(de->name, name, len) == 0) ? 1 : 0; } /* * Returns 0 if not found, -1 on failure, and 1 on success */ -int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, - struct inode *dir, const struct qstr *d_name, - unsigned int offset, struct ext4_dir_entry_2 **res_dir) +int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + struct inode *dir, struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = d_name->name; - int namelen = d_name->len; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; int res; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -1; - de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if ((char *) de + de->name_len <= dlimit) { - res = ext4_match(ctx, &fname_crypto_str, namelen, - name, de); + res = ext4_match(fname, de); if (res < 0) { res = -1; goto return_result; @@ -1323,8 +1306,6 @@ int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, res = 0; return_result: - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); return res; } @@ -1371,7 +1352,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, namelen; + int i, namelen, retval; + struct ext4_filename fname; *res_dir = NULL; sb = dir->i_sb; @@ -1379,14 +1361,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (namelen > EXT4_NAME_LEN) return NULL; + retval = ext4_fname_setup_filename(dir, d_name, 1, &fname); + if (retval) + return ERR_PTR(retval); + if (ext4_has_inline_data(dir)) { int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, d_name, res_dir, + ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir, &has_inline_data); if (has_inline_data) { if (inlined) *inlined = 1; - return ret; + goto cleanup_and_exit; } } @@ -1401,14 +1387,14 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto restart; } if (is_dx(dir)) { - bh = ext4_dx_find_entry(dir, d_name, res_dir); + ret = ext4_dx_find_entry(dir, &fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. 
*/ - if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) - return bh; + if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + goto cleanup_and_exit; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); } @@ -1439,8 +1425,10 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, num++; bh = ext4_getblk(NULL, dir, b++, 0); if (unlikely(IS_ERR(bh))) { - if (ra_max == 0) - return bh; + if (ra_max == 0) { + ret = bh; + goto cleanup_and_exit; + } break; } bh_use[ra_max] = bh; @@ -1470,7 +1458,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto next; } set_buffer_verified(bh); - i = search_dirblock(bh, dir, d_name, + i = search_dirblock(bh, dir, &fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -1501,15 +1489,17 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); + ext4_fname_free_filename(&fname); return ret; } -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir) +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_hash_info hinfo; struct dx_frame frames[2], *frame; + const struct qstr *d_name = fname->usr_fname; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1517,7 +1507,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q #ifdef CONFIG_EXT4_FS_ENCRYPTION *res_dir = NULL; #endif - frame = dx_probe(d_name, dir, &hinfo, frames); + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return (struct buffer_head *) frame; do { @@ -1526,7 +1516,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q if (IS_ERR(bh)) goto errout; - retval = search_dirblock(bh, dir, d_name, + retval = search_dirblock(bh, dir, fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) @@ -1538,7 +1528,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q } /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hinfo.hash, frame, + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { ext4_warning(sb, @@ -1797,32 +1787,16 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); int nlen, rlen; unsigned int offset = 0; char *top; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; int res; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -1; - - if (ctx != NULL) { - /* Calculate record length needed to store the entry */ - res = ext4_fname_crypto_namelen_on_disk(ctx, namelen); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return res; - } - reclen = EXT4_DIR_REC_LEN(res); - } - de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { @@ -1832,7 +1806,7 @@ int 
ext4_find_dest_de(struct inode *dir, struct inode *inode, goto return_result; } /* Provide crypto context and crypto buffer to ext4 match */ - res = ext4_match(ctx, &fname_crypto_str, namelen, name, de); + res = ext4_match(fname, de); if (res < 0) goto return_result; if (res > 0) { @@ -1854,8 +1828,6 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, res = 0; } return_result: - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); return res; } @@ -1863,39 +1835,10 @@ int ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, - const struct qstr *iname, - const char *name, int namelen) + struct ext4_filename *fname) { int nlen, rlen; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - struct ext4_str tmp_str; - int res; - - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -EIO; - /* By default, the input name would be written to the disk */ - tmp_str.name = (unsigned char *)name; - tmp_str.len = namelen; - if (ctx != NULL) { - /* Directory is encrypted */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -ENOMEM; - } - res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); - return res; - } - tmp_str.name = fname_crypto_str.name; - tmp_str.len = fname_crypto_str.len; - } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); @@ -1909,11 +1852,8 @@ int ext4_insert_dentry(struct inode *dir, de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); - de->name_len = tmp_str.len; - - memcpy(de->name, tmp_str.name, tmp_str.len); - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); + de->name_len = fname_len(fname); + memcpy(de->name, fname_name(fname), fname_len(fname)); return 0; } @@ -1925,13 +1865,11 @@ int ext4_insert_dentry(struct inode *dir, * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ -static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, +static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; @@ -1940,9 +1878,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, - bh, bh->b_data, blocksize - csum_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + blocksize - csum_size, fname, &de); if (err) return err; } @@ -1955,8 +1892,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, /* By now the buffer is marked for journaling. 
Due to crypto operations, * the following function call may fail */ - err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name, - name, namelen); + err = ext4_insert_dentry(dir, inode, de, blocksize, fname); if (err < 0) return err; @@ -1986,17 +1922,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. */ -static int make_indexed_dir(handle_t *handle, struct dentry *dentry, +static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { struct inode *dir = dentry->d_parent->d_inode; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; -#else - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; -#endif struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -2007,17 +1937,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, unsigned len; int retval; unsigned blocksize; - struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; int csum_size = 0; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); -#endif - if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -2079,22 +2002,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ - hinfo.hash_version = root->info.hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - ext4_mark_inode_dirty(handle, dir); - brelse(bh); - return res; - } - ext4_put_fname_crypto_ctx(&ctx); -#else - ext4fs_dirhash(name, namelen, &hinfo); -#endif + fname->hinfo.hash_version = root->info.hash_version; + if (fname->hinfo.hash_version <= DX_HASH_TEA) + fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo); + memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; @@ -2109,14 +2022,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &hinfo); + de = do_split(handle,dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } dx_release(frames); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); brelse(bh); return retval; out_frames: @@ -2148,6 +2061,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; struct super_block *sb; + struct ext4_filename fname; int retval; int dx_fallback=0; unsigned blocksize; @@ -2162,10 +2076,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!dentry->d_name.len) return -EINVAL; + retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); + if (retval) + return retval; + if (ext4_has_inline_data(dir)) { - retval = 
ext4_try_add_inline_entry(handle, dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, + dentry, inode); if (retval < 0) - return retval; + goto out; if (retval == 1) { retval = 0; return retval; @@ -2173,7 +2092,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) return retval; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2183,23 +2102,32 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = ext4_read_dirblock(dir, block, DIRENT); - if (IS_ERR(bh)) - return PTR_ERR(bh); - - retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) { - brelse(bh); - return retval; + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; } + retval = add_dirent_to_buf(handle, &fname, dir, inode, + NULL, bh); + if (retval != -ENOSPC) + goto out; + if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) - return make_indexed_dir(handle, dentry, inode, bh); + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + retval = make_indexed_dir(handle, &fname, dentry, + inode, bh); + bh = NULL; /* make_indexed_dir releases bh */ + goto out; + } brelse(bh); } bh = ext4_append(handle, dir, &block); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; + } de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); @@ -2209,7 +2137,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, initialize_dirent_tail(t, blocksize); } - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); +out: + ext4_fname_free_filename(&fname); brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); @@ -2219,19 +2149,18 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, /* * Returns 0 for success, or a negative error value */ -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; - struct dx_hash_info hinfo; struct buffer_head *bh; struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); entries = frame->entries; @@ -2248,7 +2177,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; @@ -2344,12 +2273,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; } } - de = do_split(handle, dir, &bh, frame, &hinfo); + de = do_split(handle, dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { err = PTR_ERR(de); goto cleanup; } - err = add_dirent_to_buf(handle, dentry, inode, de, bh); + err = add_dirent_to_buf(handle, fname, 
dir, inode, de, bh); goto cleanup; journal_error: From b97e2d7c67ea258cc91a6c250266296a112ff25c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:35 -0400 Subject: [PATCH 337/420] ext4 crypto: don't allocate a page when encrypting/decrypting file names Change-Id: I333d2e3989411cb47e7b1e10250f00e973c08c6c Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_fname.c | 72 ++++++++++++------------------------------ fs/ext4/dir.c | 3 ++ fs/ext4/ext4_crypto.h | 2 -- fs/ext4/namei.c | 4 +++ fs/ext4/symlink.c | 1 + 5 files changed, 28 insertions(+), 54 deletions(-) diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index ad5e32867de051..23d7f1d56b007e 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -65,9 +65,9 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, struct crypto_ablkcipher *tfm = ctx->ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; - struct scatterlist sg[1]; + struct scatterlist src_sg, dst_sg; int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); - char *workbuf; + char *workbuf, buf[32], *alloc_buf = NULL; if (iname->len <= 0 || iname->len > ctx->lim) return -EIO; @@ -78,20 +78,27 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, ciphertext_len = (ciphertext_len > ctx->lim) ? ctx->lim : ciphertext_len; + if (ciphertext_len <= sizeof(buf)) { + workbuf = buf; + } else { + alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); + if (!alloc_buf) + return -ENOMEM; + workbuf = alloc_buf; + } + /* Allocate request */ req = ablkcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited( KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + kfree(alloc_buf); return -ENOMEM; } ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); - /* Map the workpage */ - workbuf = kmap(ctx->workpage); - /* Copy the input */ memcpy(workbuf, iname->name, iname->len); if (iname->len < ciphertext_len) @@ -101,21 +108,16 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); /* Create encryption request */ - sg_init_table(sg, 1); - sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv); + sg_init_one(&src_sg, workbuf, ciphertext_len); + sg_init_one(&dst_sg, oname->name, ciphertext_len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - if (res >= 0) { - /* Copy the result to output */ - memcpy(oname->name, workbuf, ciphertext_len); - res = ciphertext_len; - } - kunmap(ctx->workpage); + kfree(alloc_buf); ablkcipher_request_free(req); if (res < 0) { printk_ratelimited( @@ -139,11 +141,10 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, struct ext4_str tmp_in[2], tmp_out[1]; struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist sg[1]; + struct scatterlist src_sg, dst_sg; struct crypto_ablkcipher *tfm = ctx->ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; - char *workbuf; if (iname->len <= 0 || iname->len > ctx->lim) return -EIO; @@ -163,31 +164,19 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); - /* Map the workpage */ - workbuf = 
kmap(ctx->workpage); - - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - /* Initialize IV */ memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); /* Create encryption request */ - sg_init_table(sg, 1); - sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + sg_init_one(&src_sg, iname->name, iname->len); + sg_init_one(&dst_sg, oname->name, oname->len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); res = crypto_ablkcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - if (res >= 0) { - /* Copy the result to output */ - memcpy(oname->name, workbuf, iname->len); - res = iname->len; - } - kunmap(ctx->workpage); ablkcipher_request_free(req); if (res < 0) { printk_ratelimited( @@ -267,8 +256,6 @@ void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx) crypto_free_ablkcipher(ctx->ctfm); if (ctx->htfm && !IS_ERR(ctx->htfm)) crypto_free_hash(ctx->htfm); - if (ctx->workpage && !IS_ERR(ctx->workpage)) - __free_page(ctx->workpage); kfree(ctx); } @@ -322,7 +309,6 @@ struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( ctx->ctfm_key_is_ready = 0; ctx->ctfm = NULL; ctx->htfm = NULL; - ctx->workpage = NULL; return ctx; } @@ -390,24 +376,6 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( ext4_put_fname_crypto_ctx(&ctx); return ERR_PTR(-ENOMEM); } - if (ctx->workpage == NULL) - ctx->workpage = alloc_page(GFP_NOFS); - if (IS_ERR(ctx->workpage)) { - res = PTR_ERR(ctx->workpage); - printk( - KERN_DEBUG "%s: error (%d) allocating work page\n", - __func__, res); - ctx->workpage = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->workpage == NULL) { - printk( - KERN_DEBUG "%s: could not allocate work page\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } ctx->lim = max_ciphertext_len; crypto_ablkcipher_clear_flags(ctx->ctfm, ~0); crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm), diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 7551f84f4e53cf..9f44f842f74b9e 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -249,9 +249,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) get_dtype(sb, de->file_type))) goto done; } else { + int save_len = fname_crypto_str.len; + /* Directory is encrypted */ err = ext4_fname_disk_to_usr(enc_ctx, NULL, de, &fname_crypto_str); + fname_crypto_str.len = save_len; if (err < 0) goto errout; if (!dir_emit(ctx, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index d75159c101ce33..552424ae3ab33a 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -123,10 +123,8 @@ struct ext4_str { struct ext4_fname_crypto_ctx { u32 lim; - char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE]; struct crypto_ablkcipher *ctfm; struct crypto_hash *htfm; - struct page *workpage; struct ext4_encryption_key key; unsigned flags : 8; unsigned has_valid_key : 1; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6ac22f774eb3d5..d63f58cec86b9f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -999,6 +999,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); } else { + int save_len = fname_crypto_str.len; + /* Directory is encrypted */ err = ext4_fname_disk_to_usr(ctx, hinfo, de, &fname_crypto_str); @@ -1009,6 +1011,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, 
&fname_crypto_str); + fname_crypto_str.len = save_len; } if (err != 0) { count = err; @@ -3129,6 +3132,7 @@ static int ext4_symlink(struct inode *dir, istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; err = ext4_fname_usr_to_disk(ctx, &istr, &ostr); ext4_put_fname_crypto_ctx(&ctx); if (err < 0) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 482b244d45464a..e853eabd9ed7fb 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -75,6 +75,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) goto errout; } pstr.name = paddr; + pstr.len = plen; res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); if (res < 0) goto errout; From 0d8cf6faaf2f5acb3d93146a2960612e7315e883 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:36 -0400 Subject: [PATCH 338/420] ext4 crypto: separate kernel and userspace structure for the key Use struct ext4_encryption_key only for the master key passed via the kernel keyring. For internal kernel space users, we now use struct ext4_crypt_info. This will allow us to put information from the policy structure so we can cache it and avoid needing to constantly look up the extended attribute. We will do this in a separate patch. This patch is mostly mechanical to make it easier for patch review. Change-Id: I51c01abecb64308ee1039f318c24e42b6ad06f5b Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto.c | 18 +++++++++--------- fs/ext4/crypto_fname.c | 33 ++++++++++----------------------- fs/ext4/crypto_key.c | 21 +++++++++++---------- fs/ext4/ext4.h | 2 +- fs/ext4/ext4_crypto.h | 15 +++++++++++---- fs/ext4/super.c | 2 +- 6 files changed, 43 insertions(+), 48 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 8ff15273ab0cc0..918200ed9bf8af 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -118,7 +118,7 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) struct ext4_crypto_ctx *ctx = NULL; int res = 0; unsigned long flags; - struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key; + struct ext4_crypt_info *ci = &EXT4_I(inode)->i_crypt_info; if (!ext4_read_workqueue) ext4_init_crypto(); @@ -152,14 +152,14 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) /* Allocate a new Crypto API context if we don't already have * one or if it isn't the right mode. */ - BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID); - if (ctx->tfm && (ctx->mode != key->mode)) { + BUG_ON(ci->ci_mode == EXT4_ENCRYPTION_MODE_INVALID); + if (ctx->tfm && (ctx->mode != ci->ci_mode)) { crypto_free_tfm(ctx->tfm); ctx->tfm = NULL; ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; } if (!ctx->tfm) { - switch (key->mode) { + switch (ci->ci_mode) { case EXT4_ENCRYPTION_MODE_AES_256_XTS: ctx->tfm = crypto_ablkcipher_tfm( crypto_alloc_ablkcipher("xts(aes)", 0, 0)); @@ -177,9 +177,9 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) ctx->tfm = NULL; goto out; } - ctx->mode = key->mode; + ctx->mode = ci->ci_mode; } - BUG_ON(key->size != ext4_encryption_key_size(key->mode)); + BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_mode)); /* There shouldn't be a bounce page attached to the crypto * context at this point.
*/ @@ -322,7 +322,7 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, int res = 0; BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_encryption_key.mode); + BUG_ON(ctx->mode != ei->i_crypt_info.ci_mode); if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { printk_ratelimited(KERN_ERR @@ -334,8 +334,8 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, crypto_ablkcipher_clear_flags(atfm, ~0); crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw, - ei->i_encryption_key.size); + res = crypto_ablkcipher_setkey(atfm, ei->i_crypt_info.ci_raw, + ei->i_crypt_info.ci_size); if (res) { printk_ratelimited(KERN_ERR "%s: crypto_ablkcipher_setkey() failed\n", diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 23d7f1d56b007e..d9f08ddbfda257 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -277,34 +277,25 @@ void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) *ctx = NULL; } -/** - * ext4_search_fname_crypto_ctx() - - */ -static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx( - const struct ext4_encryption_key *key) -{ - return NULL; -} - /** * ext4_alloc_fname_crypto_ctx() - */ struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( - const struct ext4_encryption_key *key) + const struct ext4_crypt_info *ci) { struct ext4_fname_crypto_ctx *ctx; ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS); if (ctx == NULL) return ERR_PTR(-ENOMEM); - if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) { + if (ci->ci_mode == EXT4_ENCRYPTION_MODE_INVALID) { /* This will automatically set key mode to invalid * As enum for ENCRYPTION_MODE_INVALID is zero */ - memset(&ctx->key, 0, sizeof(ctx->key)); + memset(&ctx->ci, 0, sizeof(ctx->ci)); } else { - memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key)); + memcpy(&ctx->ci, ci, sizeof(struct ext4_crypt_info)); } - ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode) + ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == ci->ci_mode) ? 0 : 1; ctx->ctfm_key_is_ready = 0; ctx->ctfm = NULL; @@ -335,21 +326,17 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( if (!ext4_has_encryption_key(inode)) ext4_generate_encryption_key(inode); - /* Get a crypto context based on the key. - * A new context is allocated if no context matches the requested key. - */ - ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key)); - if (ctx == NULL) - ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key)); + /* Get a crypto context based on the key. 
*/ + ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_crypt_info)); if (IS_ERR(ctx)) return ctx; ctx->flags = ei->i_crypt_policy_flags; if (ctx->has_valid_key) { - if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { + if (ctx->ci.ci_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { printk_once(KERN_WARNING "ext4: unsupported key mode %d\n", - ctx->key.mode); + ctx->ci.ci_mode); return ERR_PTR(-ENOKEY); } @@ -389,7 +376,7 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( * are pretty weak, * we directly use the inode master key */ res = crypto_ablkcipher_setkey(ctx->ctfm, - ctx->key.raw, ctx->key.size); + ctx->ci.ci_raw, ctx->ci.ci_size); if (res) { ext4_put_fname_crypto_ctx(&ctx); return ERR_PTR(-EIO); diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 52170d0b7c4036..ec6635dc50f9d0 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -91,7 +91,7 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], int ext4_generate_encryption_key(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + struct ext4_crypt_info *crypt_info = &ei->i_crypt_info; char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; struct key *keyring_key = NULL; @@ -112,17 +112,17 @@ int ext4_generate_encryption_key(struct inode *inode) ei->i_crypt_policy_flags = ctx.flags; if (S_ISREG(inode->i_mode)) - crypt_key->mode = ctx.contents_encryption_mode; + crypt_info->ci_mode = ctx.contents_encryption_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - crypt_key->mode = ctx.filenames_encryption_mode; + crypt_info->ci_mode = ctx.filenames_encryption_mode; else { printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); BUG(); } - crypt_key->size = ext4_encryption_key_size(crypt_key->mode); - BUG_ON(!crypt_key->size); + crypt_info->ci_size = ext4_encryption_key_size(crypt_info->ci_mode); + BUG_ON(!crypt_info->ci_size); if (DUMMY_ENCRYPTION_ENABLED(sbi)) { - memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + memset(crypt_info->ci_raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); goto out; } memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, @@ -148,19 +148,20 @@ int ext4_generate_encryption_key(struct inode *inode) BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); - res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw); + res = ext4_derive_key_aes(ctx.nonce, master_key->raw, + crypt_info->ci_raw); out: if (keyring_key) key_put(keyring_key); if (res < 0) - crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID; + crypt_info->ci_mode = EXT4_ENCRYPTION_MODE_INVALID; return res; } int ext4_has_encryption_key(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + struct ext4_crypt_info *crypt_info = &ei->i_crypt_info; - return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID); + return (crypt_info->ci_mode != EXT4_ENCRYPTION_MODE_INVALID); } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 84f5b53889fb2f..68259da3ac74c9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -960,7 +960,7 @@ struct ext4_inode_info { #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Encryption params */ - struct ext4_encryption_key i_encryption_key; + struct ext4_crypt_info i_crypt_info; #endif }; diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 552424ae3ab33a..deecbe8968d1e9 100644 --- a/fs/ext4/ext4_crypto.h +++ 
b/fs/ext4/ext4_crypto.h @@ -66,10 +66,17 @@ struct ext4_encryption_context { #define EXT4_KEY_DESC_PREFIX "ext4:" #define EXT4_KEY_DESC_PREFIX_SIZE 5 +/* This is passed in from userspace into the kernel keyring */ struct ext4_encryption_key { - uint32_t mode; - char raw[EXT4_MAX_KEY_SIZE]; - uint32_t size; + __u32 mode; + char raw[EXT4_MAX_KEY_SIZE]; + __u32 size; +} __attribute__((__packed__)); + +struct ext4_crypt_info { + unsigned char ci_mode; + unsigned char ci_size; + char ci_raw[EXT4_MAX_KEY_SIZE]; }; #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 @@ -125,7 +132,7 @@ struct ext4_fname_crypto_ctx { u32 lim; struct crypto_ablkcipher *ctfm; struct crypto_hash *htfm; - struct ext4_encryption_key key; + struct ext4_crypt_info ci; unsigned flags : 8; unsigned has_valid_key : 1; unsigned ctfm_key_is_ready : 1; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a1490673e46b54..ecf54cceb1d5f4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -904,7 +904,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID; + ei->i_crypt_info.ci_mode = EXT4_ENCRYPTION_MODE_INVALID; #endif return &ei->vfs_inode; From af081765626bf1f04cb1b65c6e9105c8de70f438 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:36 -0400 Subject: [PATCH 339/420] ext4 crypto: reorganize how we store keys in the inode This is a pretty massive patch which does a number of different things: 1) The per-inode encryption information is now stored in an allocated data structure, ext4_crypt_info, instead of directly in the inode. This reduces the size usage of an in-memory inode when it is not using encryption. 2) We drop the ext4_fname_crypto_ctx entirely, and use the per-inode encryption structure instead. This removes an unnecessary memory allocation and free for the fname_crypto_ctx as well as allowing us to reuse the ctfm in a directory for multiple lookups and file creations. 3) We also cache the inode's policy information in the ext4_crypt_info structure so we don't have to continually read it out of the extended attributes. 4) We now keep the keyring key in the inode's encryption structure instead of releasing it after we are done using it to derive the per-inode key. This allows us to test to see if the key has been revoked; if it has, we prevent the use of the derived key and free it. 5) When an inode is released (or when the derived key is freed), we will use memzero_explicit() to zero out the derived key, so it's not left hanging around in memory. This implies that when a user logs out, it is important to first revoke the key, and then unlink it, and then finally, to use "echo 3 > /proc/sys/vm/drop_caches" to release any decrypted pages and dcache entries from the system caches. 6) All this, and we also shrink the number of lines of code by around 100.
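To make the lifecycle described in points 1), 4) and 5) easier to follow, here is a minimal, self-contained user-space sketch of the same pattern: the derived key lives in a separately allocated structure, a cached copy is reused only while its keyring key still looks valid, and the raw key bytes are wiped before the structure is freed. The names (crypt_info, inode_model, key_still_valid) are illustrative stand-ins rather than the actual ext4 symbols, and the keyring lookup and key derivation are stubbed out.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_KEY_SIZE 64

/* Models the role of ext4_crypt_info: derived key material plus a
 * reference back to the master key it was derived from. */
struct crypt_info {
	unsigned char raw[MAX_KEY_SIZE];
	const char *keyring_key;	/* stand-in for a struct key * */
};

struct inode_model {
	struct crypt_info *ci;		/* NULL until first use */
};

/* Stub: the real code checks the keyring key's REVOKED/INVALIDATED/
 * DEAD flags before trusting a cached ext4_crypt_info. */
static bool key_still_valid(const char *keyring_key)
{
	return keyring_key != NULL;
}

static void free_crypt_info(struct inode_model *in)
{
	if (!in->ci)
		return;
	/* Wipe the derived key before freeing it; the kernel uses
	 * memzero_explicit() so the store cannot be optimized away. */
	memset(in->ci->raw, 0, sizeof(in->ci->raw));
	free(in->ci);
	in->ci = NULL;
}

/* Reuse the cached key material while the master key is still valid;
 * otherwise wipe the stale copy and build a fresh one. */
static struct crypt_info *get_crypt_info(struct inode_model *in,
					 const char *master_key)
{
	if (in->ci && key_still_valid(in->ci->keyring_key))
		return in->ci;

	free_crypt_info(in);
	in->ci = calloc(1, sizeof(*in->ci));
	if (!in->ci)
		return NULL;
	in->ci->keyring_key = master_key;
	/* Derivation of in->ci->raw from master_key is omitted here. */
	return in->ci;
}

int main(void)
{
	struct inode_model in = { .ci = NULL };

	if (get_crypt_info(&in, "example-master-key"))
		printf("derived key cached (%zu bytes reserved)\n",
		       sizeof(in.ci->raw));
	free_crypt_info(&in);	/* zeroed and released */
	return 0;
}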
:-) Change-Id: Ide8129a31ac04cb4905a76600e66bf9f7ef2d2bd Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto.c | 9 +- fs/ext4/crypto_fname.c | 284 +++++++++++----------------------------- fs/ext4/crypto_key.c | 74 ++++++++--- fs/ext4/crypto_policy.c | 76 ++++++----- fs/ext4/dir.c | 21 ++- fs/ext4/ext4.h | 44 ++++--- fs/ext4/ext4_crypto.h | 16 +-- fs/ext4/file.c | 4 +- fs/ext4/namei.c | 42 ++---- fs/ext4/super.c | 7 +- fs/ext4/symlink.c | 15 +-- 11 files changed, 246 insertions(+), 346 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 918200ed9bf8af..3a25aa4f3d9457 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -118,8 +118,9 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) struct ext4_crypto_ctx *ctx = NULL; int res = 0; unsigned long flags; - struct ext4_crypt_info *ci = &EXT4_I(inode)->i_crypt_info; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + BUG_ON(ci == NULL); if (!ext4_read_workqueue) ext4_init_crypto(); @@ -322,7 +323,7 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, int res = 0; BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_crypt_info.ci_mode); + BUG_ON(ctx->mode != ei->i_crypt_info->ci_mode); if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { printk_ratelimited(KERN_ERR @@ -334,8 +335,8 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, crypto_ablkcipher_clear_flags(atfm, ~0); crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(atfm, ei->i_crypt_info.ci_raw, - ei->i_crypt_info.ci_size); + res = crypto_ablkcipher_setkey(atfm, ei->i_crypt_info->ci_raw, + ei->i_crypt_info->ci_size); if (res) { printk_ratelimited(KERN_ERR "%s: crypto_ablkcipher_setkey() failed\n", diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index d9f08ddbfda257..374d0e790315e8 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -48,6 +48,12 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode) return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); } +static unsigned max_name_len(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : + EXT4_NAME_LEN; +} + /** * ext4_fname_encrypt() - * @@ -55,28 +61,30 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode) * ciphertext. Errors are returned as negative numbers. We trust the caller to * allocate sufficient memory to oname string. */ -static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, +static int ext4_fname_encrypt(struct inode *inode, const struct qstr *iname, struct ext4_str *oname) { u32 ciphertext_len; struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct crypto_ablkcipher *tfm = ctx->ctfm; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); char *workbuf, buf[32], *alloc_buf = NULL; + unsigned lim = max_name_len(inode); - if (iname->len <= 0 || iname->len > ctx->lim) + if (iname->len <= 0 || iname->len > lim) return -EIO; ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : iname->len; ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > ctx->lim) - ? ctx->lim : ciphertext_len; + ciphertext_len = (ciphertext_len > lim) + ? 
lim : ciphertext_len; if (ciphertext_len <= sizeof(buf)) { workbuf = buf; @@ -134,7 +142,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, * Errors are returned as negative numbers. * We trust the caller to allocate sufficient memory to oname string. */ -static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, +static int ext4_fname_decrypt(struct inode *inode, const struct ext4_str *iname, struct ext4_str *oname) { @@ -142,11 +150,13 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = ctx->ctfm; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; + unsigned lim = max_name_len(inode); - if (iname->len <= 0 || iname->len > ctx->lim) + if (iname->len <= 0 || iname->len > lim) return -EIO; tmp_in[0].name = iname->name; @@ -242,171 +252,50 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -/** - * ext4_free_fname_crypto_ctx() - - * - * Frees up a crypto context. - */ -void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx) -{ - if (ctx == NULL || IS_ERR(ctx)) - return; - - if (ctx->ctfm && !IS_ERR(ctx->ctfm)) - crypto_free_ablkcipher(ctx->ctfm); - if (ctx->htfm && !IS_ERR(ctx->htfm)) - crypto_free_hash(ctx->htfm); - kfree(ctx); -} - -/** - * ext4_put_fname_crypto_ctx() - - * - * Return: The crypto context onto free list. If the free list is above a - * threshold, completely frees up the context, and returns the memory. - * - * TODO: Currently we directly free the crypto context. Eventually we should - * add code it to return to free list. Such an approach will increase - * efficiency of directory lookup. - */ -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) +int ext4_setup_fname_crypto(struct inode *inode) { - if (*ctx == NULL || IS_ERR(*ctx)) - return; - ext4_free_fname_crypto_ctx(*ctx); - *ctx = NULL; -} - -/** - * ext4_alloc_fname_crypto_ctx() - - */ -struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( - const struct ext4_crypt_info *ci) -{ - struct ext4_fname_crypto_ctx *ctx; - - ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS); - if (ctx == NULL) - return ERR_PTR(-ENOMEM); - if (ci->ci_mode == EXT4_ENCRYPTION_MODE_INVALID) { - /* This will automatically set key mode to invalid - * As enum for ENCRYPTION_MODE_INVALID is zero */ - memset(&ctx->ci, 0, sizeof(ctx->ci)); - } else { - memcpy(&ctx->ci, ci, sizeof(struct ext4_crypt_info)); - } - ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == ci->ci_mode) - ? 0 : 1; - ctx->ctfm_key_is_ready = 0; - ctx->ctfm = NULL; - ctx->htfm = NULL; - return ctx; -} - -/** - * ext4_get_fname_crypto_ctx() - - * - * Allocates a free crypto context and initializes it to hold - * the crypto material for the inode. - * - * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise. 
- */ -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( - struct inode *inode, u32 max_ciphertext_len) -{ - struct ext4_fname_crypto_ctx *ctx; struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *ci = ei->i_crypt_info; + struct crypto_ablkcipher *ctfm; int res; /* Check if the crypto policy is set on the inode */ res = ext4_encrypted_inode(inode); if (res == 0) - return NULL; - - if (!ext4_has_encryption_key(inode)) - ext4_generate_encryption_key(inode); - - /* Get a crypto context based on the key. */ - ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_crypt_info)); - if (IS_ERR(ctx)) - return ctx; - - ctx->flags = ei->i_crypt_policy_flags; - if (ctx->has_valid_key) { - if (ctx->ci.ci_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { - printk_once(KERN_WARNING - "ext4: unsupported key mode %d\n", - ctx->ci.ci_mode); - return ERR_PTR(-ENOKEY); - } + return 0; - /* As a first cut, we will allocate new tfm in every call. - * later, we will keep the tfm around, in case the key gets - * re-used */ - if (ctx->ctfm == NULL) { - ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", - 0, 0); - } - if (IS_ERR(ctx->ctfm)) { - res = PTR_ERR(ctx->ctfm); - printk( - KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", - __func__, res); - ctx->ctfm = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->ctfm == NULL) { - printk( - KERN_DEBUG "%s: could not allocate crypto tfm\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } - ctx->lim = max_ciphertext_len; - crypto_ablkcipher_clear_flags(ctx->ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - - /* If we are lucky, we will get a context that is already - * set up with the right key. Else, we will have to - * set the key */ - if (!ctx->ctfm_key_is_ready) { - /* Since our crypto objectives for filename encryption - * are pretty weak, - * we directly use the inode master key */ - res = crypto_ablkcipher_setkey(ctx->ctfm, - ctx->ci.ci_raw, ctx->ci.ci_size); - if (res) { - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-EIO); - } - ctx->ctfm_key_is_ready = 1; - } else { - /* In the current implementation, key should never be - * marked "ready" for a context that has just been - * allocated. So we should never reach here */ - BUG(); - } - } - if (ctx->htfm == NULL) - ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(ctx->htfm)) { - res = PTR_ERR(ctx->htfm); - printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n", - __func__, res); - ctx->htfm = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); + res = ext4_get_encryption_info(inode); + if (res < 0) + return res; + ci = ei->i_crypt_info; + + if (!ci || ci->ci_ctfm) + return 0; + + if (ci->ci_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { + printk_once(KERN_WARNING "ext4: unsupported key mode %d\n", + ci->ci_mode); + return -ENOKEY; } - if (ctx->htfm == NULL) { - printk(KERN_DEBUG "%s: could not allocate hash tfm\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); + + ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", + __func__, res); + return res; } + crypto_ablkcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); - return ctx; + res = crypto_ablkcipher_setkey(ctfm, ci->ci_raw, ci->ci_size); + if (res) { + crypto_free_ablkcipher(ctfm); + return -EIO; + } + ci->ci_ctfm = ctfm; + return 0; } /** @@ -419,41 +308,21 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) return ((size+blksize-1)/blksize)*blksize; } -/** - * ext4_fname_crypto_namelen_on_disk() - - */ -int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, - u32 namelen) -{ - u32 ciphertext_len; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); - - if (ctx == NULL) - return -EIO; - if (!(ctx->has_valid_key)) - return -EACCES; - ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? - EXT4_CRYPTO_BLOCK_SIZE : namelen; - ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > ctx->lim) - ? ctx->lim : ciphertext_len; - return (int) ciphertext_len; -} - /** * ext4_fname_crypto_alloc_obuff() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str) { unsigned int olen; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + int padding = 16; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - if (!ctx) - return -EIO; + if (ci) + padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); if (padding < EXT4_CRYPTO_BLOCK_SIZE) padding = EXT4_CRYPTO_BLOCK_SIZE; olen = ext4_fname_crypto_round_up(ilen, padding); @@ -484,7 +353,7 @@ void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) /** * ext4_fname_disk_to_usr() - converts a filename from disk space to user space */ -int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int _ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname) @@ -492,8 +361,6 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, char buf[24]; int ret; - if (ctx == NULL) - return -EIO; if (iname->len < 3) { /*Check for . and .. */ if (iname->name[0] == '.' 
&& iname->name[iname->len-1] == '.') { @@ -503,8 +370,8 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, return oname->len; } } - if (ctx->has_valid_key) - return ext4_fname_decrypt(ctx, iname, oname); + if (EXT4_I(inode)->i_crypt_info) + return ext4_fname_decrypt(inode, iname, oname); if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { ret = digest_encode(iname->name, iname->len, oname->name); @@ -523,7 +390,7 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, return ret + 1; } -int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname) @@ -531,21 +398,20 @@ int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, struct ext4_str iname = {.name = (unsigned char *) de->name, .len = de->name_len }; - return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname); + return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname); } /** * ext4_fname_usr_to_disk() - converts a filename from user space to disk space */ -int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname) { int res; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - if (ctx == NULL) - return -EIO; if (iname->len < 3) { /*Check for . and .. */ if (iname->name[0] == '.' && @@ -556,8 +422,8 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, return oname->len; } } - if (ctx->has_valid_key) { - res = ext4_fname_encrypt(ctx, iname, oname); + if (ci) { + res = ext4_fname_encrypt(inode, iname, oname); return res; } /* Without a proper key, a user is not allowed to modify the filenames @@ -569,16 +435,13 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) { - struct ext4_fname_crypto_ctx *ctx; + struct ext4_crypt_info *ci; int ret = 0, bigname = 0; memset(fname, 0, sizeof(struct ext4_filename)); fname->usr_fname = iname; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - if ((ctx == NULL) || + if (!ext4_encrypted_inode(dir) || ((iname->name[0] == '.') && ((iname->len == 1) || ((iname->name[1] == '.') && (iname->len == 2))))) { @@ -586,12 +449,16 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; goto out; } - if (ctx->has_valid_key) { - ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, + ret = ext4_setup_fname_crypto(dir); + if (ret) + return ret; + ci = EXT4_I(dir)->i_crypt_info; + if (ci) { + ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, &fname->crypto_buf); if (ret < 0) goto out; - ret = ext4_fname_encrypt(ctx, iname, &fname->crypto_buf); + ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); if (ret < 0) goto out; fname->disk_name.name = fname->crypto_buf.name; @@ -634,7 +501,6 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, } ret = 0; out: - ext4_put_fname_crypto_ctx(&ctx); return ret; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index ec6635dc50f9d0..0075e43ffea61e 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -84,14 +84,26 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], return res; } -/** - * ext4_generate_encryption_key() - generates an encryption key - * @inode: The inode to generate the 
encryption key for. - */ -int ext4_generate_encryption_key(struct inode *inode) +void ext4_free_encryption_info(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *ci = ei->i_crypt_info; + + if (!ci) + return; + + if (ci->ci_keyring_key) + key_put(ci->ci_keyring_key); + crypto_free_ablkcipher(ci->ci_ctfm); + memzero_explicit(&ci->ci_raw, sizeof(ci->ci_raw)); + kfree(ci); + ei->i_crypt_info = NULL; +} + +int _ext4_get_encryption_info(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *crypt_info = &ei->i_crypt_info; + struct ext4_crypt_info *crypt_info; char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; struct key *keyring_key = NULL; @@ -99,18 +111,40 @@ int ext4_generate_encryption_key(struct inode *inode) struct ext4_encryption_context ctx; struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); + int res; - if (res != sizeof(ctx)) { - if (res > 0) - res = -EINVAL; - goto out; + if (ei->i_crypt_info) { + if (!ei->i_crypt_info->ci_keyring_key || + key_validate(ei->i_crypt_info->ci_keyring_key) == 0) + return 0; + ext4_free_encryption_info(inode); } + + res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + if (res < 0) { + if (!DUMMY_ENCRYPTION_ENABLED(sbi)) + return res; + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) + return -EINVAL; res = 0; + crypt_info = kmalloc(sizeof(struct ext4_crypt_info), GFP_KERNEL); + if (!crypt_info) + return -ENOMEM; + ei->i_crypt_policy_flags = ctx.flags; + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) crypt_info->ci_mode = ctx.contents_encryption_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) @@ -151,17 +185,23 @@ int ext4_generate_encryption_key(struct inode *inode) res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_info->ci_raw); out: + if (res < 0) { + if (res == -ENOKEY) + res = 0; + kfree(crypt_info); + } else { + ei->i_crypt_info = crypt_info; + crypt_info->ci_keyring_key = keyring_key; + keyring_key = NULL; + } if (keyring_key) key_put(keyring_key); - if (res < 0) - crypt_info->ci_mode = EXT4_ENCRYPTION_MODE_INVALID; return res; } int ext4_has_encryption_key(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *crypt_info = &ei->i_crypt_info; - return (crypt_info->ci_mode != EXT4_ENCRYPTION_MODE_INVALID); + return (ei->i_crypt_info != NULL); } diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a6d6291aea163e..370d3aa0a9cf95 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -126,7 +126,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) int ext4_is_child_context_consistent_with_parent(struct inode *parent, struct inode *child) { - struct ext4_encryption_context parent_ctx, child_ctx; + struct ext4_crypt_info *parent_ci, *child_ci; int res; if ((parent == NULL) || (child == NULL)) { @@ 
-136,26 +136,28 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, /* no restrictions if the parent directory is not encrypted */ if (!ext4_encrypted_inode(parent)) return 1; - res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &parent_ctx, sizeof(parent_ctx)); - if (res != sizeof(parent_ctx)) - return 0; /* if the child directory is not encrypted, this is always a problem */ if (!ext4_encrypted_inode(child)) return 0; - res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &child_ctx, sizeof(child_ctx)); - if (res != sizeof(child_ctx)) + res = ext4_get_encryption_info(parent); + if (res) + return 0; + res = ext4_get_encryption_info(child); + if (res) + return 0; + parent_ci = EXT4_I(parent)->i_crypt_info; + child_ci = EXT4_I(child)->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) return 0; - return (memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, EXT4_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode)); + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); } /** @@ -168,31 +170,37 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, int ext4_inherit_context(struct inode *parent, struct inode *child) { struct ext4_encryption_context ctx; - int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); + struct ext4_crypt_info *ci; + int res; + + res = ext4_get_encryption_info(parent); + if (res < 0) + return res; + ci = EXT4_I(parent)->i_crypt_info; + BUG_ON(ci == NULL); - if (res != sizeof(ctx)) { - if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { - ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; - ctx.contents_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, - EXT4_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - goto out; - } + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, + EXT4_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + EXT4_KEY_DESCRIPTOR_SIZE); } get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); -out: if (!res) ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); return res; + } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9f44f842f74b9e..fd511907f9c1ae 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -112,7 +112,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; int dir_has_error = 0; - struct 
ext4_fname_crypto_ctx *enc_ctx = NULL; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; if (is_dx_dir(inode)) { @@ -136,16 +135,14 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; } - enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN); - if (IS_ERR(enc_ctx)) - return PTR_ERR(enc_ctx); - if (enc_ctx) { - err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN, + err = ext4_setup_fname_crypto(inode); + if (err) + return err; + if (ext4_encrypted_inode(inode)) { + err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, &fname_crypto_str); - if (err < 0) { - ext4_put_fname_crypto_ctx(&enc_ctx); + if (err < 0) return err; - } } offset = ctx->pos & (sb->s_blocksize - 1); @@ -241,8 +238,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - if (enc_ctx == NULL) { - /* Directory is not encrypted */ + if (!ext4_encrypted_inode(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), @@ -252,7 +248,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int save_len = fname_crypto_str.len; /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(enc_ctx, + err = ext4_fname_disk_to_usr(inode, NULL, de, &fname_crypto_str); fname_crypto_str.len = save_len; if (err < 0) @@ -277,7 +273,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) err = 0; errout: #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&enc_ctx); ext4_fname_crypto_free_buffer(&fname_crypto_str); #endif brelse(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 68259da3ac74c9..163b70c0a89f91 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -960,7 +960,7 @@ struct ext4_inode_info { #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Encryption params */ - struct ext4_crypt_info i_crypt_info; + struct ext4_crypt_info *i_crypt_info; #endif }; @@ -2094,37 +2094,30 @@ static inline int ext4_sb_has_crypto(struct super_block *sb) /* crypto_fname.c */ bool ext4_valid_filenames_enc_mode(uint32_t mode); u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); -int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str); -int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int _ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname); -int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname); -int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname); -int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, - u32 namelen); #ifdef CONFIG_EXT4_FS_ENCRYPTION -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, - u32 max_len); +int ext4_setup_fname_crypto(struct inode *inode); void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname); void ext4_fname_free_filename(struct ext4_filename *fname); #else static inline -void ext4_put_fname_crypto_ctx(struct 
ext4_fname_crypto_ctx **ctx) { } -static inline -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, - u32 max_len) +int ext4_setup_fname_crypto(struct inode *inode) { - return NULL; + return 0; } static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } static inline int ext4_fname_setup_filename(struct inode *dir, @@ -2141,15 +2134,34 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } /* crypto_key.c */ -int ext4_generate_encryption_key(struct inode *inode); +void ext4_free_encryption_info(struct inode *inode); +int _ext4_get_encryption_info(struct inode *inode); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_has_encryption_key(struct inode *inode); + +static inline int ext4_get_encryption_info(struct inode *inode) +{ + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return _ext4_get_encryption_info(inode); + return 0; +} + #else static inline int ext4_has_encryption_key(struct inode *inode) { return 0; } +static inline int ext4_get_encryption_info(struct inode *inode) +{ + return 0; +} #endif diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index deecbe8968d1e9..d29687c232bd4a 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -76,7 +76,13 @@ struct ext4_encryption_key { struct ext4_crypt_info { unsigned char ci_mode; unsigned char ci_size; + char ci_data_mode; + char ci_filename_mode; + char ci_flags; + struct crypto_ablkcipher *ci_ctfm; + struct key *ci_keyring_key; char ci_raw[EXT4_MAX_KEY_SIZE]; + char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 @@ -128,16 +134,6 @@ struct ext4_str { u32 len; }; -struct ext4_fname_crypto_ctx { - u32 lim; - struct crypto_ablkcipher *ctfm; - struct crypto_hash *htfm; - struct ext4_crypt_info ci; - unsigned flags : 8; - unsigned has_valid_key : 1; - unsigned ctfm_key_is_ready : 1; -}; - /** * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. 
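The comment above notes that encrypted symlink targets are stored with their ciphertext length at the front of the string in little-endian form. Below is a small, self-contained user-space sketch of how such a length-prefixed buffer is laid out and decoded; struct enc_symlink_data only mirrors the idea behind the kernel's ext4_encrypted_symlink_data, the helper names are made up for this example, and the actual decryption step (cts(cbc(aes)) in the patches above) is not shown.

#include <stdint.h>
#include <stdio.h>

/* Illustrative layout: a 16-bit little-endian ciphertext length
 * followed by that many bytes of encrypted target path. */
struct enc_symlink_data {
	uint8_t len_le[2];
	char encrypted_path[];
};

/* Decode the length prefix byte by byte so the sketch does not depend
 * on host endianness (the kernel would use le16_to_cpu() instead). */
static size_t enc_symlink_len(const struct enc_symlink_data *sd)
{
	return (size_t)sd->len_le[0] | ((size_t)sd->len_le[1] << 8);
}

int main(void)
{
	/* A 5-byte dummy "ciphertext" with its length prefix. */
	static const unsigned char disk[] = {
		0x05, 0x00, 'a', 'b', 'c', 'd', 'e'
	};
	const struct enc_symlink_data *sd =
		(const struct enc_symlink_data *)disk;

	printf("ciphertext length: %zu bytes\n", enc_symlink_len(sd));
	return 0;
}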
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 009fc128172dfa..3ee44b753175f6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -203,7 +203,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_mapping->host; if (ext4_encrypted_inode(inode)) { - int err = ext4_generate_encryption_key(inode); + int err = ext4_get_encryption_info(inode); if (err) return 0; } @@ -264,7 +264,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) } ret = dquot_file_open(inode, filp); if (!ret && ext4_encrypted_inode(inode)) { - ret = ext4_generate_encryption_key(inode); + ret = ext4_get_encryption_info(inode); if (ret) ret = -EACCES; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d63f58cec86b9f..9d35bb36aed7b3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -608,17 +608,14 @@ static struct stats dx_show_leaf(struct inode *dir, char *name; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - struct ext4_fname_crypto_ctx *ctx = NULL; int res; name = de->name; len = de->name_len; - ctx = ext4_get_fname_crypto_ctx(dir, - EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - printk(KERN_WARNING "Error acquiring" - " crypto ctxt--skipping crypto\n"); - ctx = NULL; + res = ext4_setup_fname_crypto(dir); + if (res) { + printk(KERN_WARNING "Error setting up" + " fname crypto: %d\n", res); } if (ctx == NULL) { /* Directory is not encrypted */ @@ -638,7 +635,6 @@ static struct stats dx_show_leaf(struct inode *dir, "allocating crypto " "buffer--skipping " "crypto\n"); - ext4_put_fname_crypto_ctx(&ctx); ctx = NULL; } res = ext4_fname_disk_to_usr(ctx, NULL, de, @@ -659,7 +655,6 @@ static struct stats dx_show_leaf(struct inode *dir, printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); - ext4_put_fname_crypto_ctx(&ctx); ext4_fname_crypto_free_buffer( &fname_crypto_str); } @@ -945,7 +940,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; - struct ext4_fname_crypto_ctx *ctx = NULL; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", @@ -960,17 +954,15 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Check if the directory is encrypted */ - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); + err = ext4_setup_fname_crypto(dir); + if (err) { brelse(bh); return err; } - if (ctx != NULL) { - err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + if (ext4_encrypted_inode(dir)) { + err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { - ext4_put_fname_crypto_ctx(&ctx); brelse(bh); return err; } @@ -991,8 +983,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - if (ctx == NULL) { - /* Directory is not encrypted */ + if (!ext4_encrypted_inode(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, @@ -1002,7 +993,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, int save_len = fname_crypto_str.len; /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(ctx, hinfo, de, + err = ext4_fname_disk_to_usr(dir, hinfo, de, &fname_crypto_str); if (err < 0) { count = err; @@ -1022,7 +1013,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, errout: brelse(bh); #ifdef 
CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&ctx); ext4_fname_crypto_free_buffer(&fname_crypto_str); #endif return count; @@ -3110,7 +3100,6 @@ static int ext4_symlink(struct inode *dir, } if (encryption_required) { - struct ext4_fname_crypto_ctx *ctx = NULL; struct qstr istr; struct ext4_str ostr; @@ -3122,19 +3111,14 @@ static int ext4_symlink(struct inode *dir, err = ext4_inherit_context(dir, inode); if (err) goto err_drop_inode; - ctx = ext4_get_fname_crypto_ctx(inode, - inode->i_sb->s_blocksize); - if (IS_ERR_OR_NULL(ctx)) { - /* We just set the policy, so ctx should not be NULL */ - err = (ctx == NULL) ? -EIO : PTR_ERR(ctx); + err = ext4_setup_fname_crypto(inode); + if (err) goto err_drop_inode; - } istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; ostr.len = disk_link.len; - err = ext4_fname_usr_to_disk(ctx, &istr, &ostr); - ext4_put_fname_crypto_ctx(&ctx); + err = ext4_fname_usr_to_disk(inode, &istr, &ostr); if (err < 0) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ecf54cceb1d5f4..21ba06c7ed5f61 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -904,9 +904,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ei->i_crypt_info.ci_mode = EXT4_ENCRYPTION_MODE_INVALID; + ei->i_crypt_info = NULL; #endif - return &ei->vfs_inode; } @@ -984,6 +983,10 @@ void ext4_clear_inode(struct inode *inode) jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (EXT4_I(inode)->i_crypt_info) + ext4_free_encryption_info(inode); +#endif } static struct inode *ext4_nfs_get_inode(struct super_block *sb, diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index e853eabd9ed7fb..fd00a3654f0496 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -30,7 +30,6 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) char *caddr, *paddr = NULL; struct ext4_str cstr, pstr; struct inode *inode = dentry->d_inode; - struct ext4_fname_crypto_ctx *ctx = NULL; struct ext4_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; @@ -39,19 +38,17 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) if (!ext4_encrypted_inode(inode)) return page_follow_link_light(dentry, nd); - ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize); - if (IS_ERR(ctx)) - return ctx; + res = ext4_setup_fname_crypto(inode); + if (res) + return ERR_PTR(res); if (ext4_inode_is_fast_symlink(inode)) { caddr = (char *) EXT4_I(dentry->d_inode)->i_data; max_size = sizeof(EXT4_I(dentry->d_inode)->i_data); } else { cpage = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(cpage)) { - ext4_put_fname_crypto_ctx(&ctx); + if (IS_ERR(cpage)) return cpage; - } caddr = kmap(cpage); caddr[size] = 0; } @@ -76,21 +73,19 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) } pstr.name = paddr; pstr.len = plen; - res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); + res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr); if (res < 0) goto errout; /* Null-terminate the name */ if (res <= plen) paddr[res] = '\0'; nd_set_link(nd, paddr); - ext4_put_fname_crypto_ctx(&ctx); if (cpage) { kunmap(cpage); page_cache_release(cpage); } return NULL; errout: - ext4_put_fname_crypto_ctx(&ctx); if (cpage) { kunmap(cpage); 
page_cache_release(cpage); From 4397d281b773914caaa3ecf3a1dfdba1446b912a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:36 -0400 Subject: [PATCH 340/420] ext4: clean up superblock encryption mode fields The superblock fields s_file_encryption_mode and s_dir_encryption_mode are vestigial, so remove them as a cleanup. While we're at it, allow file systems with both encryption and inline_data enabled at the same time to work correctly. We can't have encrypted inodes with inline data, but there's no reason to prohibit unencrypted inodes from using the inline data feature. Change-Id: I3875d3db9790cec7b3a756a997f8fb0d86c9d623 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_policy.c | 9 +++++++-- fs/ext4/ext4.h | 6 ------ fs/ext4/ialloc.c | 19 ------------------- fs/ext4/super.c | 5 ----- 4 files changed, 7 insertions(+), 32 deletions(-) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 370d3aa0a9cf95..683391f790d61b 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -51,6 +51,10 @@ static int ext4_create_encryption_context_from_policy( struct ext4_encryption_context ctx; int res = 0; + res = ext4_convert_inline_data(inode); + if (res) + return res; + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE); @@ -199,8 +203,9 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); - if (!res) + if (!res) { ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); + } return res; - } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 163b70c0a89f91..b7d0a1a488a249 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1373,12 +1373,6 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; - -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Encryption */ - uint32_t s_file_encryption_mode; - uint32_t s_dir_encryption_mode; -#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 12571b47b75342..9754c0f5a5ff6a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1035,28 +1035,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) && - (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) { - ei->i_inline_off = 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_set_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - } else { - /* Inline data and encryption are incompatible - * We turn off inline data since encryption is enabled */ - ei->i_inline_off = 1; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_clear_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - } -#else ei->i_inline_off = 0; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); -#endif ret = inode; err = dquot_alloc_inode(inode); if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 21ba06c7ed5f61..f9d34dee0704a2 100644 --- a/fs/ext4/super.c +++
b/fs/ext4/super.c @@ -3454,11 +3454,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_bdev->bd_part) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, sectors[1]); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Modes of operations for file and directory encryption. */ - sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; - sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID; -#endif /* Cleanup superblock name */ for (cp = sb->s_id; (cp = strchr(cp, '/'));) From 540ff9e130a22de344db162e5ca50feca6a8fcf5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:36 -0400 Subject: [PATCH 341/420] ext4 crypto: use slab caches Use slab caches the ext4_crypto_ctx and ext4_crypt_info structures for slighly better memory efficiency and debuggability. Change-Id: Ia820c8daae53fefd185d5f0d757d4279c78676d8 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto.c | 60 +++++++++++++++++++++----------------------- fs/ext4/crypto_key.c | 12 ++++++--- fs/ext4/ext4.h | 1 + 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 3a25aa4f3d9457..1c34f0eb125b5b 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -55,6 +55,9 @@ static mempool_t *ext4_bounce_page_pool; static LIST_HEAD(ext4_free_crypto_ctxs); static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); +static struct kmem_cache *ext4_crypto_ctx_cachep; +struct kmem_cache *ext4_crypt_info_cachep; + /** * ext4_release_crypto_ctx() - Releases an encryption context * @ctx: The encryption context to release. @@ -79,7 +82,7 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { if (ctx->tfm) crypto_free_tfm(ctx->tfm); - kfree(ctx); + kmem_cache_free(ext4_crypto_ctx_cachep, ctx); } else { spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); list_add(&ctx->free_list, &ext4_free_crypto_ctxs); @@ -87,23 +90,6 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) } } -/** - * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context - * @mask: The allocation mask. - * - * Return: An allocated and initialized encryption context on success. An error - * value or NULL otherwise. 
- */ -static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask) -{ - struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx), - mask); - - if (!ctx) - return ERR_PTR(-ENOMEM); - return ctx; -} - /** * ext4_get_crypto_ctx() - Gets an encryption context * @inode: The inode for which we are doing the crypto @@ -121,8 +107,6 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; BUG_ON(ci == NULL); - if (!ext4_read_workqueue) - ext4_init_crypto(); /* * We first try getting the ctx from a free list because in @@ -141,9 +125,9 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) list_del(&ctx->free_list); spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); if (!ctx) { - ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS); - if (IS_ERR(ctx)) { - res = PTR_ERR(ctx); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; goto out; } ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; @@ -217,7 +201,7 @@ void ext4_exit_crypto(void) } if (pos->tfm) crypto_free_tfm(pos->tfm); - kfree(pos); + kmem_cache_free(ext4_crypto_ctx_cachep, pos); } INIT_LIST_HEAD(&ext4_free_crypto_ctxs); if (ext4_bounce_page_pool) @@ -226,6 +210,12 @@ void ext4_exit_crypto(void) if (ext4_read_workqueue) destroy_workqueue(ext4_read_workqueue); ext4_read_workqueue = NULL; + if (ext4_crypto_ctx_cachep) + kmem_cache_destroy(ext4_crypto_ctx_cachep); + ext4_crypto_ctx_cachep = NULL; + if (ext4_crypt_info_cachep) + kmem_cache_destroy(ext4_crypt_info_cachep); + ext4_crypt_info_cachep = NULL; } /** @@ -238,23 +228,31 @@ void ext4_exit_crypto(void) */ int ext4_init_crypto(void) { - int i, res; + int i, res = -ENOMEM; mutex_lock(&crypto_init); if (ext4_read_workqueue) goto already_initialized; ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); - if (!ext4_read_workqueue) { - res = -ENOMEM; + if (!ext4_read_workqueue) + goto fail; + + ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypto_ctx_cachep) + goto fail; + + ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypt_info_cachep) goto fail; - } for (i = 0; i < num_prealloc_crypto_ctxs; i++) { struct ext4_crypto_ctx *ctx; - ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL); - if (IS_ERR(ctx)) { - res = PTR_ERR(ctx); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; goto fail; } list_add(&ctx->free_list, &ext4_free_crypto_ctxs); diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 0075e43ffea61e..d6abe4687cd5b2 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -96,7 +96,7 @@ void ext4_free_encryption_info(struct inode *inode) key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); memzero_explicit(&ci->ci_raw, sizeof(ci->ci_raw)); - kfree(ci); + kmem_cache_free(ext4_crypt_info_cachep, ci); ei->i_crypt_info = NULL; } @@ -113,6 +113,12 @@ int _ext4_get_encryption_info(struct inode *inode) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int res; + if (!ext4_read_workqueue) { + res = ext4_init_crypto(); + if (res) + return res; + } + if (ei->i_crypt_info) { if (!ei->i_crypt_info->ci_keyring_key || key_validate(ei->i_crypt_info->ci_keyring_key) == 0) @@ -134,7 +140,7 @@ int _ext4_get_encryption_info(struct inode *inode) return -EINVAL; res = 0; - crypt_info = kmalloc(sizeof(struct ext4_crypt_info), GFP_KERNEL); + crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, 
GFP_KERNEL); if (!crypt_info) return -ENOMEM; @@ -188,7 +194,7 @@ int _ext4_get_encryption_info(struct inode *inode) if (res < 0) { if (res == -ENOKEY) res = 0; - kfree(crypt_info); + kmem_cache_free(ext4_crypt_info_cachep, crypt_info); } else { ei->i_crypt_info = crypt_info; crypt_info->ci_keyring_key = keyring_key; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7d0a1a488a249..928025fe8a039c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2057,6 +2057,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy); /* crypto.c */ +extern struct kmem_cache *ext4_crypt_info_cachep; bool ext4_valid_contents_enc_mode(uint32_t mode); uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); extern struct workqueue_struct *ext4_read_workqueue; From a3d4b448ab71b7cb52fbf5679f4f1c8b849133d2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:37 -0400 Subject: [PATCH 342/420] ext4 crypto: get rid of ci_mode from struct ext4_crypt_info The ci_mode field was superfluous, and getting rid of it gets rid of an unused hole in the structure. Change-Id: I317165e6219c164af29047a314a3187e94f20f2e Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto.c | 11 +++++------ fs/ext4/crypto_fname.c | 4 ++-- fs/ext4/crypto_key.c | 11 +++++------ fs/ext4/ext4_crypto.h | 1 - 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1c34f0eb125b5b..9969d054cd88c0 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -137,14 +137,13 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) /* Allocate a new Crypto API context if we don't already have * one or if it isn't the right mode. */ - BUG_ON(ci->ci_mode == EXT4_ENCRYPTION_MODE_INVALID); - if (ctx->tfm && (ctx->mode != ci->ci_mode)) { + if (ctx->tfm && (ctx->mode != ci->ci_data_mode)) { crypto_free_tfm(ctx->tfm); ctx->tfm = NULL; ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; } if (!ctx->tfm) { - switch (ci->ci_mode) { + switch (ci->ci_data_mode) { case EXT4_ENCRYPTION_MODE_AES_256_XTS: ctx->tfm = crypto_ablkcipher_tfm( crypto_alloc_ablkcipher("xts(aes)", 0, 0)); @@ -162,9 +161,9 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) ctx->tfm = NULL; goto out; } - ctx->mode = ci->ci_mode; + ctx->mode = ci->ci_data_mode; } - BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_mode)); + BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_data_mode)); /* There shouldn't be a bounce page attached to the crypto * context at this point. 
*/ @@ -321,7 +320,7 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, int res = 0; BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_crypt_info->ci_mode); + BUG_ON(ctx->mode != ei->i_crypt_info->ci_data_mode); if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { printk_ratelimited(KERN_ERR diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 374d0e790315e8..e63dd294d7aad5 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -272,9 +272,9 @@ int ext4_setup_fname_crypto(struct inode *inode) if (!ci || ci->ci_ctfm) return 0; - if (ci->ci_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { + if (ci->ci_filename_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { printk_once(KERN_WARNING "ext4: unsupported key mode %d\n", - ci->ci_mode); + ci->ci_filename_mode); return -ENOKEY; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index d6abe4687cd5b2..858d7d67a4e19c 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -152,14 +152,13 @@ int _ext4_get_encryption_info(struct inode *inode) memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) - crypt_info->ci_mode = ctx.contents_encryption_mode; + crypt_info->ci_size = + ext4_encryption_key_size(crypt_info->ci_data_mode); else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - crypt_info->ci_mode = ctx.filenames_encryption_mode; - else { - printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); + crypt_info->ci_size = + ext4_encryption_key_size(crypt_info->ci_filename_mode); + else BUG(); - } - crypt_info->ci_size = ext4_encryption_key_size(crypt_info->ci_mode); BUG_ON(!crypt_info->ci_size); if (DUMMY_ENCRYPTION_ENABLED(sbi)) { memset(crypt_info->ci_raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index d29687c232bd4a..69faf0e9f87460 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -74,7 +74,6 @@ struct ext4_encryption_key { } __attribute__((__packed__)); struct ext4_crypt_info { - unsigned char ci_mode; unsigned char ci_size; char ci_data_mode; char ci_filename_mode; From 411d696381575ec38363b37cc50c0f2863d2f926 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:37 -0400 Subject: [PATCH 343/420] ext4 crypto: shrink size of the ext4_crypto_ctx structure Some fields are only used when the crypto_ctx is being used on the read path, some are only used on the write path, and some are only used when the structure is on free list. Optimize memory use by using a union. 
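For reference, the context structure that results from this patch, as introduced by the ext4_crypto.h hunk below (every identifier here comes from that hunk; this is only the resulting definition spelled out, not an additional change):

	struct ext4_crypto_ctx {
		struct crypto_tfm *tfm;				/* Crypto API context */
		union {
			struct {
				struct page *bounce_page;	/* ciphertext page (write path) */
				struct page *control_page;	/* original page (write path) */
			} w;
			struct {
				struct bio *bio;		/* bio being completed (read path) */
				struct work_struct work;	/* read-completion work item */
			} r;
			struct list_head free_list;		/* linkage while on the free list */
		};
		char flags;					/* EXT4_CTX_* / write-path flags */
		char mode;					/* encryption mode for tfm */
	};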
Change-Id: I2bf0e784331440ee544f66d32d71c7c55de9d466 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto.c | 31 ++++++++++--------------------- fs/ext4/ext4_crypto.h | 21 ++++++++++++++------- fs/ext4/page-io.c | 2 +- fs/ext4/readpage.c | 10 +++++----- 4 files changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 9969d054cd88c0..28a0e4bd91b0d0 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -71,14 +71,14 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) { unsigned long flags; - if (ctx->bounce_page) { + if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) { if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) - __free_page(ctx->bounce_page); + __free_page(ctx->w.bounce_page); else - mempool_free(ctx->bounce_page, ext4_bounce_page_pool); - ctx->bounce_page = NULL; + mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); } - ctx->control_page = NULL; + ctx->w.bounce_page = NULL; + ctx->w.control_page = NULL; if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { if (ctx->tfm) crypto_free_tfm(ctx->tfm); @@ -134,6 +134,7 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) } else { ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; } + ctx->flags &= ~EXT4_WRITE_PATH_FL; /* Allocate a new Crypto API context if we don't already have * one or if it isn't the right mode. */ @@ -165,10 +166,6 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) } BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_data_mode)); - /* There shouldn't be a bounce page attached to the crypto - * context at this point. */ - BUG_ON(ctx->bounce_page); - out: if (res) { if (!IS_ERR_OR_NULL(ctx)) @@ -189,15 +186,6 @@ void ext4_exit_crypto(void) struct ext4_crypto_ctx *pos, *n; list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) { - if (pos->bounce_page) { - if (pos->flags & - EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) { - __free_page(pos->bounce_page); - } else { - mempool_free(pos->bounce_page, - ext4_bounce_page_pool); - } - } if (pos->tfm) crypto_free_tfm(pos->tfm); kmem_cache_free(ext4_crypto_ctx_cachep, pos); @@ -425,8 +413,9 @@ struct page *ext4_encrypt(struct inode *inode, } else { ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; } - ctx->bounce_page = ciphertext_page; - ctx->control_page = plaintext_page; + ctx->flags |= EXT4_WRITE_PATH_FL; + ctx->w.bounce_page = ciphertext_page; + ctx->w.control_page = plaintext_page; err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, plaintext_page, ciphertext_page); if (err) { @@ -505,7 +494,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) } else { ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; } - ctx->bounce_page = ciphertext_page; + ctx->w.bounce_page = ciphertext_page; while (len--) { err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 69faf0e9f87460..c5258f24221549 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -86,16 +86,23 @@ struct ext4_crypt_info { #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 #define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002 +#define EXT4_WRITE_PATH_FL 0x00000004 struct ext4_crypto_ctx { struct crypto_tfm *tfm; /* Crypto API context */ - struct page *bounce_page; /* Ciphertext page on write path */ - struct page *control_page; /* Original page on write path */ - struct bio *bio; /* The bio for this context */ - struct work_struct work; 
/* Work queue for read complete path */ - struct list_head free_list; /* Free list */ - int flags; /* Flags */ - int mode; /* Encryption mode for tfm */ + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + char flags; /* Flags */ + char mode; /* Encryption mode for tfm */ }; struct ext4_completion_result { diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index da3e4dff941744..8890eec9c4bdca 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -87,7 +87,7 @@ static void ext4_finish_bio(struct bio *bio) /* The bounce data pages are unmapped. */ data_page = page; ctx = (struct ext4_crypto_ctx *)page_private(data_page); - page = ctx->control_page; + page = ctx->w.control_page; } #endif diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 171b9ac4b45e94..ec3ef93a52dbbc 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -54,8 +54,8 @@ static void completion_pages(struct work_struct *work) { #ifdef CONFIG_EXT4_FS_ENCRYPTION struct ext4_crypto_ctx *ctx = - container_of(work, struct ext4_crypto_ctx, work); - struct bio *bio = ctx->bio; + container_of(work, struct ext4_crypto_ctx, r.work); + struct bio *bio = ctx->r.bio; struct bio_vec *bv; int i; @@ -109,9 +109,9 @@ static void mpage_end_io(struct bio *bio, int err) if (err) { ext4_release_crypto_ctx(ctx); } else { - INIT_WORK(&ctx->work, completion_pages); - ctx->bio = bio; - queue_work(ext4_read_workqueue, &ctx->work); + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(ext4_read_workqueue, &ctx->r.work); return; } } From 8228d9ec595c7dbb657d19dfadbfa25fbf6b1813 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:37 -0400 Subject: [PATCH 344/420] ext4 crypto: require CONFIG_CRYPTO_CTR if ext4 encryption is enabled On arm64 this is apparently needed for CTS mode to function correctly. Otherwise attempts to use CTS return ENOENT. Change-Id: Ie361427374128113112931ecaa78832583bf91f2 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 024f2284d3f6c0..bf8bc8aba471c6 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -72,6 +72,7 @@ config EXT4_ENCRYPTION select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS + select CRYPTO_CTR select CRYPTO_SHA256 select KEYS select ENCRYPTED_KEYS From b9ebae6dd910d238c35081fbe0290dca3306b6ba Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:37 -0400 Subject: [PATCH 345/420] ext4 crypto: use per-inode tfm structure As suggested by Herbert Xu, we shouldn't allocate a new tfm each time we read or write a page. Instead we can use a single tfm hanging off the inode's crypt_info structure for all of our encryption needs for that inode, since the tfm can be used by multiple crypto requests in parallel. Also use cmpxchg() to avoid races that could result in crypt_info structure getting doubly allocated or doubly freed. 
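In sketch form, the race avoidance the message describes reduces to a compare-and-swap publish on the setup path and a matching compare-and-swap on the teardown path. The snippet below is a simplified excerpt of the crypto_key.c hunks that follow (all names are taken from those hunks), not an additional change:

	/* Setup: publish the freshly built crypt_info only if nobody beat us to it. */
	if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) {
		ext4_free_crypt_info(crypt_info);	/* lost the race: drop our copy */
		goto retry;				/* and use the winner's structure */
	}

	/* Teardown: only the caller whose cmpxchg succeeds may free the structure. */
	prev = cmpxchg(&ei->i_crypt_info, ci, NULL);
	if (prev == ci)
		ext4_free_crypt_info(ci);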
Change-Id: I514719b383e00792ae5acaf7ad166a50ae253715 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto.c | 64 ++---------------------- fs/ext4/crypto_fname.c | 48 +----------------- fs/ext4/crypto_key.c | 108 +++++++++++++++++++++++++++++------------ fs/ext4/dir.c | 3 -- fs/ext4/ext4.h | 5 +- fs/ext4/ext4_crypto.h | 3 -- fs/ext4/namei.c | 17 ++++--- fs/ext4/super.c | 2 +- fs/ext4/symlink.c | 2 +- 9 files changed, 96 insertions(+), 156 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 28a0e4bd91b0d0..c3a9b08309db19 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -80,8 +80,6 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) ctx->w.bounce_page = NULL; ctx->w.control_page = NULL; if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { - if (ctx->tfm) - crypto_free_tfm(ctx->tfm); kmem_cache_free(ext4_crypto_ctx_cachep, ctx); } else { spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); @@ -136,36 +134,6 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) } ctx->flags &= ~EXT4_WRITE_PATH_FL; - /* Allocate a new Crypto API context if we don't already have - * one or if it isn't the right mode. */ - if (ctx->tfm && (ctx->mode != ci->ci_data_mode)) { - crypto_free_tfm(ctx->tfm); - ctx->tfm = NULL; - ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; - } - if (!ctx->tfm) { - switch (ci->ci_data_mode) { - case EXT4_ENCRYPTION_MODE_AES_256_XTS: - ctx->tfm = crypto_ablkcipher_tfm( - crypto_alloc_ablkcipher("xts(aes)", 0, 0)); - break; - case EXT4_ENCRYPTION_MODE_AES_256_GCM: - /* TODO(mhalcrow): AEAD w/ gcm(aes); - * crypto_aead_setauthsize() */ - ctx->tfm = ERR_PTR(-ENOTSUPP); - break; - default: - BUG(); - } - if (IS_ERR_OR_NULL(ctx->tfm)) { - res = PTR_ERR(ctx->tfm); - ctx->tfm = NULL; - goto out; - } - ctx->mode = ci->ci_data_mode; - } - BUG_ON(ci->ci_size != ext4_encryption_key_size(ci->ci_data_mode)); - out: if (res) { if (!IS_ERR_OR_NULL(ctx)) @@ -185,11 +153,8 @@ void ext4_exit_crypto(void) { struct ext4_crypto_ctx *pos, *n; - list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) { - if (pos->tfm) - crypto_free_tfm(pos->tfm); + list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) kmem_cache_free(ext4_crypto_ctx_cachep, pos); - } INIT_LIST_HEAD(&ext4_free_crypto_ctxs); if (ext4_bounce_page_pool) mempool_destroy(ext4_bounce_page_pool); @@ -303,32 +268,11 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist dst, src; - struct ext4_inode_info *ei = EXT4_I(inode); - struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm); + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; - BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_crypt_info->ci_data_mode); - - if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { - printk_ratelimited(KERN_ERR - "%s: unsupported crypto algorithm: %d\n", - __func__, ctx->mode); - return -ENOTSUPP; - } - - crypto_ablkcipher_clear_flags(atfm, ~0); - crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); - - res = crypto_ablkcipher_setkey(atfm, ei->i_crypt_info->ci_raw, - ei->i_crypt_info->ci_size); - if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_setkey() failed\n", - __func__); - return res; - } - req = ablkcipher_request_alloc(atfm, GFP_NOFS); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", diff --git 
a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index e63dd294d7aad5..29a2dc9a6f824f 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -252,52 +252,6 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -int ext4_setup_fname_crypto(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *ci = ei->i_crypt_info; - struct crypto_ablkcipher *ctfm; - int res; - - /* Check if the crypto policy is set on the inode */ - res = ext4_encrypted_inode(inode); - if (res == 0) - return 0; - - res = ext4_get_encryption_info(inode); - if (res < 0) - return res; - ci = ei->i_crypt_info; - - if (!ci || ci->ci_ctfm) - return 0; - - if (ci->ci_filename_mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { - printk_once(KERN_WARNING "ext4: unsupported key mode %d\n", - ci->ci_filename_mode); - return -ENOKEY; - } - - ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", - __func__, res); - return res; - } - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - - res = crypto_ablkcipher_setkey(ctfm, ci->ci_raw, ci->ci_size); - if (res) { - crypto_free_ablkcipher(ctfm); - return -EIO; - } - ci->ci_ctfm = ctfm; - return 0; -} - /** * ext4_fname_crypto_round_up() - * @@ -449,7 +403,7 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; goto out; } - ret = ext4_setup_fname_crypto(dir); + ret = ext4_get_encryption_info(dir); if (ret) return ret; ci = EXT4_I(dir)->i_crypt_info; diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 858d7d67a4e19c..442d24e8efc0ce 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -84,20 +84,32 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], return res; } -void ext4_free_encryption_info(struct inode *inode) +void ext4_free_crypt_info(struct ext4_crypt_info *ci) { - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *ci = ei->i_crypt_info; - if (!ci) return; if (ci->ci_keyring_key) key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); - memzero_explicit(&ci->ci_raw, sizeof(ci->ci_raw)); kmem_cache_free(ext4_crypt_info_cachep, ci); - ei->i_crypt_info = NULL; +} + +void ext4_free_encryption_info(struct inode *inode, + struct ext4_crypt_info *ci) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(ei->i_crypt_info); + if (ci == NULL) + return; + prev = cmpxchg(&ei->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + ext4_free_crypt_info(ci); } int _ext4_get_encryption_info(struct inode *inode) @@ -111,6 +123,10 @@ int _ext4_get_encryption_info(struct inode *inode) struct ext4_encryption_context ctx; struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct crypto_ablkcipher *ctfm; + const char *cipher_str; + char raw_key[EXT4_MAX_KEY_SIZE]; + char mode; int res; if (!ext4_read_workqueue) { @@ -119,11 +135,14 @@ int _ext4_get_encryption_info(struct inode *inode) return res; } - if (ei->i_crypt_info) { - if (!ei->i_crypt_info->ci_keyring_key || - key_validate(ei->i_crypt_info->ci_keyring_key) == 0) +retry: + crypt_info = ACCESS_ONCE(ei->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) return 0; - 
ext4_free_encryption_info(inode); + ext4_free_encryption_info(inode, crypt_info); + goto retry; } res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, @@ -144,26 +163,37 @@ int _ext4_get_encryption_info(struct inode *inode) if (!crypt_info) return -ENOMEM; - ei->i_crypt_policy_flags = ctx.flags; crypt_info->ci_flags = ctx.flags; crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) - crypt_info->ci_size = - ext4_encryption_key_size(crypt_info->ci_data_mode); + mode = crypt_info->ci_data_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - crypt_info->ci_size = - ext4_encryption_key_size(crypt_info->ci_filename_mode); + mode = crypt_info->ci_filename_mode; else BUG(); - BUG_ON(!crypt_info->ci_size); - if (DUMMY_ENCRYPTION_ENABLED(sbi)) { - memset(crypt_info->ci_raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + switch (mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case EXT4_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "ext4: unsupported key mode %d (ino %u)\n", + mode, (unsigned) inode->i_ino); + res = -ENOKEY; goto out; } + if (DUMMY_ENCRYPTION_ENABLED(sbi)) { + memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + goto got_key; + } memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, EXT4_KEY_DESC_PREFIX_SIZE); sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE, @@ -177,6 +207,7 @@ int _ext4_get_encryption_info(struct inode *inode) keyring_key = NULL; goto out; } + crypt_info->ci_keyring_key = keyring_key; BUG_ON(keyring_key->type != &key_type_logon); ukp = ((struct user_key_payload *)keyring_key->payload.data); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { @@ -188,19 +219,36 @@ int _ext4_get_encryption_info(struct inode *inode) EXT4_KEY_DERIVATION_NONCE_SIZE); BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); res = ext4_derive_key_aes(ctx.nonce, master_key->raw, - crypt_info->ci_raw); -out: - if (res < 0) { - if (res == -ENOKEY) - res = 0; - kmem_cache_free(ext4_crypt_info_cachep, crypt_info); - } else { - ei->i_crypt_info = crypt_info; - crypt_info->ci_keyring_key = keyring_key; - keyring_key = NULL; + raw_key); +got_key: + ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_ablkcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_ablkcipher_setkey(ctfm, raw_key, + ext4_encryption_key_size(mode)); + if (res) + goto out; + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { + ext4_free_crypt_info(crypt_info); + goto retry; } - if (keyring_key) - key_put(keyring_key); + return 0; + +out: + if (res == -ENOKEY) + res = 0; + ext4_free_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); return res; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index fd511907f9c1ae..c32668599031fc 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -135,9 +135,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; } - err = ext4_setup_fname_crypto(inode); - if (err) - return err; if (ext4_encrypted_inode(inode)) { err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, &fname_crypto_str); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 928025fe8a039c..b38dc235dfb3c1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -920,7 +920,6 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; - char i_crypt_policy_flags; /* Indicate the inline data space. */ u16 i_inline_off; @@ -2103,7 +2102,6 @@ int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname); #ifdef CONFIG_EXT4_FS_ENCRYPTION -int ext4_setup_fname_crypto(struct inode *inode); void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname); @@ -2129,7 +2127,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } /* crypto_key.c */ -void ext4_free_encryption_info(struct inode *inode); +void ext4_free_crypt_info(struct ext4_crypt_info *ci); +void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); int _ext4_get_encryption_info(struct inode *inode); #ifdef CONFIG_EXT4_FS_ENCRYPTION diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index c5258f24221549..34e0d245588102 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -74,13 +74,11 @@ struct ext4_encryption_key { } __attribute__((__packed__)); struct ext4_crypt_info { - unsigned char ci_size; char ci_data_mode; char ci_filename_mode; char ci_flags; struct crypto_ablkcipher *ci_ctfm; struct key *ci_keyring_key; - char ci_raw[EXT4_MAX_KEY_SIZE]; char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; @@ -89,7 +87,6 @@ struct ext4_crypt_info { #define EXT4_WRITE_PATH_FL 0x00000004 struct ext4_crypto_ctx { - struct crypto_tfm *tfm; /* Crypto API context */ union { struct { struct page *bounce_page; /* Ciphertext page */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9d35bb36aed7b3..031e587e5f86c0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -608,11 +608,12 @@ static struct stats dx_show_leaf(struct inode *dir, char *name; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - int res; + int res = 0; name = de->name; len = de->name_len; - res = ext4_setup_fname_crypto(dir); + if (ext4_encrypted_inode(inode)) + res = ext4_get_encryption_info(dir); if (res) { printk(KERN_WARNING "Error setting up" " fname crypto: %d\n", res); @@ -954,12 +955,12 @@ static int 
htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Check if the directory is encrypted */ - err = ext4_setup_fname_crypto(dir); - if (err) { - brelse(bh); - return err; - } if (ext4_encrypted_inode(dir)) { + err = ext4_get_encryption_info(dir); + if (err < 0) { + brelse(bh); + return err; + } err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { @@ -3111,7 +3112,7 @@ static int ext4_symlink(struct inode *dir, err = ext4_inherit_context(dir, inode); if (err) goto err_drop_inode; - err = ext4_setup_fname_crypto(inode); + err = ext4_get_encryption_info(inode); if (err) goto err_drop_inode; istr.name = (const unsigned char *) symname; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f9d34dee0704a2..c7fdcc40fef321 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -985,7 +985,7 @@ void ext4_clear_inode(struct inode *inode) } #ifdef CONFIG_EXT4_FS_ENCRYPTION if (EXT4_I(inode)->i_crypt_info) - ext4_free_encryption_info(inode); + ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info); #endif } diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index fd00a3654f0496..4f68e83cd25d19 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -38,7 +38,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) if (!ext4_encrypted_inode(inode)) return page_follow_link_light(dentry, nd); - res = ext4_setup_fname_crypto(inode); + res = ext4_get_encryption_info(inode); if (res) return ERR_PTR(res); From 266910c486f0a90c53a99c223df71da9099a0631 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:38 -0400 Subject: [PATCH 346/420] ext4 crypto: fix memory leaks in ext4_encrypted_zeroout ext4_encrypted_zeroout() could end up leaking a bio and bounce page. Fortunately it's not used much. While we're fixing things up, refactor out common code into the static function alloc_bounce_page() and fix up error handling if mempool_alloc() fails. Change-Id: I606ad4b775146eb0f29660809df421e9b0ab0039 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto.c | 62 ++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index c3a9b08309db19..1c9a8c499369f4 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -314,6 +314,26 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, return 0; } +static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) +{ + struct page *ciphertext_page = alloc_page(GFP_NOFS); + + if (!ciphertext_page) { + /* This is a potential bottleneck, but at least we'll have + * forward progress. */ + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS); + if (ciphertext_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags |= EXT4_WRITE_PATH_FL; + ctx->w.bounce_page = ciphertext_page; + return ciphertext_page; +} + /** * ext4_encrypt() - Encrypts a page * @inode: The inode for which the encryption should take place @@ -343,28 +363,17 @@ struct page *ext4_encrypt(struct inode *inode, return (struct page *) ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_page(GFP_NOFS); - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. 
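The bio leak itself is closed by dropping the reference unconditionally after the synchronous write; the fixed pattern, pulled out of the ext4_encrypted_zeroout() hunk below for clarity (no new code):

	err = submit_bio_wait(WRITE, bio);
	bio_put(bio);		/* release the bio even when the write succeeded */
	if (err)
		goto errout;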
*/ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (WARN_ON_ONCE(!ciphertext_page)) { - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS | __GFP_WAIT); - } - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->flags |= EXT4_WRITE_PATH_FL; - ctx->w.bounce_page = ciphertext_page; + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) + goto errout; ctx->w.control_page = plaintext_page; err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, plaintext_page, ciphertext_page); if (err) { + ciphertext_page = ERR_PTR(err); + errout: ext4_release_crypto_ctx(ctx); - return ERR_PTR(err); + return ciphertext_page; } SetPagePrivate(ciphertext_page); set_page_private(ciphertext_page, (unsigned long)ctx); @@ -424,21 +433,11 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) if (IS_ERR(ctx)) return PTR_ERR(ctx); - ciphertext_page = alloc_page(GFP_NOFS); - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. */ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (WARN_ON_ONCE(!ciphertext_page)) { - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS | __GFP_WAIT); - } - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; } - ctx->w.bounce_page = ciphertext_page; while (len--) { err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, @@ -460,6 +459,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) goto errout; } err = submit_bio_wait(WRITE, bio); + bio_put(bio); if (err) goto errout; } From e9690a4e87d66457ebd51d24f009bdbd871594ea Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:38 -0400 Subject: [PATCH 347/420] ext4 crypto: set up encryption info for new inodes in ext4_inherit_context() Set up the encryption information for newly created inodes immediately after they inherit their encryption context from their parent directories. 
Change-Id: Iebf4a9de77afca40db946bbb2cb559f6cb3f4c41 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_policy.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 683391f790d61b..81980a158dc71e 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -206,6 +206,7 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) if (!res) { ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); + res = ext4_get_encryption_info(child); } return res; } From 87b116c376736cfa6c07ca6d6a6fb9a28229b0df Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:38 -0400 Subject: [PATCH 348/420] ext4 crypto: make sure the encryption info is initialized on opendir(2) Change-Id: I6a5624537daf5940ad48775612096cdd2a78f8c4 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index c32668599031fc..2d3e900aa894ef 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -595,6 +595,13 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) return 0; } +static int ext4_dir_open(struct inode * inode, struct file * filp) +{ + if (ext4_encrypted_inode(inode)) + return ext4_get_encryption_info(inode) ? -EACCES : 0; + return 0; +} + static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) @@ -637,5 +644,6 @@ const struct file_operations ext4_dir_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, + .open = ext4_dir_open, .release = ext4_release_dir, }; From 6ace6844843dfe67021d84a9247a1224450b975c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:38 -0400 Subject: [PATCH 349/420] ext4 crypto: encrypt tmpfile located in encryption protected directory Factor out calls to ext4_inherit_context() and move them to __ext4_new_inode(); this fixes a problem where ext4_tmpfile() wasn't calling calling ext4_inherit_context(), so the temporary file wasn't getting protected. Since the blocks for the tmpfile could end up on disk, they really should be protected if the tmpfile is created within the context of an encrypted directory. 
Change-Id: I367815239a5301e2fe703de62f3fdb29ee0e2fbc Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 9 +++++++++ fs/ext4/ialloc.c | 26 ++++++++++++++++++++------ fs/ext4/namei.c | 29 +---------------------------- 3 files changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b38dc235dfb3c1..20f40ccea43f10 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2147,6 +2147,11 @@ static inline int ext4_get_encryption_info(struct inode *inode) return 0; } +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return EXT4_I(inode)->i_crypt_info; +} + #else static inline int ext4_has_encryption_key(struct inode *inode) { @@ -2156,6 +2161,10 @@ static inline int ext4_get_encryption_info(struct inode *inode) { return 0; } +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return NULL; +} #endif diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9754c0f5a5ff6a..25fb2a52ad2eff 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -727,11 +727,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp; + int encrypt = 0; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); + if ((ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + err = ext4_get_encryption_info(dir); + if (err) + return ERR_PTR(err); + if (ext4_encryption_info(dir) == NULL) + return ERR_PTR(-EPERM); + if (!handle) + nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); + encrypt = 1; + } + sb = dir->i_sb; ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -997,12 +1011,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ei->i_block_group = group; ei->i_last_alloc_group = ~0; - /* If the directory encrypted, then we should encrypt the inode. 
*/ - if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) && - (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(sbi))) - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); @@ -1064,6 +1072,12 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ei->i_datasync_tid = handle->h_transaction->t_tid; } + if (encrypt) { + err = ext4_inherit_context(dir, inode); + if (err) + goto fail_free_drop; + } + err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 031e587e5f86c0..39fc5cd47b77e8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2439,20 +2439,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = 0; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (!err && (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) { - err = ext4_inherit_context(dir, inode); - if (err) { - clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); - } - } -#endif - if (!err) - err = ext4_add_nondir(handle, dentry, inode); + err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) ext4_handle_sync(handle); } @@ -2633,14 +2620,6 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) { - err = ext4_inherit_context(dir, inode); - if (err) - goto out_clear_inode; - } -#endif err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -3109,12 +3088,6 @@ static int ext4_symlink(struct inode *dir, err = -ENOMEM; goto err_drop_inode; } - err = ext4_inherit_context(dir, inode); - if (err) - goto err_drop_inode; - err = ext4_get_encryption_info(inode); - if (err) - goto err_drop_inode; istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; From 9aa70fa937082deb105af0f3363f258cac1cf868 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:39 -0400 Subject: [PATCH 350/420] ext4 crypto: enforce crypto policy restrictions on cross-renames Thanks to Chao Yu for pointing out the need for this check. Change-Id: Ic510e8fb267fb852700f4594b57bbb66d0966815 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 39fc5cd47b77e8..3f5028dec110cb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3644,6 +3644,15 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, u8 new_file_type; int retval; + if ((ext4_encrypted_inode(old_dir) || + ext4_encrypted_inode(new_dir)) && + (old_dir != new_dir) && + (!ext4_is_child_context_consistent_with_parent(new_dir, + old.inode) || + !ext4_is_child_context_consistent_with_parent(old_dir, + new.inode))) + return -EPERM; + dquot_initialize(old.dir); dquot_initialize(new.dir); From 904be63423edc14f33cf339677613dd432a53921 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:39 -0400 Subject: [PATCH 351/420] ext4 crypto: policies may only be set on directories Thanks to Chao Yu for pointing out we were missing this check. 
Change-Id: Ic947578c72d457ae026124e328c981db0bc8c949 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_policy.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 81980a158dc71e..a1d434d0dea8f8 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -93,6 +93,8 @@ int ext4_process_policy(const struct ext4_encryption_policy *policy, return -EINVAL; if (!ext4_inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + return -EINVAL; if (!ext4_empty_dir(inode)) return -ENOTEMPTY; return ext4_create_encryption_context_from_policy(inode, From e15a015d77245ac4b25b42ee93d0d007c474fe79 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:39 -0400 Subject: [PATCH 352/420] ext4 crypto: clean up error handling in ext4_fname_setup_filename Fix a potential memory leak where fname->crypto_buf.name wouldn't get freed in some error paths, and also make the error handling easier to understand/audit. Change-Id: I06788ff4a742aacb72eda1bc36371fc23489d55c Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_fname.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 29a2dc9a6f824f..23af41f73e90a6 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -401,7 +401,7 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, ((iname->name[1] == '.') && (iname->len == 2))))) { fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; - goto out; + return 0; } ret = ext4_get_encryption_info(dir); if (ret) @@ -411,19 +411,16 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, &fname->crypto_buf); if (ret < 0) - goto out; + return ret; ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); if (ret < 0) - goto out; + goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; - ret = 0; - goto out; - } - if (!lookup) { - ret = -EACCES; - goto out; + return 0; } + if (!lookup) + return -EACCES; /* We don't have the key and we are doing a lookup; decode the * user-supplied name @@ -431,19 +428,17 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, if (iname->name[0] == '_') bigname = 1; if ((bigname && (iname->len != 33)) || - (!bigname && (iname->len > 43))) { - ret = -ENOENT; - } + (!bigname && (iname->len > 43))) + return -ENOENT; + fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); - if (fname->crypto_buf.name == NULL) { - ret = -ENOMEM; - goto out; - } + if (fname->crypto_buf.name == NULL) + return -ENOMEM; ret = digest_decode(iname->name + bigname, iname->len - bigname, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; - goto out; + goto errout; } fname->crypto_buf.len = ret; if (bigname) { @@ -453,8 +448,10 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; } - ret = 0; -out: + return 0; +errout: + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; return ret; } From 33891be4482ceabd6b65cc026b8117e87324310c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:40 -0400 Subject: [PATCH 353/420] ext4 crypto: allocate the right amount of memory for the on-disk symlink Previously we 
were taking the required padding when allocating space for the on-disk symlink. This caused a buffer overrun which could trigger a krenel crash when running fsstress. Change-Id: Ib806649e5c166f6919ea52accd93fb0271356de5 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto_fname.c | 25 +++++++++++++++---------- fs/ext4/ext4.h | 1 + fs/ext4/namei.c | 32 +++++++++++++++++++++----------- 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 23af41f73e90a6..7dc4eb55913c44 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -262,8 +262,20 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) return ((size+blksize-1)/blksize)*blksize; } -/** - * ext4_fname_crypto_alloc_obuff() - +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen) +{ + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + int padding = 32; + + if (ci) + padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); + if (ilen < EXT4_CRYPTO_BLOCK_SIZE) + ilen = EXT4_CRYPTO_BLOCK_SIZE; + return ext4_fname_crypto_round_up(ilen, padding); +} + +/* + * ext4_fname_crypto_alloc_buffer() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. @@ -271,15 +283,8 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str) { - unsigned int olen; - int padding = 16; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + unsigned int olen = ext4_fname_encrypted_size(inode, ilen); - if (ci) - padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); - if (padding < EXT4_CRYPTO_BLOCK_SIZE) - padding = EXT4_CRYPTO_BLOCK_SIZE; - olen = ext4_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 20f40ccea43f10..3f33f98a08e017 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2088,6 +2088,7 @@ static inline int ext4_sb_has_crypto(struct super_block *sb) /* crypto_fname.c */ bool ext4_valid_filenames_enc_mode(uint32_t mode); u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen); int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str); int _ext4_fname_disk_to_usr(struct inode *inode, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3f5028dec110cb..3c335850cfb4ec 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3042,10 +3042,23 @@ static int ext4_symlink(struct inode *dir, encryption_required = (ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); - if (encryption_required) - disk_link.len = encrypted_symlink_data_len(len) + 1; - if (disk_link.len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + if (encryption_required) { + err = ext4_get_encryption_info(dir); + if (err) + return err; + if (ext4_encryption_info(dir) == NULL) + return -EPERM; + disk_link.len = (ext4_fname_encrypted_size(dir, len) + + sizeof(struct ext4_encrypted_symlink_data)); + sd = kzalloc(disk_link.len, GFP_KERNEL); + if (!sd) + return -ENOMEM; + } + + if (disk_link.len > dir->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto err_free_sd; + } dquot_initialize(dir); @@ -3076,18 +3089,14 @@ static int ext4_symlink(struct inode *dir, if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); - return 
PTR_ERR(inode); + err = PTR_ERR(inode); + goto err_free_sd; } if (encryption_required) { struct qstr istr; struct ext4_str ostr; - sd = kzalloc(disk_link.len, GFP_NOFS); - if (!sd) { - err = -ENOMEM; - goto err_drop_inode; - } istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; @@ -3159,10 +3168,11 @@ static int ext4_symlink(struct inode *dir, err_drop_inode: if (handle) ext4_journal_stop(handle); - kfree(sd); clear_nlink(inode); unlock_new_inode(inode); iput(inode); +err_free_sd: + kfree(sd); return err; } From e2c775826047b16250f260c4702c90c8eab863f9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:40 -0400 Subject: [PATCH 354/420] ext4 crypto: handle unexpected lack of encryption keys Fix up attempts by users to try to write to a file when they don't have access to the encryption key. Change-Id: I598b25bbfea3bc213130749cd39ffe32ba5fe512 Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto.c | 3 ++- fs/ext4/crypto_policy.c | 3 ++- fs/ext4/file.c | 17 ++++++++++------- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1c9a8c499369f4..efcb7c04d172ba 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -104,7 +104,8 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) unsigned long flags; struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - BUG_ON(ci == NULL); + if (ci == NULL) + return ERR_PTR(-ENOKEY); /* * We first try getting the ctx from a free list because in diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a1d434d0dea8f8..02c4e5df7afb92 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -183,7 +183,8 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) if (res < 0) return res; ci = EXT4_I(parent)->i_crypt_info; - BUG_ON(ci == NULL); + if (ci == NULL) + return -ENOKEY; ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3ee44b753175f6..ac5bee94810c99 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -206,6 +206,8 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) int err = ext4_get_encryption_info(inode); if (err) return 0; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; } file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; @@ -253,6 +255,13 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_journal_stop(handle); } } + if (ext4_encrypted_inode(inode)) { + ret = ext4_get_encryption_info(inode); + if (ret) + return -EACCES; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; + } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -262,13 +271,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } - ret = dquot_file_open(inode, filp); - if (!ret && ext4_encrypted_inode(inode)) { - ret = ext4_get_encryption_info(inode); - if (ret) - ret = -EACCES; - } - return ret; + return dquot_file_open(inode, filp); } /* From 59efbf8e82f354abac9cbaad9f35f39fe9680579 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Sep 2015 16:09:40 -0400 Subject: [PATCH 355/420] ext4 crypto: release crypto resource on module exit Crypto resource should be released when ext4 module exits, otherwise it will cause memory leak. 
Change-Id: If753b3da3ecb7f974a3c7cea451ebc67e5263e20 Signed-off-by: Chao Yu Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7fdcc40fef321..a22a7d2d3c7d87 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5645,6 +5645,7 @@ static int __init ext4_init_fs(void) static void __exit ext4_exit_fs(void) { + ext4_exit_crypto(); ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); From 3f53a4a4aae6bd9318435f037dbe5533ff1857fb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:09:40 -0400 Subject: [PATCH 356/420] ext4 crypto: allocate bounce pages using GFP_NOWAIT Previously we allocated bounce pages using a combination of alloc_page() and mempool_alloc() with the __GFP_WAIT bit set. Instead, use mempool_alloc() with GFP_NOWAIT. The mempool_alloc() function will try using alloc_pages() initially, and then only use the mempool reserve of pages if alloc_pages() is unable to fulfill the request. This minimizes the the impact on the mm layer when we need to do a large amount of writeback of encrypted files, as Jaeguk Kim had reported that under a heavy fio workload on a system with restricted amounts memory (which unfortunately, includes many mobile handsets), he had observed the the OOM killer getting triggered several times. Using GFP_NOWAIT If the mempool_alloc() function fails, we will retry the page writeback at a later time; the function of the mempool is to ensure that we can writeback at least 32 pages at a time, so we can more efficiently dispatch I/O under high memory pressure situations. In the future we should make this be a tunable so we can determine the best tradeoff between permanently sequestering memory and the ability to quickly launder pages so we can free up memory quickly when necessary. Change-Id: Ib294e81724c73fb5e5a6e704137749437fb6c22a Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" --- fs/ext4/crypto.c | 27 ++++++--------------------- fs/ext4/ext4_crypto.h | 3 +-- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index efcb7c04d172ba..f5c82e8b7b5ce2 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -71,12 +71,8 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) { unsigned long flags; - if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) { - if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) - __free_page(ctx->w.bounce_page); - else - mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); - } + if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) + mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); ctx->w.bounce_page = NULL; ctx->w.control_page = NULL; if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { @@ -317,22 +313,11 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) { - struct page *ciphertext_page = alloc_page(GFP_NOFS); - - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. 
*/ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (ciphertext_page == NULL) - return ERR_PTR(-ENOMEM); - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } + ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); ctx->flags |= EXT4_WRITE_PATH_FL; - ctx->w.bounce_page = ciphertext_page; - return ciphertext_page; + return ctx->w.bounce_page; } /** diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 34e0d245588102..ac7d4e81379630 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -83,8 +83,7 @@ struct ext4_crypt_info { }; #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002 -#define EXT4_WRITE_PATH_FL 0x00000004 +#define EXT4_WRITE_PATH_FL 0x00000002 struct ext4_crypto_ctx { union { From 0149a6c01df42fea2cace9f1f4a802819cfa9136 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 8 Jun 2015 11:54:56 -0400 Subject: [PATCH 357/420] ext4 crypto: fix ext4_get_crypto_ctx()'s calling convention in ext4_decrypt_one Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" Change-Id: I884bcc24af9bc43438f0794a3fb00683837d393f --- fs/ext4/crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index f5c82e8b7b5ce2..45731558138c8e 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -396,8 +396,8 @@ int ext4_decrypt_one(struct inode *inode, struct page *page) struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); - if (!ctx) - return -ENOMEM; + if (IS_ERR(ctx)) + return PTR_ERR(ctx); ret = ext4_decrypt(ctx, page); ext4_release_crypto_ctx(ctx); return ret; From 0ee332a89305499e14895256fa828be8fa2fc586 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 8 Jun 2015 12:23:21 -0400 Subject: [PATCH 358/420] ext4 crypto: fix sparse warnings in fs/ext4/ioctl.c [ Added another sparse fix for EXT4_IOC_GET_ENCRYPTION_POLICY while we're at it. --tytso ] Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" Change-Id: I56b8943d83eb2a82e0b5c1544df3e70efe8e9525 --- fs/ext4/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 21d077c028fecc..55ae4b8459760b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -682,8 +682,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; } - if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt, - 16)) + if (copy_to_user((void __user *) arg, + sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; } @@ -697,7 +697,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_get_policy(inode, &policy); if (err) return err; - if (copy_to_user((void *)arg, &policy, sizeof(policy))) + if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) return -EFAULT; return 0; #else From 210115198e5044c6fdeafe5644186f71c098b7c4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 12 Jun 2015 23:44:33 -0400 Subject: [PATCH 359/420] ext4 crypto: fail the mount if blocksize != pagesize We currently don't correctly handle the case where blocksize != pagesize, so disallow the mount in those cases. 
Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" Change-Id: Iee838b937f6ef3cc52727e7cda33528066801eec --- fs/ext4/super.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a22a7d2d3c7d87..8131de2c0ac0a0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4049,7 +4049,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) && + if ((DUMMY_ENCRYPTION_ENABLED(sbi) || + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) && + (blocksize != PAGE_CACHE_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Unsupported blocksize for fs encryption"); + goto failed_mount_wq; + } + + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) && !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); From 22f0f379fedae625f0b0c117df4ff32c5fdefb57 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Jul 2015 11:16:47 -0400 Subject: [PATCH 360/420] ext4 crypto: use a jbd2 transaction when adding a crypto policy Start a jbd2 transaction, and mark the inode dirty on the inode under that transaction after setting the encrypt flag. Otherwise if the directory isn't modified after setting the crypto policy, the encrypted flag might not survive the inode getting pushed out from memory, or the the file system getting unmounted and remounted. Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" Change-Id: I1f3092b0b34424c74a87c32b117277941ac3ae9f --- fs/ext4/crypto_policy.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 02c4e5df7afb92..a640ec2c4b134a 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -12,6 +12,7 @@ #include #include +#include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -49,7 +50,8 @@ static int ext4_create_encryption_context_from_policy( struct inode *inode, const struct ext4_encryption_policy *policy) { struct ext4_encryption_context ctx; - int res = 0; + handle_t *handle; + int res, res2; res = ext4_convert_inline_data(inode); if (res) @@ -78,11 +80,22 @@ static int ext4_create_encryption_context_from_policy( BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + handle = ext4_journal_start(inode, EXT4_HT_MISC, + ext4_jbd2_credits_xattr(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); - if (!res) + if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + if (!res) + res = res2; return res; } From 2d219c80cbe2b9cc7337e6bc65ee989fdc711e9d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Jul 2015 11:33:16 -0400 Subject: [PATCH 361/420] ext4 crypto: check for too-short encrypted file names An encrypted file name should never be shorter than an 16 bytes, the AES block size. The 3.10 crypto layer will oops and crash the kernel if ciphertext shorter than the block size is passed to it. 
Fortunately, in modern kernels the crypto layer will not crash the kernel in this scenario, but nevertheless, it represents a corrupted directory, and we should detect it and mark the file system as corrupted so that e2fsck can fix this. Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" Change-Id: I09616a5b79c1adfbd273dbcf93a8aeda9aaf053a --- fs/ext4/crypto_fname.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 7dc4eb55913c44..86ee996a2bd4b2 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -329,6 +329,10 @@ int _ext4_fname_disk_to_usr(struct inode *inode, return oname->len; } } + if (iname->len < EXT4_CRYPTO_BLOCK_SIZE) { + EXT4_ERROR_INODE(inode, "encrypted inode too small"); + return -EUCLEAN; + } if (EXT4_I(inode)->i_crypt_info) return ext4_fname_decrypt(inode, iname, oname); From 276f45182557dfe3292e64a1f9c170ba402c3926 Mon Sep 17 00:00:00 2001 From: Laurent Navet Date: Wed, 22 Jul 2015 00:08:08 -0400 Subject: [PATCH 362/420] ext4 crypto: exit cleanly if ext4_derive_key_aes() fails Return value of ext4_derive_key_aes() is stored but not used. Add test to exit cleanly if ext4_derive_key_aes() fail. Also fix coverity CID 1309760. Signed-off-by: Laurent Navet Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" Change-Id: I741ae219615188d9ddd395f8e71baae05f64fef4 --- fs/ext4/crypto_key.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 442d24e8efc0ce..ce75bc8b9aefef 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -220,6 +220,8 @@ int _ext4_get_encryption_info(struct inode *inode) BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); + if (res) + goto out; got_key: ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { From 6923a24bda76f2f68edc85d422ee0b9645bb5eaf Mon Sep 17 00:00:00 2001 From: Laurent Navet Date: Wed, 22 Jul 2015 00:09:45 -0400 Subject: [PATCH 363/420] ext4 crypto: fix spelling typo in comment Signed-off-by: Laurent Navet Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" Change-Id: Idb99b6a48d03fc01f50e29babf0a677825a9248e --- fs/ext4/crypto_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index ce75bc8b9aefef..1d510c11b100cf 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -30,7 +30,7 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) /** * ext4_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivatio. + * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. * @derived_key: Derived key. 
* From 88ae994c7d9542c73b9248c777f139c16a5abc37 Mon Sep 17 00:00:00 2001 From: "zilong.liu" Date: Tue, 28 Jul 2015 15:12:18 -0400 Subject: [PATCH 364/420] ext4 crypto: remove duplicate header file Remove key.h which is included twice in crypto_fname.c Signed-off-by: zilong.liu Signed-off-by: Theodore Ts'o Signed-off-by: "Theodore Ts'o" Change-Id: Icd5080785ab46ee35df8148f6555e8323f1574da --- fs/ext4/crypto_fname.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 86ee996a2bd4b2..847f919c84d9cc 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include From c70f019a9a509e4e4a440eddaf179f5b010825fd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Sep 2015 16:28:06 -0400 Subject: [PATCH 365/420] ext4: disable zeroout optimization when encryption is enabled When we are manipulate a sparse file's extent tree, there is an optimization where we will explicitly zero out some data blocks instead of making the extent tree overly bushy. Unfortunately, this can sometimes lead to the file system's extent tree getting corrupted. Disabling the optimization works around this bug, but we don't have a completely understood root cause yet. So this should be understood as only a (hopefully!) temporary workaround. Signed-off-by: "Theodore Ts'o" Signed-off-by: "Theodore Ts'o" Change-Id: Ie237434603ad1e36351b3e42954ffecbd5c0f035 --- fs/ext4/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e45b650d70753d..ae083e6d460004 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3542,7 +3542,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - if (EXT4_EXT_MAY_ZEROOUT & split_flag) + if ((EXT4_EXT_MAY_ZEROOUT & split_flag) && + !ext4_encrypted_inode(inode)) max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); From b6d0abf94f767c1a22823bbe7542b2d3d9ae4c7d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 2 Oct 2015 23:54:58 -0400 Subject: [PATCH 366/420] ext4 crypto: fix memory leak in ext4_bio_write_page() There are times when ext4_bio_write_page() is called even though we don't actually need to do any I/O. This happens when ext4_writepage() gets called by the jbd2 commit path when an inode needs to force its pages written out in order to provide data=ordered guarantees --- and a page is backed by an unwritten (e.g., uninitialized) block on disk, or if delayed allocation means the page's backing store hasn't been allocated yet. In that case, we need to skip the call to ext4_encrypt_page(), since in addition to wasting CPU, it leads to a bounce page and an ext4 crypto context getting leaked. 
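A hedged sketch of the shape of the fix below: count the buffers that actually need I/O first, and only pay for the encryption context and bounce page when that count is non-zero. ext4_encrypted_inode(), ext4_encrypt() and page_buffers() are the real helpers, but the control flow is a heavily simplified paraphrase of ext4_bio_write_page(), not a literal copy.

#include <linux/buffer_head.h>
#include <linux/err.h>
#include "ext4.h"

static int example_write_page(struct inode *inode, struct page *page)
{
        struct buffer_head *bh, *head;
        struct page *data_page = page;
        int nr_to_submit = 0;

        bh = head = page_buffers(page);
        do {
                /* Delayed-allocation and unwritten buffers need no I/O. */
                if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
                    buffer_delay(bh) || buffer_unwritten(bh))
                        continue;
                nr_to_submit++;
        } while ((bh = bh->b_this_page) != head);

        /* Nothing will be submitted: skip encryption entirely, so no
         * crypto context or bounce page can be leaked. */
        if (nr_to_submit == 0)
                return 0;

        if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
                data_page = ext4_encrypt(inode, page);
                if (IS_ERR(data_page))
                        return PTR_ERR(data_page);
        }

        /* ... build and submit bios against data_page as usual ... */
        return 0;
}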
Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Theodore Ts'o Change-Id: I1951c737742420b10c80eb616abb92af467d80d9 --- fs/ext4/page-io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 8890eec9c4bdca..8521b9b925765b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -429,6 +429,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct buffer_head *bh, *head; int ret = 0; int nr_submitted = 0; + int nr_to_submit = 0; blocksize = 1 << inode->i_blkbits; @@ -481,11 +482,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); } set_buffer_async_write(bh); + nr_to_submit++; } while ((bh = bh->b_this_page) != head); bh = head = page_buffers(page); - if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && + nr_to_submit) { data_page = ext4_encrypt(inode, page); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); From dbcded0499c27cc645ed4cd3927a71382e112beb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Oct 2015 10:49:23 -0400 Subject: [PATCH 367/420] ext4: optimize ext4_writepage() for attempted 4k delalloc writes In cases where the file system block size is the same as the page size, and ext4_writepage() is asked to write out a page which is either has the unwritten bit set in the extent tree, or which does not yet have a block assigned due to delayed allocation, we can bail out early and, unlocking the page earlier and avoiding a round trip through ext4_bio_write_page() with the attendant calls to set_page_writeback() and redirty_page_for_writeback(). Signed-off-by: Theodore Ts'o Signed-off-by: Theodore Ts'o Change-Id: Icf8cb0bc37df0f5bdfbd0464dd7acf95fa6fe337 --- fs/ext4/inode.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 93250f40c77c64..4fd2a2af4d96ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1809,11 +1809,22 @@ static int ext4_writepage(struct page *page, * the page. But we may reach here when we do a journal commit via * journal_submit_inode_data_buffers() and in that case we must write * allocated buffers to achieve data=ordered mode guarantees. + * + * Also, if there is only one buffer per page (the fs block + * size == the page size), if one buffer needs block + * allocation or needs to modify the extent tree to clear the + * unwritten flag, we know that the page can't be written at + * all, so we might as well refuse the write immediately. + * Unfortunately if the block size != page size, we can't as + * easily detect this case using ext4_walk_page_buffers(), but + * for the extremely common case, this is an optimization that + * skips a useless round trip through ext4_bio_write_page(). */ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); - if (current->flags & PF_MEMALLOC) { + if ((current->flags & PF_MEMALLOC) || + (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) { /* * For memory cleaning there's no point in writing only * some buffers. So just bail out. 
Warn if we came here From 2d5cb296651535ce3a009e64145a1f315d86b28b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Oct 2015 10:49:26 -0400 Subject: [PATCH 368/420] ext4 crypto: ext4_page_crypto() doesn't need a encryption context Since ext4_page_crypto() doesn't need an encryption context (at least not any more), this allows us to simplify a number function signature and also allows us to avoid needing to allocate a context in ext4_block_write_begin(). It also means we no longer need a separate ext4_decrypt_one() function. Signed-off-by: Theodore Ts'o Signed-off-by: Theodore Ts'o Change-Id: Ibf9cad24bac3011cd59d66f1070859719c864dc0 --- fs/ext4/crypto.c | 28 +++++----------------------- fs/ext4/ext4.h | 3 +-- fs/ext4/inode.c | 4 ++-- fs/ext4/readpage.c | 2 +- 4 files changed, 9 insertions(+), 28 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 45731558138c8e..3a5a7a2597de51 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -253,8 +253,7 @@ typedef enum { EXT4_ENCRYPT, } ext4_direction_t; -static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, - struct inode *inode, +static int ext4_page_crypto(struct inode *inode, ext4_direction_t rw, pgoff_t index, struct page *src_page, @@ -353,7 +352,7 @@ struct page *ext4_encrypt(struct inode *inode, if (IS_ERR(ciphertext_page)) goto errout; ctx->w.control_page = plaintext_page; - err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, + err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index, plaintext_page, ciphertext_page); if (err) { ciphertext_page = ERR_PTR(err); @@ -378,31 +377,14 @@ struct page *ext4_encrypt(struct inode *inode, * * Return: Zero on success, non-zero otherwise. */ -int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page) +int ext4_decrypt(struct page *page) { BUG_ON(!PageLocked(page)); - return ext4_page_crypto(ctx, page->mapping->host, + return ext4_page_crypto(page->mapping->host, EXT4_DECRYPT, page->index, page, page); } -/* - * Convenience function which takes care of allocating and - * deallocating the encryption context - */ -int ext4_decrypt_one(struct inode *inode, struct page *page) -{ - int ret; - - struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = ext4_decrypt(ctx, page); - ext4_release_crypto_ctx(ctx); - return ret; -} - int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) { struct ext4_crypto_ctx *ctx; @@ -426,7 +408,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) } while (len--) { - err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, + err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk, ZERO_PAGE(0), ciphertext_page); if (err) goto errout; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3f33f98a08e017..c598ccb5658b4a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2065,8 +2065,7 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, struct page *plaintext_page); -int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page); -int ext4_decrypt_one(struct inode *inode, struct page *page); +int ext4_decrypt(struct page *page); int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); #ifdef CONFIG_EXT4_FS_ENCRYPTION diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4fd2a2af4d96ba..36f4f68d1eabab 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -961,7 +961,7 @@ static int ext4_block_write_begin(struct 
page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = ext4_decrypt_one(inode, page); + err = ext4_decrypt(page); return err; } #endif @@ -3401,7 +3401,7 @@ static int ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!ext4_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_CACHE_SIZE); - WARN_ON_ONCE(ext4_decrypt_one(inode, page)); + WARN_ON_ONCE(ext4_decrypt(page)); } } if (ext4_should_journal_data(inode)) { diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index ec3ef93a52dbbc..377b2355c1d449 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -62,7 +62,7 @@ static void completion_pages(struct work_struct *work) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - int ret = ext4_decrypt(ctx, page); + int ret = ext4_decrypt(page); if (ret) { WARN_ON_ONCE(1); SetPageError(page); From 803af5cc613e9c4e4858f80505a29500bcf22008 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Oct 2015 10:49:27 -0400 Subject: [PATCH 369/420] ext4 crypto: replace some BUG_ON()'s with error checks Buggy (or hostile) userspace should not be able to cause the kernel to crash. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Theodore Ts'o Change-Id: I5f6c811b23cbadefb4e324c4ec0c5838815725dc --- fs/ext4/crypto.c | 1 - fs/ext4/crypto_fname.c | 2 -- fs/ext4/crypto_key.c | 16 +++++++++++++--- fs/ext4/crypto_policy.c | 3 ++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 3a5a7a2597de51..879cb15b7a21fa 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -295,7 +295,6 @@ static int ext4_page_crypto(struct inode *inode, else res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 847f919c84d9cc..2fbef8a14760f4 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -120,7 +120,6 @@ static int ext4_fname_encrypt(struct inode *inode, ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -182,7 +181,6 @@ static int ext4_fname_decrypt(struct inode *inode, ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); res = crypto_ablkcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 1d510c11b100cf..f9270ec2a1325c 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -71,7 +71,6 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], EXT4_AES_256_XTS_KEY_SIZE, NULL); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -208,7 +207,12 @@ int _ext4_get_encryption_info(struct inode *inode) goto out; } crypt_info->ci_keyring_key = keyring_key; - BUG_ON(keyring_key->type != &key_type_logon); + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "ext4: key type must be logon\n"); + res = -ENOKEY; + goto out; + } ukp = ((struct user_key_payload *)keyring_key->payload.data); if 
(ukp->datalen != sizeof(struct ext4_encryption_key)) { res = -EINVAL; @@ -217,7 +221,13 @@ int _ext4_get_encryption_info(struct inode *inode) master_key = (struct ext4_encryption_key *)ukp->data; BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); + if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "ext4: key size incorrect: %d\n", + master_key->size); + res = -ENOKEY; + goto out; + } res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); if (res) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a640ec2c4b134a..ad050698143fde 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -150,7 +150,8 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, if ((parent == NULL) || (child == NULL)) { pr_err("parent %p child %p\n", parent, child); - BUG_ON(1); + WARN_ON(1); /* Should never happen */ + return 0; } /* no restrictions if the parent directory is not encrypted */ if (!ext4_encrypted_inode(parent)) From c9e01ad4cc55a90a25fbf9781d1d6c981f5b46a7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Oct 2015 10:49:29 -0400 Subject: [PATCH 370/420] ext4 crypto: fix bugs in ext4_encrypted_zeroout() Fix multiple bugs in ext4_encrypted_zeroout(), including one that could cause us to write an encrypted zero page to the wrong location on disk, potentially causing data and file system corruption. Fortunately, this tends to only show up in stress tests, but even with these fixes, we are seeing some test failures with generic/127 --- but these are now caused by data failures instead of metadata corruption. Since ext4_encrypted_zeroout() is only used for some optimizations to keep the extent tree from being too fragmented, and ext4_encrypted_zeroout() itself isn't all that optimized from a time or IOPS perspective, disable the extent tree optimization for encrypted inodes for now. This prevents the data corruption issues reported by generic/127 until we can figure out what's going wrong. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Theodore Ts'o Change-Id: If679914474690f0bb02c5930d9f2e8b4cb44a40f --- fs/ext4/crypto.c | 24 ++++++++++++++++++++---- fs/ext4/extents.c | 3 +++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 879cb15b7a21fa..d1ec7147890790 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -392,7 +392,13 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) ext4_lblk_t lblk = ex->ee_block; ext4_fsblk_t pblk = ext4_ext_pblock(ex); unsigned int len = ext4_ext_get_actual_len(ex); - int err = 0; + int ret, err = 0; + +#if 0 + ext4_msg(inode->i_sb, KERN_CRIT, + "ext4_encrypted_zeroout ino %lu lblk %u len %u", + (unsigned long) inode->i_ino, lblk, len); +#endif BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); @@ -418,17 +424,27 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) goto errout; } bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = pblk; - err = bio_add_page(bio, ciphertext_page, + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + ret = bio_add_page(bio, ciphertext_page, inode->i_sb->s_blocksize, 0); - if (err) { + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! 
*/ + ext4_msg(inode->i_sb, KERN_ERR, + "bio_add_page failed: %d", ret); + WARN_ON(1); bio_put(bio); + err = -EIO; goto errout; } err = submit_bio_wait(WRITE, bio); + err = submit_bio_wait(WRITE, bio); + if ((err == 0) && !test_bit(BIO_UPTODATE, &bio->bi_flags)) + err = -EIO; bio_put(bio); if (err) goto errout; + lblk++; pblk++; } err = 0; errout: diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ae083e6d460004..76e9afd8f6804e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3547,6 +3547,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); + if (ext4_encrypted_inode(inode)) + max_zeroout = 0; + /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); From 33de0d0b36817df956620e89cb5aaf9e61ac8bcc Mon Sep 17 00:00:00 2001 From: Mohamad Ayyash Date: Wed, 11 May 2016 13:18:35 -0700 Subject: [PATCH 371/420] Don't show empty tag stats for unprivileged uids BUG: 27577101 BUG: 27532522 Signed-off-by: Mohamad Ayyash --- net/netfilter/xt_qtaguid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index ea2dd21e3adaf1..169bda196a4596 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1945,7 +1945,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) ); f_count = atomic_long_read( &sock_tag_entry->socket->file->f_count); - seq_printf(m, "sock=%p tag=0x%llx (uid=%u) pid=%u " + seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u " "f_count=%lu\n", sock_tag_entry->sk, sock_tag_entry->tag, uid, @@ -2548,8 +2548,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, uid_t stat_uid = get_uid_from_tag(tag); struct proc_print_info *ppi = m->private; /* Detailed tags are not available to everybody */ - if (get_atag_from_tag(tag) && !can_read_other_uid_stats( - make_kuid(&init_user_ns,stat_uid))) { + if (!can_read_other_uid_stats(make_kuid(&init_user_ns,stat_uid))) { CT_DEBUG("qtaguid: stats line: " "%s 0x%llx %u: insufficient priv " "from pid=%u tgid=%u uid=%u stats.gid=%u\n", From 1f6320d0587e312c45c7336f28e59d211bf60b35 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Aug 2016 05:56:26 -0700 Subject: [PATCH 372/420] UPSTREAM: tcp: fix use after free in tcp_xmit_retransmit_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit bb1fceca22492109be12640d49f5ea5a544c6bb4) When tcp_sendmsg() allocates a fresh and empty skb, it puts it at the tail of the write queue using tcp_add_write_queue_tail() Then it attempts to copy user data into this fresh skb. If the copy fails, we undo the work and remove the fresh skb. Unfortunately, this undo lacks the change done to tp->highest_sack and we can leave a dangling pointer (to a freed skb) Later, tcp_xmit_retransmit_queue() can dereference this pointer and access freed memory. For regular kernels where memory is not unmapped, this might cause SACK bugs because tcp_highest_sack_seq() is buggy, returning garbage instead of tp->snd_nxt, but with various debug features like CONFIG_DEBUG_PAGEALLOC, this can crash the kernel. This bug was found by Marco Grassi thanks to syzkaller. 
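A hedged sketch of the undo path the report describes, showing where the fixed helper runs. tcp_add_write_queue_tail(), tcp_unlink_write_queue(), tcp_check_send_head() and sk_wmem_free_skb() are the real functions; the body is a compressed paraphrase of tcp_sendmsg()'s error handling, not a literal copy.

#include <net/tcp.h>

static void example_sendmsg_undo(struct sock *sk, struct sk_buff *skb)
{
        /* A fresh, empty skb was queued at the tail of the write queue;
         * when the queue was empty this can also leave tp->highest_sack
         * referring to it. */
        tcp_add_write_queue_tail(sk, skb);

        /* Copying user data into it failed, so the skb is unlinked and freed. */
        tcp_unlink_write_queue(skb, sk);
        tcp_check_send_head(sk, skb);   /* clears sk_send_head; without the
                                         * hunk below, tp->highest_sack keeps
                                         * pointing at the skb freed next */
        sk_wmem_free_skb(sk, skb);

        /* A later tcp_xmit_retransmit_queue() then dereferences the stale
         * tp->highest_sack pointer -- the use after free fixed here. */
}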
Fixes: 6859d49475d4 ("[TCP]: Abstract tp->highest_sack accessing & point to next skb") Reported-by: Marco Grassi Signed-off-by: Eric Dumazet Cc: Ilpo Järvinen Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Reviewed-by: Cong Wang Signed-off-by: David S. Miller Change-Id: I58bb02d6e4e399612e8580b9e02d11e661df82f5 Bug: 31183296 --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index d431be36263be8..15757f9a7e07af 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1434,6 +1434,8 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli { if (sk->sk_send_head == skb_unlinked) sk->sk_send_head = NULL; + if (tcp_sk(sk)->highest_sack == skb_unlinked) + tcp_sk(sk)->highest_sack = NULL; } static inline void tcp_init_send_head(struct sock *sk) From 87893933b507e6e5cebe5fbe849bbccc31afd393 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 12 Dec 2014 01:56:04 +0200 Subject: [PATCH 373/420] UPSTREAM: x86/uaccess: fix sparse errors virtio wants to read bitwise types from userspace using get_user. At the moment this triggers sparse errors, since the value is passed through an integer. Fix that up using __force. Signed-off-by: Michael S. Tsirkin Acked-by: Thomas Gleixner Change-Id: Ife0f1635cea5afb006dc57e99e6dd616b6ae8c34 (cherry picked from commit e182c570e9953859aee5cb016583217d9e68ea18) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 0d592e0a5b84fa..ace9dec050b17b 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -179,7 +179,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) asm volatile("call __get_user_%P3" \ : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ - (x) = (__typeof__(*(ptr))) __val_gu; \ + (x) = (__force __typeof__(*(ptr))) __val_gu; \ __ret_gu; \ }) From 2ad54df2af25588888594c9a13a462dc3a73b8cb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:49 -0700 Subject: [PATCH 374/420] UPSTREAM: x86/uaccess: Tell the compiler that uaccess is unlikely to fault GCC doesn't realize that get_user(), put_user(), and their __ variants are unlikely to fail. Tell it. I noticed this while playing with the C entry code. Before: text data bss dec filename 21828763 5194760 1277952 28301475 vmlinux.baseline After: text data bss dec filename 21828379 5194760 1277952 28301091 vmlinux.new The generated code shrunk by 384 bytes. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/dc37bed7024319c3004d950d57151fca6aeacf97.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar Change-Id: Ifc6955457dbe95df679931453598e4ca9a98b126 (cherry picked from commit a76cf66e948afbaeda8e3ecc861f29c47a026c27) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ace9dec050b17b..a7997564fbd1a0 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -180,7 +180,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ - __ret_gu; \ + __builtin_expect(__ret_gu, 0); \ }) #define __put_user_x(size, x, ptr, __ret_pu) \ @@ -275,7 +275,7 @@ extern void __put_user_8(void); __put_user_x(X, __pu_val, ptr, __ret_pu); \ break; \ } \ - __ret_pu; \ + __builtin_expect(__ret_pu, 0); \ }) #define __put_user_size(x, ptr, size, retval, errret) \ @@ -398,7 +398,7 @@ do { \ ({ \ int __pu_err; \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ - __pu_err; \ + __builtin_expect(__pu_err, 0); \ }) #define __get_user_nocheck(x, ptr, size) \ @@ -407,7 +407,7 @@ do { \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __gu_err; \ + __builtin_expect(__gu_err, 0); \ }) /* FIXME: this hack is definitely wrong -AK */ From fe281eb0ad9d06ebed7f288b9db255cf4d5812d2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:45:09 -0800 Subject: [PATCH 375/420] UPSTREAM: x86: reorganize SMAP handling in user space accesses This reorganizes how we do the stac/clac instructions in the user access code. Instead of adding the instructions directly to the same inline asm that does the actual user level access and exception handling, add them at a higher level. This is mainly preparation for the next step, where we will expose an interface to allow users to mark several accesses together as being user space accesses, but it does already clean up some code: - the inlined trivial cases of copy_in_user() now do stac/clac just once over the accesses: they used to do one pair around the user space read, and another pair around the write-back. - the {get,put}_user_ex() macros that are used with the catch/try handling don't do any stac/clac at all, because that happens in the try/catch surrounding them. Other than those two cleanups that happened naturally from the re-organization, this should not make any difference. Yet. 
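A hedged sketch of what this reorganization buys callers once the batched interfaces later in this series are in place (user_access_begin()/user_access_end() from PATCH 376 and the label-taking unsafe_get_user() form from PATCH 380): one stac/clac pair around a whole group of accesses instead of one pair per access. struct example_req and example_read_req() are made-up names for illustration.

#include <linux/uaccess.h>
#include <linux/types.h>

struct example_req {                    /* hypothetical userspace layout */
        u32 cmd;
        u32 arg;
};

static int example_read_req(struct example_req __user *ureq,
                            struct example_req *req)
{
        if (!access_ok(VERIFY_READ, ureq, sizeof(*ureq)))
                return -EFAULT;

        user_access_begin();            /* one STAC for the whole batch */
        unsafe_get_user(req->cmd, &ureq->cmd, efault);
        unsafe_get_user(req->arg, &ureq->arg, efault);
        user_access_end();              /* one CLAC */
        return 0;

efault:
        user_access_end();
        return -EFAULT;
}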
Signed-off-by: Linus Torvalds Change-Id: Iaad8756bc8e95876e2e2a7d7bbd333fc478ab441 (cherry picked from commit 11f1a4b9755f5dbc3e822a96502ebe9b044b14d8) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 53 +++++++++++------ arch/x86/include/asm/uaccess_64.h | 94 ++++++++++++++++++++++--------- 2 files changed, 101 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a7997564fbd1a0..602007a151374e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -133,6 +133,9 @@ extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_bad(void); +#define __uaccess_begin() stac() +#define __uaccess_end() clac() + /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. @@ -191,10 +194,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) #ifdef CONFIG_X86_32 #define __put_user_asm_u64(x, addr, err, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ ".section .fixup,\"ax\"\n" \ "4: movl %3,%0\n" \ " jmp 3b\n" \ @@ -205,10 +208,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "A" (x), "r" (addr), "i" (errret), "0" (err)) #define __put_user_asm_ex_u64(x, addr) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ _ASM_EXTABLE_EX(1b, 2b) \ _ASM_EXTABLE_EX(2b, 3b) \ : : "A" (x), "r" (addr)) @@ -301,6 +304,10 @@ do { \ } \ } while (0) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __put_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -355,9 +362,9 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %2,%"rtype"1\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ @@ -367,6 +374,10 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __get_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -397,7 +408,9 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ int __pu_err; \ + __uaccess_begin(); \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ + __uaccess_end(); \ __builtin_expect(__pu_err, 0); \ }) @@ -405,7 +418,9 @@ do { \ ({ \ int __gu_err; \ unsigned long __gu_val; \ + __uaccess_begin(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __builtin_expect(__gu_err, 0); \ }) @@ -420,9 +435,9 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. 
*/ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %"rtype"1,%2\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ @@ -442,11 +457,11 @@ struct __large_struct { unsigned long buf[100]; }; */ #define uaccess_try do { \ current_thread_info()->uaccess_err = 0; \ - stac(); \ + __uaccess_begin(); \ barrier(); #define uaccess_catch(err) \ - clac(); \ + __uaccess_end(); \ (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ } while (0) @@ -542,12 +557,13 @@ extern void __cmpxchg_wrong_size(void) __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ + __uaccess_begin(); \ switch (size) { \ case 1: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -561,9 +577,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 2: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -577,9 +593,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 4: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -596,9 +612,9 @@ extern void __cmpxchg_wrong_size(void) if (!IS_ENABLED(CONFIG_X86_64)) \ __cmpxchg_wrong_size(); \ \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -613,6 +629,7 @@ extern void __cmpxchg_wrong_size(void) default: \ __cmpxchg_wrong_size(); \ } \ + __uaccess_end(); \ *__uval = __old; \ __ret; \ }) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 12a26b979bf163..62727aa9696c02 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -56,35 +56,49 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { - case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, + case 1: + __uaccess_begin(); + __get_user_asm(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); + __uaccess_end(); return ret; - case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src, + case 2: + __uaccess_begin(); + __get_user_asm(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; - case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src, + case 4: + __uaccess_begin(); + __get_user_asm(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); + __uaccess_end(); return ret; - case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src, + case 8: + __uaccess_begin(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); + if (likely(!ret)) + __get_user_asm(*(u16 *)(8 + (char *)dst), + (u16 __user *)(8 + 
(char __user *)src), + ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); + if (likely(!ret)) + __get_user_asm(*(u64 *)(8 + (char *)dst), + (u64 __user *)(8 + (char __user *)src), + ret, "q", "", "=r", 8); + __uaccess_end(); return ret; default: return copy_user_generic(dst, (__force void *)src, size); @@ -106,35 +120,51 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { - case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, + case 1: + __uaccess_begin(); + __put_user_asm(*(u8 *)src, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; - case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst, + case 2: + __uaccess_begin(); + __put_user_asm(*(u16 *)src, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; - case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst, + case 4: + __uaccess_begin(); + __put_user_asm(*(u32 *)src, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; - case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, + case 8: + __uaccess_begin(); + __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 10); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, + ret, "w", "w", "ir", 2); + } + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 16); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, + ret, "q", "", "er", 8); + } + __uaccess_end(); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -160,39 +190,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) switch (size) { case 1: { u8 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u8 __user *)src, ret, "b", "b", "=q", 1); if (likely(!ret)) __put_user_asm(tmp, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; } case 2: { u16 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u16 __user *)src, ret, "w", "w", "=r", 2); if (likely(!ret)) __put_user_asm(tmp, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; } case 4: { u32 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u32 __user *)src, ret, "l", "k", "=r", 4); if (likely(!ret)) __put_user_asm(tmp, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; } case 8: { u64 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u64 __user *)src, ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; } default: From a67130cc29005b23f988648e44f464c7b71fc87b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:57:27 -0800 Subject: [PATCH 376/420] BACKPORT: 
Add 'unsafe' user access functions for batched accesses The naming is meant to discourage random use: the helper functions are not really any more "unsafe" than the traditional double-underscore functions (which need the address range checking), but they do need even more infrastructure around them, and should not be used willy-nilly. In addition to checking the access range, these user access functions require that you wrap the user access with a "user_acess_{begin,end}()" around it. That allows architectures that implement kernel user access control (x86: SMAP, arm64: PAN) to do the user access control in the wrapping user_access_begin/end part, and then batch up the actual user space accesses using the new interfaces. The main (and hopefully only) use for these are for core generic access helpers, initially just the generic user string functions (strnlen_user() and strncpy_from_user()). Signed-off-by: Linus Torvalds Change-Id: Ic64efea41f97171bdbdabe3e531489aebd9b6fac (cherry picked from commit 5b24a7a2aa2040c8c50c3b71122901d01661ff78) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 25 +++++++++++++++++++++++++ include/linux/uaccess.h | 7 +++++++ 2 files changed, 32 insertions(+) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 602007a151374e..2d620d27aa4504 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -757,5 +757,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #undef __copy_from_user_overflow #undef __copy_to_user_overflow +/* + * The "unsafe" user accesses aren't really "unsafe", but the naming + * is a big fat warning: you have to not only do the access_ok() + * checking before using them, but you have to surround them with the + * user_access_begin/end() pair. + */ +#define user_access_begin() __uaccess_begin() +#define user_access_end() __uaccess_end() + +#define unsafe_put_user(x, ptr) \ +({ \ + int __pu_err; \ + __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ + __builtin_expect(__pu_err, 0); \ +}) + +#define unsafe_get_user(x, ptr) \ +({ \ + int __gu_err; \ + unsigned long __gu_val; \ + __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __builtin_expect(__gu_err, 0); \ +}) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ecd3319dac3314..870d8d06095abf 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -107,4 +107,11 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size); extern long notrace probe_kernel_write(void *dst, const void *src, size_t size); extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size); +#ifndef user_access_begin +#define user_access_begin() do { } while (0) +#define user_access_end() do { } while (0) +#define unsafe_get_user(x, ptr) __get_user(x, ptr) +#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#endif + #endif /* __LINUX_UACCESS_H__ */ From 5244fa145d94b508b20e428142cc963b45a11c11 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 10:05:19 -0800 Subject: [PATCH 377/420] UPSTREAM: Use the new batched user accesses in generic user string handling This converts the generic user string functions to use the batched user access functions. It makes a big difference on Skylake, which is the first x86 microarchitecture to implement SMAP. 
The STAC/CLAC instructions are not very fast, and doing them for each access inside the loop that copies strings from user space (which is what the pathname handling does for every pathname the kernel uses, for example) is very inefficient. Signed-off-by: Linus Torvalds Change-Id: Ic39a686b4bb1ad9cd16ad0887bb669beed6fe8aa (cherry picked from commit 9fd4470ff4974c41b1db43c3b355b9085af9c12a) Signed-off-by: Sami Tolvanen --- lib/strncpy_from_user.c | 11 ++++++++--- lib/strnlen_user.c | 18 ++++++++++++++---- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index bb2b201d6ad039..f2d09719967dfa 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -38,7 +38,7 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) break; *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { @@ -54,7 +54,7 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long while (max) { char c; - if (unlikely(__get_user(c,src+res))) + if (unlikely(unsafe_get_user(c,src+res))) return -EFAULT; dst[res] = c; if (!c) @@ -106,7 +106,12 @@ long strncpy_from_user(char *dst, const char __user *src, long count) src_addr = (unsigned long)src; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strncpy_from_user(dst, src, count, max); + long retval; + + user_access_begin(); + retval = do_strncpy_from_user(dst, src, count, max); + user_access_end(); + return retval; } return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index a28df5206d95c2..781881bd347526 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,7 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(__get_user(c,(unsigned long __user *)src))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) return 0; c |= aligned_byte_mask(align); @@ -60,7 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max < sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) return 0; } res -= align; @@ -103,7 +103,12 @@ long strnlen_user(const char __user *str, long count) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, count, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, count, max); + user_access_end(); + return retval; } return 0; } @@ -131,7 +136,12 @@ long strlen_user(const char __user *str) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, ~0ul, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, ~0ul, max); + user_access_end(); + return retval; } return 0; } From 9ce5e97007036c10976f9e85fedde180ebe311d0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 21:28:47 +0100 Subject: [PATCH 378/420] BACKPORT: ARM: 8583/1: mm: fix location of _etext The _etext position is defined to be the end of the kernel text code, and should not include any part of 
the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. Just to be conservative, leave the kernel resource as it was, using __init_begin instead of _etext as the end mark. Signed-off-by: Kees Cook Signed-off-by: Russell King Change-Id: I6859a93058b7cabf5e5cf80ee57b549368bb3361 (cherry picked from commit 14c4a533e0996f95a0a64dfd0b6252d788cebc74) Signed-off-by: Sami Tolvanen --- arch/arm/kernel/setup.c | 2 +- arch/arm/kernel/vmlinux.lds.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index c03106378b4927..8eaa0870829aca 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -748,7 +748,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 3afcb6c2cf0604..2af0005e2548ef 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -123,6 +123,8 @@ SECTIONS #ifdef CONFIG_DEBUG_RODATA . = ALIGN(1< Date: Thu, 23 Jun 2016 15:53:17 +0200 Subject: [PATCH 379/420] BACKPORT: arm64: mm: fix location of _etext As Kees Cook notes in the ARM counterpart of this patch [0]: The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. In particular, Kees is referring to the HARDENED_USERCOPY patch set [1], which rejects attempts to call copy_to_user() on kernel ranges containing executable code, but does allow access to the .rodata segment. Regardless of whether one may or may not agree with the distinction, it makes sense for _etext to have the same meaning across architectures. So let's put _etext where it belongs, between .text and .rodata, and fix up existing references to use __init_begin instead, which unlike _end_rodata includes the exception and notes sections as well. The _etext references in kaslr.c are left untouched, since its references to [_stext, _etext) are meant to capture potential jump instruction targets, and so disregarding .rodata is actually an improvement here. 
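A hedged sketch of the kind of consumer these two _etext patches are aligning the symbol for: a range check that treats [_stext, _etext) as kernel text. The function is illustrative only, not the actual HARDENED_USERCOPY code; with the old layout, where _etext sat after .rodata, a check like this would also reject legitimate copies out of .rodata.

#include <linux/types.h>
#include <asm/sections.h>       /* _stext, _etext */

static bool example_overlaps_kernel_text(const void *ptr, unsigned long n)
{
        unsigned long addr = (unsigned long)ptr;
        unsigned long low  = (unsigned long)_stext;
        unsigned long high = (unsigned long)_etext;

        /* Any overlap with [_stext, _etext) is treated as kernel text. */
        return addr < high && addr + n > low;
}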
[0] http://article.gmane.org/gmane.linux.kernel/2245084 [1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502 Reported-by: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas Change-Id: I8f6582525217b9ca324f6a382ea52d30ce1d0dbd (cherry picked from commit 9fdc14c55cd6579d619ccd9d40982e0805e62b6d) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/setup.c | 2 +- arch/arm64/kernel/vmlinux.lds.S | 3 ++- arch/arm64/mm/mmu.c | 4 +--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index c59764cec6f2b5..68795e0ef2d3c4 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -339,7 +339,7 @@ static void __init request_standard_resources(void) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 9753efa3090ee4..7d476134961096 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -97,11 +97,12 @@ SECTIONS } ALIGN_DEBUG_RO + _etext = .; /* End of text section */ + RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES ALIGN_DEBUG_RO - _etext = .; /* End of text and rodata section */ ALIGN_DEBUG_RO_MIN(PAGE_SIZE) __init_begin = .; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e83d30e544ccf4..9ab9573070c113 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -391,7 +391,6 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end) end - kernel_x_end, PAGE_KERNEL); } - } #else static void __init __map_memblock(phys_addr_t start, phys_addr_t end) @@ -479,9 +478,8 @@ void __init fixup_executable(void) void mark_rodata_ro(void) { create_mapping_late(__pa(_stext), (unsigned long)_stext, - (unsigned long)_etext - (unsigned long)_stext, + (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_EXEC | PTE_RDONLY); - } #endif From 51f6e75698ede422fd183dc3f70ddceb2b9aa43a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Aug 2016 13:02:01 -0700 Subject: [PATCH 380/420] UPSTREAM: unsafe_[get|put]_user: change interface to use a error target label When I initially added the unsafe_[get|put]_user() helpers in commit 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses"), I made the mistake of modeling the interface on our traditional __[get|put]_user() functions, which return zero on success, or -EFAULT on failure. That interface is fairly easy to use, but it's actually fairly nasty for good code generation, since it essentially forces the caller to check the error value for each access. In particular, since the error handling is already internally implemented with an exception handler, and we already use "asm goto" for various other things, we could fairly easily make the error cases just jump directly to an error label instead, and avoid the need for explicit checking after each operation. So switch the interface to pass in an error label, rather than checking the error value in the caller. Best do it now before we start growing more users (the signal handling code in particular would be a good place to use the new interface). So rather than if (unsafe_get_user(x, ptr)) ... handle error .. 
the interface is now unsafe_get_user(x, ptr, label); where an error during the user mode fetch will now just cause a jump to 'label' in the caller. Right now the actual _implementation_ of this all still ends up being a "if (err) goto label", and does not take advantage of any exception label tricks, but for "unsafe_put_user()" in particular it should be fairly straightforward to convert to using the exception table model. Note that "unsafe_get_user()" is much harder to convert to a clever exception table model, because current versions of gcc do not allow the use of "asm goto" (for the exception) with output values (for the actual value to be fetched). But that is hopefully not a limitation in the long term. [ Also note that it might be a good idea to switch unsafe_get_user() to actually _return_ the value it fetches from user space, but this commit only changes the error handling semantics ] Signed-off-by: Linus Torvalds Change-Id: Ib905a84a04d46984320f6fd1056da4d72f3d6b53 (cherry picked from commit 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 16 ++++++++-------- include/linux/uaccess.h | 4 ++-- lib/strncpy_from_user.c | 8 ++++---- lib/strnlen_user.c | 7 +++---- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 2d620d27aa4504..e76e970f9fe8e1 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -766,21 +766,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #define user_access_begin() __uaccess_begin() #define user_access_end() __uaccess_end() -#define unsafe_put_user(x, ptr) \ -({ \ +#define unsafe_put_user(x, ptr, err_label) \ +do { \ int __pu_err; \ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ - __builtin_expect(__pu_err, 0); \ -}) + if (unlikely(__pu_err)) goto err_label; \ +} while (0) -#define unsafe_get_user(x, ptr) \ -({ \ +#define unsafe_get_user(x, ptr, err_label) \ +do { \ int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) + if (unlikely(__gu_err)) goto err_label; \ +} while (0) #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 870d8d06095abf..5e38c6d4d8eb93 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -110,8 +110,8 @@ extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size #ifndef user_access_begin #define user_access_begin() do { } while (0) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr) __get_user(x, ptr) -#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) +#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) #endif #endif /* __LINUX_UACCESS_H__ */ diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index f2d09719967dfa..d4be5b104da350 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -38,8 +38,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + 
*(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -54,8 +54,7 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return -EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -74,6 +73,7 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ +efault: return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 781881bd347526..bf5582f723eabc 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -60,8 +59,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max < sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -76,6 +74,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } From 6379fa887e91b18617c3e531caf3cac0d2be3871 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 19 Jul 2016 15:00:04 -0700 Subject: [PATCH 381/420] UPSTREAM: mm: Add is_migrate_cma_page Code such as hardened user copy[1] needs a way to tell if a page is CMA or not. Add is_migrate_cma_page in a similar way to is_migrate_isolate_page. [1]http://article.gmane.org/gmane.linux.kernel.mm/155238 Signed-off-by: Laura Abbott Signed-off-by: Kees Cook Change-Id: I1f9aa13d8d063038fa70b93282a836648fbb4f6d (cherry picked from commit 7c15d9bb8231f998ae7dc0b72415f5215459f7fb) Signed-off-by: Sami Tolvanen --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ffe66e381c0423..2594de31a1c7fa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -65,8 +65,10 @@ enum { #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) +# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false +# define is_migrate_cma_page(_page) false #endif #define for_each_migratetype_order(order, type) \ From 7ff004499b90388e227d3a5ba774728e273a7155 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jul 2016 16:19:48 -0700 Subject: [PATCH 382/420] BACKPORT: mm: Implement stack frame object validation This creates per-architecture function arch_within_stack_frames() that should validate if a given object is contained by a kernel stack frame. Initial implementation is on x86. This is based on code from PaX. 
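For orientation, a sketch (not taken from this patch) of how a caller such as the hardened usercopy checks can act on the tri-state result, using the current task's stack bounds via task_stack_page() and THREAD_SIZE:

    /* illustrative only; return values follow the comment in the patch below */
    const void * const stack = task_stack_page(current);
    const void * const stackend = stack + THREAD_SIZE;

    switch (arch_within_stack_frames(stack, stackend, ptr, len)) {
    case 1:   /* wholly contained in one frame: allow the copy */
        break;
    case -1:  /* crosses a frame boundary: reject */
        BUG();
    case 0:   /* no frame pointers, cannot tell: fall back to the
               * plain stack-bounds check */
        break;
    }
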
Signed-off-by: Kees Cook Change-Id: I1f3b299bb8991d65dcdac6af85d633d4b7776df1 (cherry picked from commit 0f60a8efe4005ab5e65ce000724b04d4ca04a199) Signed-off-by: Sami Tolvanen --- arch/Kconfig | 9 ++++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 44 ++++++++++++++++++++++++++++++ include/linux/thread_info.h | 9 ++++++ 4 files changed, 63 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 240bda0276b9e1..c484676a9b8b32 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -411,6 +411,15 @@ config CC_STACKPROTECTOR_STRONG endchoice +config HAVE_ARCH_WITHIN_STACK_FRAMES + bool + help + An architecture should select this if it can walk the kernel stack + frames to determine if an object is part of either the arguments + or local variables (i.e. that it excludes saved return addresses, + and similar) by implementing an inline arch_within_stack_frames(), + which is used by CONFIG_HARDENED_USERCOPY. + config HAVE_CONTEXT_TRACKING bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 07d6ba9b3828a2..0cf7f4575c163c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,6 +47,7 @@ config X86 select ARCH_DISCARD_MEMBLOCK select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_KRETPROBES diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 547e344a6dc60d..97c8fd1d1c7501 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -170,6 +170,50 @@ static inline struct thread_info *current_thread_info(void) return ti; } +/* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. + * + * Returns: + * 1 if within a frame + * -1 if placed across a frame boundary (or outside stack) + * 0 unable to determine (no frame pointers, etc) + */ +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ +#if defined(CONFIG_FRAME_POINTER) + const void *frame = NULL; + const void *oldframe; + + oldframe = __builtin_frame_address(1); + if (oldframe) + frame = __builtin_frame_address(2); + /* + * low ----------------------------------------------> high + * [saved bp][saved ip][args][local vars][saved bp][saved ip] + * ^----------------^ + * allow copies only within here + */ + while (stack <= frame && frame < stackend) { + /* + * If obj + len extends past the last frame, this + * check won't pass and the next frame will be 0, + * causing us to bail out and correctly report + * the copy as invalid. + */ + if (obj + len <= frame) + return obj >= oldframe + 2 * sizeof(void *) ? 
1 : -1; + oldframe = frame; + frame = *(const void * const *)frame; + } + return -1; +#else + return 0; +#endif +} + #else /* !__ASSEMBLY__ */ /* how to get the thread information struct from ASM */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ff307b548ed3c9..5ecb68e869680f 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -145,6 +145,15 @@ static inline bool test_and_clear_restore_sigmask(void) #error "no set_restore_sigmask() provided and default one won't work" #endif +#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ From e79892c74ebe11053a9c934f0ae0e628b5bda530 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 7 Jun 2016 11:05:33 -0700 Subject: [PATCH 383/420] BACKPORT: mm: Hardened usercopy This is the start of porting PAX_USERCOPY into the mainline kernel. This is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The work is based on code by PaX Team and Brad Spengler, and an earlier port from Casey Schaufler. Additional non-slab page tests are from Rik van Riel. This patch contains the logic for validating several conditions when performing copy_to_user() and copy_from_user() on the kernel object being copied to/from: - address range doesn't wrap around - address range isn't NULL or zero-allocated (with a non-zero copy size) - if on the slab allocator: - object size must be less than or equal to copy size (when check is implemented in the allocator, which appear in subsequent patches) - otherwise, object must not span page allocations (excepting Reserved and CMA ranges) - if on the stack - object must not extend before/after the current process stack - object must be contained by a valid stack frame (when there is arch/build support for identifying stack frames) - object must not overlap with kernel text Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Tested-by: Michael Ellerman Change-Id: Iff3b5f1ddb04acd99ccf9a9046c7797363962b2a (cherry picked from commit f5509cc18daa7f82bcc553be70df2117c8eedc16) Signed-off-by: Sami Tolvanen --- include/linux/slab.h | 12 ++ include/linux/thread_info.h | 15 ++ mm/Makefile | 4 + mm/usercopy.c | 269 ++++++++++++++++++++++++++++++++++++ security/Kconfig | 28 ++++ 5 files changed, 328 insertions(+) create mode 100644 mm/usercopy.c diff --git a/include/linux/slab.h b/include/linux/slab.h index c265bec6a57db9..9e14e37aa9e448 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -145,6 +145,18 @@ void kfree(const void *); void kzfree(const void *); size_t ksize(const void *); +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page); +#else +static inline const char *__check_heap_object(const void *ptr, + unsigned long n, + struct page *page) +{ + return NULL; +} +#endif + /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. 
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 5ecb68e869680f..0ae29ff9ccfde0 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -154,6 +154,21 @@ static inline int arch_within_stack_frames(const void * const stack, } #endif +#ifdef CONFIG_HARDENED_USERCOPY +extern void __check_object_size(const void *ptr, unsigned long n, + bool to_user); + +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ + __check_object_size(ptr, n, to_user); +} +#else +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ } +#endif /* CONFIG_HARDENED_USERCOPY */ + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ diff --git a/mm/Makefile b/mm/Makefile index 9928fbe3feee59..495a4afdaff89b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -20,6 +20,9 @@ KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n +# Since __builtin_frame_address does work as used, disable the warning. +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address) + mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -87,3 +90,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o +obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/usercopy.c b/mm/usercopy.c new file mode 100644 index 00000000000000..816feccebeb0dc --- /dev/null +++ b/mm/usercopy.c @@ -0,0 +1,269 @@ +/* + * This implements the various checks for CONFIG_HARDENED_USERCOPY*, + * which are designed to protect kernel memory from needless exposure + * and overwrite under many unintended conditions. This code is based + * on PAX_USERCOPY, which is: + * + * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source + * Security Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +enum { + BAD_STACK = -1, + NOT_STACK = 0, + GOOD_FRAME, + GOOD_STACK, +}; + +/* + * Checks if a given pointer and length is contained by the current + * stack frame (if possible). + * + * Returns: + * NOT_STACK: not at all on the stack + * GOOD_FRAME: fully within a valid stack frame + * GOOD_STACK: fully on the stack (when can't do frame-checking) + * BAD_STACK: error condition (invalid stack position or bad stack frame) + */ +static noinline int check_stack_object(const void *obj, unsigned long len) +{ + const void * const stack = task_stack_page(current); + const void * const stackend = stack + THREAD_SIZE; + int ret; + + /* Object is not on the stack at all. */ + if (obj + len <= stack || stackend <= obj) + return NOT_STACK; + + /* + * Reject: object partially overlaps the stack (passing the + * the check above means at least one end is within the stack, + * so if this check fails, the other end is outside the stack). + */ + if (obj < stack || stackend < obj + len) + return BAD_STACK; + + /* Check if object is safely within a valid frame. 
*/ + ret = arch_within_stack_frames(stack, stackend, obj, len); + if (ret) + return ret; + + return GOOD_STACK; +} + +static void report_usercopy(const void *ptr, unsigned long len, + bool to_user, const char *type) +{ + pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n", + to_user ? "exposure" : "overwrite", + to_user ? "from" : "to", ptr, type ? : "unknown", len); + /* + * For greater effect, it would be nice to do do_group_exit(), + * but BUG() actually hooks all the lock-breaking and per-arch + * Oops code, so that is used here instead. + */ + BUG(); +} + +/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */ +static bool overlaps(const void *ptr, unsigned long n, unsigned long low, + unsigned long high) +{ + unsigned long check_low = (uintptr_t)ptr; + unsigned long check_high = check_low + n; + + /* Does not overlap if entirely above or entirely below. */ + if (check_low >= high || check_high < low) + return false; + + return true; +} + +/* Is this address range in the kernel text area? */ +static inline const char *check_kernel_text_object(const void *ptr, + unsigned long n) +{ + unsigned long textlow = (unsigned long)_stext; + unsigned long texthigh = (unsigned long)_etext; + unsigned long textlow_linear, texthigh_linear; + + if (overlaps(ptr, n, textlow, texthigh)) + return ""; + + /* + * Some architectures have virtual memory mappings with a secondary + * mapping of the kernel text, i.e. there is more than one virtual + * kernel address that points to the kernel image. It is usually + * when there is a separate linear physical memory mapping, in that + * __pa() is not just the reverse of __va(). This can be detected + * and checked: + */ + textlow_linear = (unsigned long)__va(__pa(textlow)); + /* No different mapping: we're done. */ + if (textlow_linear == textlow) + return NULL; + + /* Check the secondary mapping... */ + texthigh_linear = (unsigned long)__va(__pa(texthigh)); + if (overlaps(ptr, n, textlow_linear, texthigh_linear)) + return ""; + + return NULL; +} + +static inline const char *check_bogus_address(const void *ptr, unsigned long n) +{ + /* Reject if object wraps past end of memory. */ + if (ptr + n < ptr) + return ""; + + /* Reject if NULL or ZERO-allocation. */ + if (ZERO_OR_NULL_PTR(ptr)) + return ""; + + return NULL; +} + +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page, *endpage; + const void *end = ptr + n - 1; + bool is_reserved, is_cma; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* + * Sometimes the kernel data regions are not marked Reserved (see + * check below). And sometimes [_sdata,_edata) does not cover + * rodata and/or bss, so check each range explicitly. + */ + + /* Allow reads of kernel rodata region (if not marked as Reserved). */ + if (ptr >= (const void *)__start_rodata && + end <= (const void *)__end_rodata) { + if (!to_user) + return ""; + return NULL; + } + + /* Allow kernel data region (if not marked as Reserved). */ + if (ptr >= (const void *)_sdata && end <= (const void *)_edata) + return NULL; + + /* Allow kernel bss region (if not marked as Reserved). 
*/ + if (ptr >= (const void *)__bss_start && + end <= (const void *)__bss_stop) + return NULL; + + /* Is the object wholly within one base page? */ + if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) == + ((unsigned long)end & (unsigned long)PAGE_MASK))) + return NULL; + + /* Allow if start and end are inside the same compound page. */ + endpage = virt_to_head_page(end); + if (likely(endpage == page)) + return NULL; + + /* + * Reject if range is entirely either Reserved (i.e. special or + * device memory), or CMA. Otherwise, reject since the object spans + * several independently allocated pages. + */ + is_reserved = PageReserved(page); + is_cma = is_migrate_cma_page(page); + if (!is_reserved && !is_cma) + goto reject; + + for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { + page = virt_to_head_page(ptr); + if (is_reserved && !PageReserved(page)) + goto reject; + if (is_cma && !is_migrate_cma_page(page)) + goto reject; + } + + return NULL; + +reject: + return ""; +} + +/* + * Validates that the given object is: + * - not bogus address + * - known-safe heap or stack object + * - not in kernel text + */ +void __check_object_size(const void *ptr, unsigned long n, bool to_user) +{ + const char *err; + + /* Skip all tests if size is zero. */ + if (!n) + return; + + /* Check for invalid addresses. */ + err = check_bogus_address(ptr, n); + if (err) + goto report; + + /* Check for bad heap object. */ + err = check_heap_object(ptr, n, to_user); + if (err) + goto report; + + /* Check for bad stack object. */ + switch (check_stack_object(ptr, n)) { + case NOT_STACK: + /* Object is not touching the current process stack. */ + break; + case GOOD_FRAME: + case GOOD_STACK: + /* + * Object is either in the correct frame (when it + * is possible to check) or just generally on the + * process stack (when frame checking not available). + */ + return; + default: + err = ""; + goto report; + } + + /* Check for object in kernel to avoid text exposure. */ + err = check_kernel_text_object(ptr, n); + if (!err) + return; + +report: + report_usercopy(ptr, n, to_user, err); +} +EXPORT_SYMBOL(__check_object_size); diff --git a/security/Kconfig b/security/Kconfig index 37e537db577cc8..3412b0ec965e89 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -126,6 +126,34 @@ config LSM_MMAP_MIN_ADDR this low address space will need the permission specific to the systems running LSM. +config HAVE_HARDENED_USERCOPY_ALLOCATOR + bool + help + The heap allocator implements __check_heap_object() for + validating memory ranges against heap object sizes in + support of CONFIG_HARDENED_USERCOPY. + +config HAVE_ARCH_HARDENED_USERCOPY + bool + help + The architecture supports CONFIG_HARDENED_USERCOPY by + calling check_object_size() just before performing the + userspace copies in the low level implementation of + copy_to_user() and copy_from_user(). + +config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + depends on HAVE_ARCH_HARDENED_USERCOPY + select BUG + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and + copy_from_user() functions) by rejecting memory ranges that + are larger than the specified heap object, span multiple + separately allocates pages, are not on the process stack, + or are part of the kernel text. This kills entire classes + of heap overflow exploits and similar kernel memory exposures. 
+ source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From 3cf2694337412c90f8ca6457c425c96da2221373 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:04:01 -0700 Subject: [PATCH 384/420] BACKPORT: x86/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on x86. This is done both in copy_*_user() and __copy_*_user() because copy_*_user() actually calls down to _copy_*_user() and not __copy_*_user(). Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Change-Id: I260db1d4572bdd2f779200aca99d03a170658440 (cherry picked from commit 5b710f34e194c6b7710f69fdb5d798fdf35b98c1) Signed-off-by: Sami Tolvanen --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 10 ++++++---- arch/x86/include/asm/uaccess_32.h | 2 ++ arch/x86/include/asm/uaccess_64.h | 2 ++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0cf7f4575c163c..a80a0749dd0c90 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,6 +47,7 @@ config X86 select ARCH_DISCARD_MEMBLOCK select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index e76e970f9fe8e1..fe1f8bfc27870b 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -726,9 +726,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) * case, and do only runtime checking for non-constant sizes. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); n = _copy_from_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_from_user_overflow(); else __copy_from_user_overflow(sz, n); @@ -744,9 +745,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); /* See the comment in copy_from_user() above. 
*/ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); n = _copy_to_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_to_user_overflow(); else __copy_to_user_overflow(sz, n); diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 3c03a5de64d30c..6990f6ad007956 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -43,6 +43,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); if (__builtin_constant_p(n)) { unsigned long ret; @@ -137,6 +138,7 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + check_object_size(to, n, false); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 62727aa9696c02..12496f789ae364 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -53,6 +53,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) { int ret = 0; + check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -117,6 +118,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) { int ret = 0; + check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { From 3552a17aa5e88e89f4af2cc2e3170966bcaef98b Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 19 Aug 2015 11:02:28 +0100 Subject: [PATCH 385/420] BACKPORT: ARM: uaccess: provide uaccess_save_and_enable() and uaccess_restore() Provide uaccess_save_and_enable() and uaccess_restore() to permit control of userspace visibility to the kernel, and hook these into the appropriate places in the kernel where we need to access userspace. 
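The usage pattern the hunks below establish is, in sketch form (mirroring the diff; arm_copy_to_user() is the renamed low-level copy routine):

    unsigned int __ua_flags;

    __ua_flags = uaccess_save_and_enable();  /* open the user-access window */
    n = arm_copy_to_user(to, from, n);       /* only this code touches userspace */
    uaccess_restore(__ua_flags);             /* close the window again */
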
Signed-off-by: Russell King Change-Id: I89c68368df268cbefa169e9d647b12c27b281c0f (cherry picked from commit 3fba7e23f754a9a6e639b640fa2a393712ffe1b8) Signed-off-by: Sami Tolvanen --- arch/arm/include/asm/futex.h | 18 +++++++- arch/arm/include/asm/uaccess.h | 71 +++++++++++++++++++++++++++--- arch/arm/kernel/armksyms.c | 6 +-- arch/arm/lib/clear_user.S | 6 +-- arch/arm/lib/copy_from_user.S | 6 +-- arch/arm/lib/copy_to_user.S | 6 +-- arch/arm/lib/uaccess_with_memcpy.c | 4 +- 7 files changed, 96 insertions(+), 21 deletions(-) diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 53e69dae796f32..63139e28857283 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -22,8 +22,11 @@ #ifdef CONFIG_SMP #define __futex_atomic_op(insn, ret, oldval, tmp, uaddr, oparg) \ +({ \ + unsigned int __ua_flags; \ smp_mb(); \ prefetchw(uaddr); \ + __ua_flags = uaccess_save_and_enable(); \ __asm__ __volatile__( \ "1: ldrex %1, [%3]\n" \ " " insn "\n" \ @@ -34,12 +37,15 @@ __futex_atomic_ex_table("%5") \ : "=&r" (ret), "=&r" (oldval), "=&r" (tmp) \ : "r" (uaddr), "r" (oparg), "Ir" (-EFAULT) \ - : "cc", "memory") + : "cc", "memory"); \ + uaccess_restore(__ua_flags); \ +}) static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { + unsigned int __ua_flags; int ret; u32 val; @@ -49,6 +55,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, smp_mb(); /* Prefetching cannot fault */ prefetchw(uaddr); + __ua_flags = uaccess_save_and_enable(); __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" "1: ldrex %1, [%4]\n" " teq %1, %2\n" @@ -61,6 +68,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "=&r" (ret), "=&r" (val) : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) : "cc", "memory"); + uaccess_restore(__ua_flags); smp_mb(); *uval = val; @@ -73,6 +81,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, #include #define __futex_atomic_op(insn, ret, oldval, tmp, uaddr, oparg) \ +({ \ + unsigned int __ua_flags = uaccess_save_and_enable(); \ __asm__ __volatile__( \ "1: " TUSER(ldr) " %1, [%3]\n" \ " " insn "\n" \ @@ -81,12 +91,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, __futex_atomic_ex_table("%5") \ : "=&r" (ret), "=&r" (oldval), "=&r" (tmp) \ : "r" (uaddr), "r" (oparg), "Ir" (-EFAULT) \ - : "cc", "memory") + : "cc", "memory"); \ + uaccess_restore(__ua_flags); \ +}) static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { + unsigned int __ua_flags; int ret = 0; u32 val; @@ -102,6 +115,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "+r" (ret), "=&r" (val) : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) : "cc", "memory"); + uaccess_restore(__ua_flags); *uval = val; return ret; diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 4767eb9caa78c8..3b27eee6d636ba 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -49,6 +49,21 @@ struct exception_table_entry extern int fixup_exception(struct pt_regs *regs); +/* + * These two functions allow hooking accesses to userspace to increase + * system integrity by ensuring that the kernel can not inadvertantly + * perform such accesses (eg, via list poison values) which could then + * be exploited for priviledge escalation. 
+ */ +static inline unsigned int uaccess_save_and_enable(void) +{ + return 0; +} + +static inline void uaccess_restore(unsigned int flags) +{ +} + /* * These two are intentionally not defined anywhere - if the kernel * code generates any references to them, that's a bug. @@ -165,6 +180,7 @@ extern int __get_user_64t_4(void *); register typeof(x) __r2 asm("r2"); \ register unsigned long __l asm("r1") = __limit; \ register int __e asm("r0"); \ + unsigned int __ua_flags = uaccess_save_and_enable(); \ switch (sizeof(*(__p))) { \ case 1: \ if (sizeof((x)) >= 8) \ @@ -192,6 +208,7 @@ extern int __get_user_64t_4(void *); break; \ default: __e = __get_user_bad(); break; \ } \ + uaccess_restore(__ua_flags); \ x = (typeof(*(p))) __r2; \ __e; \ }) @@ -224,6 +241,7 @@ extern int __put_user_8(void *, unsigned long long); register const typeof(*(p)) __user *__p asm("r0") = __tmp_p; \ register unsigned long __l asm("r1") = __limit; \ register int __e asm("r0"); \ + unsigned int __ua_flags = uaccess_save_and_enable(); \ switch (sizeof(*(__p))) { \ case 1: \ __put_user_x(__r2, __p, __e, __l, 1); \ @@ -239,6 +257,7 @@ extern int __put_user_8(void *, unsigned long long); break; \ default: __e = __put_user_bad(); break; \ } \ + uaccess_restore(__ua_flags); \ __e; \ }) @@ -300,14 +319,17 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_addr = (unsigned long)(ptr); \ unsigned long __gu_val; \ + unsigned int __ua_flags; \ __chk_user_ptr(ptr); \ might_fault(); \ + __ua_flags = uaccess_save_and_enable(); \ switch (sizeof(*(ptr))) { \ case 1: __get_user_asm_byte(__gu_val,__gu_addr,err); break; \ case 2: __get_user_asm_half(__gu_val,__gu_addr,err); break; \ case 4: __get_user_asm_word(__gu_val,__gu_addr,err); break; \ default: (__gu_val) = __get_user_bad(); \ } \ + uaccess_restore(__ua_flags); \ (x) = (__typeof__(*(ptr)))__gu_val; \ } while (0) @@ -381,9 +403,11 @@ do { \ #define __put_user_err(x,ptr,err) \ do { \ unsigned long __pu_addr = (unsigned long)(ptr); \ + unsigned int __ua_flags; \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ might_fault(); \ + __ua_flags = uaccess_save_and_enable(); \ switch (sizeof(*(ptr))) { \ case 1: __put_user_asm_byte(__pu_val,__pu_addr,err); break; \ case 2: __put_user_asm_half(__pu_val,__pu_addr,err); break; \ @@ -391,6 +415,7 @@ do { \ case 8: __put_user_asm_dword(__pu_val,__pu_addr,err); break; \ default: __put_user_bad(); \ } \ + uaccess_restore(__ua_flags); \ } while (0) #define __put_user_asm_byte(x,__pu_addr,err) \ @@ -474,11 +499,46 @@ do { \ #ifdef CONFIG_MMU -extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); -extern unsigned long __must_check __copy_to_user_std(void __user *to, const void *from, unsigned long n); -extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); -extern unsigned long __must_check __clear_user_std(void __user *addr, unsigned long n); +extern unsigned long __must_check +arm_copy_from_user(void *to, const void __user *from, unsigned long n); + +static inline unsigned long __must_check +__copy_from_user(void *to, const void __user *from, unsigned long n) +{ + unsigned int __ua_flags = uaccess_save_and_enable(); + n = arm_copy_from_user(to, from, n); + uaccess_restore(__ua_flags); + return n; +} + +extern unsigned long __must_check +arm_copy_to_user(void __user *to, const void *from, unsigned long n); +extern unsigned long __must_check 
+__copy_to_user_std(void __user *to, const void *from, unsigned long n); + +static inline unsigned long __must_check +__copy_to_user(void __user *to, const void *from, unsigned long n) +{ + unsigned int __ua_flags = uaccess_save_and_enable(); + n = arm_copy_to_user(to, from, n); + uaccess_restore(__ua_flags); + return n; +} + +extern unsigned long __must_check +arm_clear_user(void __user *addr, unsigned long n); +extern unsigned long __must_check +__clear_user_std(void __user *addr, unsigned long n); + +static inline unsigned long __must_check +__clear_user(void __user *addr, unsigned long n) +{ + unsigned int __ua_flags = uaccess_save_and_enable(); + n = arm_clear_user(addr, n); + uaccess_restore(__ua_flags); + return n; +} + #else #define __copy_from_user(to,from,n) (memcpy(to, (void __force *)from, n), 0) #define __copy_to_user(to,from,n) (memcpy((void __force *)to, from, n), 0) @@ -511,6 +571,7 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo return n; } +/* These are from lib/ code, and use __get_user() and friends */ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index a88671cfe1ffb1..a35d72d30b56a0 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -91,9 +91,9 @@ EXPORT_SYMBOL(__memzero); #ifdef CONFIG_MMU EXPORT_SYMBOL(copy_page); -EXPORT_SYMBOL(__copy_from_user); -EXPORT_SYMBOL(__copy_to_user); -EXPORT_SYMBOL(__clear_user); +EXPORT_SYMBOL(arm_copy_from_user); +EXPORT_SYMBOL(arm_copy_to_user); +EXPORT_SYMBOL(arm_clear_user); EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); diff --git a/arch/arm/lib/clear_user.S b/arch/arm/lib/clear_user.S index 14a0d988c82cb4..a6d7bb973bf1f2 100644 --- a/arch/arm/lib/clear_user.S +++ b/arch/arm/lib/clear_user.S @@ -12,14 +12,14 @@ .text -/* Prototype: int __clear_user(void *addr, size_t sz) +/* Prototype: unsigned long arm_clear_user(void *addr, size_t sz) * Purpose : clear some user memory * Params : addr - user memory address to clear * : sz - number of bytes to clear * Returns : number of bytes NOT cleared */ ENTRY(__clear_user_std) -WEAK(__clear_user) +WEAK(arm_clear_user) stmfd sp!, {r1, lr} mov r2, #0 cmp r1, #4 @@ -44,7 +44,7 @@ WEAK(__clear_user) USER( strnebt r2, [r0]) mov r0, #0 ldmfd sp!, {r1, pc} -ENDPROC(__clear_user) +ENDPROC(arm_clear_user) ENDPROC(__clear_user_std) .pushsection .fixup,"ax" diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S index 66a477a3e3cc6b..70bf0e90da4dbc 100644 --- a/arch/arm/lib/copy_from_user.S +++ b/arch/arm/lib/copy_from_user.S @@ -16,7 +16,7 @@ /* * Prototype: * - * size_t __copy_from_user(void *to, const void *from, size_t n) + * size_t arm_copy_from_user(void *to, const void *from, size_t n) * * Purpose: * @@ -84,11 +84,11 @@ .text -ENTRY(__copy_from_user) +ENTRY(arm_copy_from_user) #include "copy_template.S" -ENDPROC(__copy_from_user) +ENDPROC(arm_copy_from_user) .pushsection .fixup,"ax" .align 0 diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S index d066df686e1787..a610ffaa165d53 100644 --- a/arch/arm/lib/copy_to_user.S +++ b/arch/arm/lib/copy_to_user.S @@ -16,7 +16,7 @@ /* * Prototype: * - * size_t __copy_to_user(void *to, const void *from, size_t n) + * size_t arm_copy_to_user(void *to, const void *from, size_t n) * * Purpose: * @@ -88,11 +88,11 @@ .text ENTRY(__copy_to_user_std) -WEAK(__copy_to_user) +WEAK(arm_copy_to_user) 
#include "copy_template.S" -ENDPROC(__copy_to_user) +ENDPROC(arm_copy_to_user) ENDPROC(__copy_to_user_std) .pushsection .fixup,"ax" diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index 3e58d710013c3a..77f020e75ccd2b 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -136,7 +136,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) } unsigned long -__copy_to_user(void __user *to, const void *from, unsigned long n) +arm_copy_to_user(void __user *to, const void *from, unsigned long n) { /* * This test is stubbed out of the main function above to keep @@ -190,7 +190,7 @@ __clear_user_memset(void __user *addr, unsigned long n) return n; } -unsigned long __clear_user(void __user *addr, unsigned long n) +unsigned long arm_clear_user(void __user *addr, unsigned long n) { /* See rational for this in __copy_to_user() above. */ if (n < 64) From f76e4f08b91599aa1be370f9c2e35458eb93b4de Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 5 Dec 2015 13:42:07 +0000 Subject: [PATCH 386/420] UPSTREAM: ARM: fix uaccess_with_memcpy() with SW_DOMAIN_PAN The uaccess_with_memcpy() code is currently incompatible with the SW PAN code: it takes locks within the region that we've changed the DACR, potentially sleeping as a result. As we do not save and restore the DACR across co-operative sleep events, can lead to an incorrect DACR value later in this code path. Reported-by: Peter Rosin Tested-by: Peter Rosin Signed-off-by: Russell King Change-Id: Iaa6ef00ad4d084eb24b9941a97ffe201635de003 (cherry picked from commit c014953d84ec21a4df9a43be2378861ea6e9246e) Signed-off-by: Sami Tolvanen --- arch/arm/include/asm/uaccess.h | 4 ++++ arch/arm/lib/uaccess_with_memcpy.c | 29 +++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 3b27eee6d636ba..7132983908bfa8 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -519,10 +519,14 @@ __copy_to_user_std(void __user *to, const void *from, unsigned long n); static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { +#ifndef CONFIG_UACCESS_WITH_MEMCPY unsigned int __ua_flags = uaccess_save_and_enable(); n = arm_copy_to_user(to, from, n); uaccess_restore(__ua_flags); return n; +#else + return arm_copy_to_user(to, from, n); +#endif } extern unsigned long __must_check diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index 77f020e75ccd2b..b3e68086b347e8 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -88,6 +88,7 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) static unsigned long noinline __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) { + unsigned long ua_flags; int atomic; if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { @@ -118,7 +119,9 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) if (tocopy > n) tocopy = n; + ua_flags = uaccess_save_and_enable(); memcpy((void *)to, from, tocopy); + uaccess_restore(ua_flags); to += tocopy; from += tocopy; n -= tocopy; @@ -145,14 +148,21 @@ arm_copy_to_user(void __user *to, const void *from, unsigned long n) * With frame pointer disabled, tail call optimization kicks in * as well making this test almost invisible. 
*/ - if (n < 64) - return __copy_to_user_std(to, from, n); - return __copy_to_user_memcpy(to, from, n); + if (n < 64) { + unsigned long ua_flags = uaccess_save_and_enable(); + n = __copy_to_user_std(to, from, n); + uaccess_restore(ua_flags); + } else { + n = __copy_to_user_memcpy(to, from, n); + } + return n; } static unsigned long noinline __clear_user_memset(void __user *addr, unsigned long n) { + unsigned long ua_flags; + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { memset((void *)addr, 0, n); return 0; @@ -175,7 +185,9 @@ __clear_user_memset(void __user *addr, unsigned long n) if (tocopy > n) tocopy = n; + ua_flags = uaccess_save_and_enable(); memset((void *)addr, 0, tocopy); + uaccess_restore(ua_flags); addr += tocopy; n -= tocopy; @@ -193,9 +205,14 @@ __clear_user_memset(void __user *addr, unsigned long n) unsigned long arm_clear_user(void __user *addr, unsigned long n) { /* See rational for this in __copy_to_user() above. */ - if (n < 64) - return __clear_user_std(addr, n); - return __clear_user_memset(addr, n); + if (n < 64) { + unsigned long ua_flags = uaccess_save_and_enable(); + n = __clear_user_std(addr, n); + uaccess_restore(ua_flags); + } else { + n = __clear_user_memset(addr, n); + } + return n; } #if 0 From f6b6248421a394b45cb2b3465fdb6c87f755c44f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:06:53 -0700 Subject: [PATCH 387/420] BACKPORT: ARM: uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm. Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Change-Id: Id0c7cdbf6813313b806a36129db0daedf4a6b6d9 (cherry picked from commit dfd45b6103c973bfcea2341d89e36faf947dbc33) Signed-off-by: Sami Tolvanen --- arch/arm/Kconfig | 1 + arch/arm/include/asm/uaccess.h | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e9464e91455b0e..f0c2201ff61c93 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -30,6 +30,7 @@ config ARM select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT) select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KGDB + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT) select HAVE_ARCH_TRACEHOOK diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 7132983908bfa8..b0413e87d65339 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -505,7 +505,10 @@ arm_copy_from_user(void *to, const void __user *from, unsigned long n); static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) { - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(to, n, false); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_from_user(to, from, n); uaccess_restore(__ua_flags); return n; @@ -520,11 +523,15 @@ static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { #ifndef CONFIG_UACCESS_WITH_MEMCPY - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(from, n, true); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_to_user(to, from, n); uaccess_restore(__ua_flags); return n; #else + check_object_size(from, n, true); return arm_copy_to_user(to, from, n); #endif } From b7af2fef98c8ce99a9e4fd19b242303687a1fa0e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:59:42 -0700 Subject: [PATCH 
388/420] BACKPORT: arm64/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm64. As done by KASAN in -next, renames the low-level functions to __arch_copy_*_user() so a static inline can do additional work before the copy. Signed-off-by: Kees Cook Change-Id: I1286cae8e6ffcf12ea54ddd62f1a6d2ce742c8d0 (cherry picked from commit faf5b63e294151d6ac24ca6906d6f221bd3496cd) Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/uaccess.h | 29 ++++++++++++++++++++++------- arch/arm64/kernel/arm64ksyms.c | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S | 4 ++-- 5 files changed, 29 insertions(+), 13 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f608579ec2588a..7671b67c464954 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -35,6 +35,7 @@ config ARM64 select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 3bf8f4e99a511c..50cf07d6ac2e56 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -236,24 +236,39 @@ do { \ -EFAULT; \ }) -extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); +extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n); extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) +{ + check_object_size(to, n, false); + return __arch_copy_from_user(to, from, n); +} + +static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) +{ + check_object_size(from, n, true); + return __arch_copy_to_user(to, from, n); +} + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else /* security hole - plug it */ + if (access_ok(VERIFY_READ, from, n)) { + check_object_size(to, n, false); + n = __arch_copy_from_user(to, from, n); + } else /* security hole - plug it */ memset(to, 0, n); return n; } static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); + if (access_ok(VERIFY_WRITE, to, n)) { + check_object_size(from, n, true); + n = __arch_copy_to_user(to, from, n); + } return n; } diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index a85843ddbde889..963760eb3243a0 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -33,8 +33,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); /* user mem (segment) */ -EXPORT_SYMBOL(__copy_from_user); -EXPORT_SYMBOL(__copy_to_user); +EXPORT_SYMBOL(__arch_copy_from_user); +EXPORT_SYMBOL(__arch_copy_to_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_in_user); diff 
--git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 5e27add9d3624e..a8162dd27e9f17 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -27,7 +27,7 @@ * Returns: * x0 - bytes not copied */ -ENTRY(__copy_from_user) +ENTRY(__arch_copy_from_user) add x4, x1, x2 // upper user buffer boundary subs x2, x2, #8 b.mi 2f @@ -52,7 +52,7 @@ USER(9f, ldrb w3, [x1] ) strb w3, [x0] 5: mov x0, #0 ret -ENDPROC(__copy_from_user) +ENDPROC(__arch_copy_from_user) .section .fixup,"ax" .align 2 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index a0aeeb9b7a284a..b5f0ca706924dd 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -27,7 +27,7 @@ * Returns: * x0 - bytes not copied */ -ENTRY(__copy_to_user) +ENTRY(__arch_copy_to_user) add x4, x0, x2 // upper user buffer boundary subs x2, x2, #8 b.mi 2f @@ -52,7 +52,7 @@ USER(9f, strh w3, [x0], #2 ) USER(9f, strb w3, [x0] ) 5: mov x0, #0 ret -ENDPROC(__copy_to_user) +ENDPROC(__arch_copy_to_user) .section .fixup,"ax" .align 2 From e0592bc510a0e908130fb7a929436139640aa91a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:20:59 -0700 Subject: [PATCH 389/420] UPSTREAM: mm: SLAB hardened usercopy support Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the SLAB allocator to catch any copies that may span objects. Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Change-Id: Ib910a71fdc2ab808e1a45b6d33e9bae1681a1f4a (cherry picked from commit 04385fc5e8fffed84425d909a783c0f0c587d847) Signed-off-by: Sami Tolvanen --- init/Kconfig | 1 + mm/slab.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index e9e50041bc358c..1de8fa47457297 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1730,6 +1730,7 @@ choice config SLAB bool "SLAB" + select HAVE_HARDENED_USERCOPY_ALLOCATOR help The regular slab allocator that is established and known to work well in all environments. It organizes cache hot objects in diff --git a/mm/slab.c b/mm/slab.c index f34e053ec46e24..27935574c43339 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4207,6 +4207,36 @@ static int __init slab_proc_init(void) module_init(slab_proc_init); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + unsigned long offset; + + /* Find and validate object. */ + cachep = page->slab_cache; + objnr = obj_to_index(cachep, page, (void *)ptr); + BUG_ON(objnr >= cachep->num); + + /* Find offset within object. */ + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + + /* Allow address range falling entirely within object size. */ + if (offset <= cachep->object_size && n <= cachep->object_size - offset) + return NULL; + + return cachep->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + /** * ksize - get the actual amount of memory allocated for a given object * @objp: Pointer to the object From d6b0d2cf2082c8f9a154fa8587998e5201157c35 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:55:12 -0700 Subject: [PATCH 390/420] UPSTREAM: mm/slub: support left redzone SLUB already has a redzone debugging feature. 
But it is only positioned at the end of object (aka right redzone) so it cannot catch left oob. Although current object's right redzone acts as left redzone of next object, first object in a slab cannot take advantage of this effect. This patch explicitly adds a left red zone to each object to detect left oob more precisely. Background: Someone complained to me that left OOB doesn't catch even if KASAN is enabled which does page allocation debugging. That page is out of our control so it would be allocated when left OOB happens and, in this case, we can't find OOB. Moreover, SLUB debugging feature can be enabled without page allocator debugging and, in this case, we will miss that OOB. Before trying to implement, I expected that changes would be too complex, but, it doesn't look that complex to me now. Almost changes are applied to debug specific functions so I feel okay. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: Ib893a17ecabd692e6c402e864196bf89cd6781a5 (cherry picked from commit d86bd1bece6fc41d59253002db5441fe960a37f6) Signed-off-by: Sami Tolvanen --- include/linux/slub_def.h | 1 + mm/slub.c | 100 +++++++++++++++++++++++++++------------ 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d82abd40a3c061..561990306cca1e 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -81,6 +81,7 @@ struct kmem_cache { int reserved; /* Reserved bytes at the end of slabs */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ + int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif diff --git a/mm/slub.c b/mm/slub.c index ae7b9f1ad394ca..1344cd8d689089 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -123,6 +123,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -223,24 +231,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -270,12 +260,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), 
__idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -455,6 +447,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -470,6 +478,26 @@ static int disable_higher_order_debug; /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, @@ -605,7 +633,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -622,9 +652,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -654,6 +684,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -744,11 +777,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -790,6 +823,10 @@ static int check_object(struct kmem_cache *s, struct page *page, u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { + if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; @@ -1404,7 +1441,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; out: @@ -3003,7 +3040,7 @@ 
static int calculate_sizes(struct kmem_cache *s, int forced_order)
 		 */
 		size += 2 * sizeof(struct track);
 
-	if (flags & SLAB_RED_ZONE)
+	if (flags & SLAB_RED_ZONE) {
 		/*
 		 * Add some empty padding so that we can catch
 		 * overwrites from earlier objects rather than let
@@ -3012,6 +3049,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 		 * of the object.
 		 */
 		size += sizeof(void *);
+
+		s->red_left_pad = sizeof(void *);
+		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+		size += s->red_left_pad;
+	}
 #endif
 
 	/*

From 7f3f5e68ef59ae6c2b692d314eab0549effd42f6 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Thu, 23 Jun 2016 15:24:05 -0700
Subject: [PATCH 391/420] UPSTREAM: mm: SLUB hardened usercopy support

Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the
SLUB allocator to catch any copies that may span objects. Includes a
redzone handling fix discovered by Michael Ellerman.

Based on code from PaX and grsecurity.

Signed-off-by: Kees Cook
Tested-by: Michael Ellerman
Reviewed-by: Laura Abbott
Change-Id: I52dc6fb3a3492b937d52b5cf9c046bf03dc40a3a
(cherry picked from commit ed18adc1cdd00a5c55a20fbdaed4804660772281)
Signed-off-by: Sami Tolvanen
---
 init/Kconfig | 1 +
 mm/slub.c    | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index 1de8fa47457297..10913ea8d4b758 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1738,6 +1738,7 @@ config SLAB
 
 config SLUB
 	bool "SLUB (Unqueued Allocator)"
+	select HAVE_HARDENED_USERCOPY_ALLOCATOR
 	help
 	   SLUB is a slab allocator that minimizes cache line usage
 	   instead of managing queues of cached objects (SLAB approach).

diff --git a/mm/slub.c b/mm/slub.c
index 1344cd8d689089..8daf5c53f7ae10 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3343,6 +3343,46 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 EXPORT_SYMBOL(__kmalloc_node);
 #endif
 
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+				struct page *page)
+{
+	struct kmem_cache *s;
+	unsigned long offset;
+	size_t object_size;
+
+	/* Find object and usable object size. */
+	s = page->slab_cache;
+	object_size = slab_ksize(s);
+
+	/* Reject impossible pointers. */
+	if (ptr < page_address(page))
+		return s->name;
+
+	/* Find offset within object. */
+	offset = (ptr - page_address(page)) % s->size;
+
+	/* Adjust for redzone and reject if within the redzone. */
+	if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+		if (offset < s->red_left_pad)
+			return s->name;
+		offset -= s->red_left_pad;
+	}
+
+	/* Allow address range falling entirely within object size.
*/ + if (offset <= object_size && n <= object_size - offset) + return NULL; + + return s->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + size_t ksize(const void *object) { struct page *page; From ece7d82efe691eb91c385141a5acc03c8ecd10a1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Aug 2016 12:15:22 -0700 Subject: [PATCH 392/420] UPSTREAM: usercopy: avoid potentially undefined behavior in pointer math check_bogus_address() checked for pointer overflow using this expression, where 'ptr' has type 'const void *': ptr + n < ptr Since pointer wraparound is undefined behavior, gcc at -O2 by default treats it like the following, which would not behave as intended: (long)n < 0 Fortunately, this doesn't currently happen for kernel code because kernel code is compiled with -fno-strict-overflow. But the expression should be fixed anyway to use well-defined integer arithmetic, since it could be treated differently by different compilers in the future or could be reported by tools checking for undefined behavior. Signed-off-by: Eric Biggers Signed-off-by: Kees Cook Change-Id: I73b13be651cf35c03482f2014bf2c3dd291518ab (cherry picked from commit 7329a655875a2f4bd6984fe8a7e00a6981e802f3) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 816feccebeb0dc..f5c4c3a6f2df28 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -125,7 +125,7 @@ static inline const char *check_kernel_text_object(const void *ptr, static inline const char *check_bogus_address(const void *ptr, unsigned long n) { /* Reject if object wraps past end of memory. */ - if (ptr + n < ptr) + if ((unsigned long)ptr + n < (unsigned long)ptr) return ""; /* Reject if NULL or ZERO-allocation. */ From e110ad72a154d57f97eb001b5631ad1bf8e32c13 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Aug 2016 11:53:59 -0500 Subject: [PATCH 393/420] UPSTREAM: usercopy: fix overlap check for kernel text When running with a local patch which moves the '_stext' symbol to the very beginning of the kernel text area, I got the following panic with CONFIG_HARDENED_USERCOPY: usercopy: kernel memory exposure attempt detected from ffff88103dfff000 () (4096 bytes) ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:79! invalid opcode: 0000 [#1] SMP ... CPU: 0 PID: 4800 Comm: cp Not tainted 4.8.0-rc3.after+ #1 Hardware name: Dell Inc. PowerEdge R720/0X3D66, BIOS 2.5.4 01/22/2016 task: ffff880817444140 task.stack: ffff880816274000 RIP: 0010:[] __check_object_size+0x76/0x413 RSP: 0018:ffff880816277c40 EFLAGS: 00010246 RAX: 000000000000006b RBX: ffff88103dfff000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88081f80dfa8 RDI: ffff88081f80dfa8 RBP: ffff880816277c90 R08: 000000000000054c R09: 0000000000000000 R10: 0000000000000005 R11: 0000000000000006 R12: 0000000000001000 R13: ffff88103e000000 R14: ffff88103dffffff R15: 0000000000000001 FS: 00007fb9d1750800(0000) GS:ffff88081f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000021d2000 CR3: 000000081a08f000 CR4: 00000000001406f0 Stack: ffff880816277cc8 0000000000010000 000000043de07000 0000000000000000 0000000000001000 ffff880816277e60 0000000000001000 ffff880816277e28 000000000000c000 0000000000001000 ffff880816277ce8 ffffffff8136c3a6 Call Trace: [] copy_page_to_iter_iovec+0xa6/0x1c0 [] copy_page_to_iter+0x16/0x90 [] generic_file_read_iter+0x3e3/0x7c0 [] ? xfs_file_buffered_aio_write+0xad/0x260 [xfs] [] ? 
down_read+0x12/0x40 [] xfs_file_buffered_aio_read+0x51/0xc0 [xfs] [] xfs_file_read_iter+0x62/0xb0 [xfs] [] __vfs_read+0xdf/0x130 [] vfs_read+0x8e/0x140 [] SyS_read+0x55/0xc0 [] do_syscall_64+0x67/0x160 [] entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:[<00007fb9d0c33c00>] 0x7fb9d0c33c00 RSP: 002b:00007ffc9c262f28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: fffffffffff8ffff RCX: 00007fb9d0c33c00 RDX: 0000000000010000 RSI: 00000000021c3000 RDI: 0000000000000004 RBP: 00000000021c3000 R08: 0000000000000000 R09: 00007ffc9c264d6c R10: 00007ffc9c262c50 R11: 0000000000000246 R12: 0000000000010000 R13: 00007ffc9c2630b0 R14: 0000000000000004 R15: 0000000000010000 Code: 81 48 0f 44 d0 48 c7 c6 90 4d a3 81 48 c7 c0 bb b3 a2 81 48 0f 44 f0 4d 89 e1 48 89 d9 48 c7 c7 68 16 a3 81 31 c0 e8 f4 57 f7 ff <0f> 0b 48 8d 90 00 40 00 00 48 39 d3 0f 83 22 01 00 00 48 39 c3 RIP [] __check_object_size+0x76/0x413 RSP The checked object's range [ffff88103dfff000, ffff88103e000000) is valid, so there shouldn't have been a BUG. The hardened usercopy code got confused because the range's ending address is the same as the kernel's text starting address at 0xffff88103e000000. The overlap check is slightly off. Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Josh Poimboeuf Signed-off-by: Kees Cook Change-Id: I839dbf4ddbb4d9874026a42abed557eb9b3f8bef (cherry picked from commit 94cd97af690dd9537818dc9841d0ec68bb1dd877) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index f5c4c3a6f2df28..f78015e8b1e5cc 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -84,7 +84,7 @@ static bool overlaps(const void *ptr, unsigned long n, unsigned long low, unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ - if (check_low >= high || check_high < low) + if (check_low >= high || check_high <= low) return false; return true; From eb2c3e101bd2f7f594977065e6d2beb7f3207579 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Aug 2016 12:47:01 -0700 Subject: [PATCH 394/420] UPSTREAM: Make the hardened user-copy code depend on having a hardened allocator The kernel test robot reported a usercopy failure in the new hardened sanity checks, due to a page-crossing copy of the FPU state into the task structure. This happened because the kernel test robot was testing with SLOB, which doesn't actually do the required book-keeping for slab allocations, and as a result the hardening code didn't realize that the task struct allocation was one single allocation - and the sanity checks fail. Since SLOB doesn't even claim to support hardening (and you really shouldn't use it), the straightforward solution is to just make the usercopy hardening code depend on the allocator supporting it. 
Reported-by: kernel test robot
Cc: Kees Cook
Signed-off-by: Linus Torvalds
Change-Id: I37d51f866f873341bf7d5297249899b852e1c6ce
(cherry picked from commit 6040e57658eee6eb1315a26119101ca832d1f854)
Signed-off-by: Sami Tolvanen
---
 security/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/security/Kconfig b/security/Kconfig
index 3412b0ec965e89..9b3773aa2a2060 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -144,6 +144,7 @@ config HAVE_ARCH_HARDENED_USERCOPY
 config HARDENED_USERCOPY
 	bool "Harden memory copies between kernel and userspace"
 	depends on HAVE_ARCH_HARDENED_USERCOPY
+	depends on HAVE_HARDENED_USERCOPY_ALLOCATOR
 	select BUG
 	help
 	  This option checks for obviously wrong memory regions when

From 5af78c213b0ba36763394e68538defb80a28fe47 Mon Sep 17 00:00:00 2001
From: Riley Andrews
Date: Fri, 12 Jun 2015 14:36:28 -0700
Subject: [PATCH 395/420] cpuset: Make cpusets restore on hotplug

This deliberately changes the behavior of the per-cpuset cpus file to not
be affected by hotplug. When a cpu is offlined, it will be removed from
the cpuset/cpus file. When a cpu is onlined, if the cpuset originally
requested that that cpu was part of the cpuset, that cpu will be restored
to the cpuset. The cpus files still have to be hierarchical, but the
ranges no longer have to be out of the currently online cpus, just the
physically present cpus.

Change-Id: I3efbae24a1f6384be1e603fb56f0d3baef61d924
---
 kernel/cpuset.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cc319d0f04cce1..d9c9e28c465fbe 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -99,6 +99,7 @@ struct cpuset {
 
 	/* user-configured CPUs and Memory Nodes allow to tasks */
 	cpumask_var_t cpus_allowed;
+	cpumask_var_t cpus_requested;
 	nodemask_t mems_allowed;
 
 	/* effective CPUs and Memory Nodes allow to tasks */
@@ -385,7 +386,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 {
-	return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+	return cpumask_subset(p->cpus_requested, q->cpus_requested) &&
 		nodes_subset(p->mems_allowed, q->mems_allowed) &&
 		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
 		is_mem_exclusive(p) <= is_mem_exclusive(q);
@@ -484,7 +485,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	cpuset_for_each_child(c, css, par) {
 		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 		    c != cur &&
-		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+		    cpumask_intersects(trial->cpus_requested, c->cpus_requested))
 			goto out;
 		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
 		    c != cur &&
@@ -926,17 +927,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (!*buf) {
 		cpumask_clear(trialcs->cpus_allowed);
 	} else {
-		retval = cpulist_parse(buf, trialcs->cpus_allowed);
+		retval = cpulist_parse(buf, trialcs->cpus_requested);
 		if (retval < 0)
 			return retval;
 
-		if (!cpumask_subset(trialcs->cpus_allowed,
-				    top_cpuset.cpus_allowed))
+		if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
 			return -EINVAL;
+
+		cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
 	}
 
 	/* Nothing to do if the cpus didn't change */
-	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+	if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
 		return 0;
 
 	retval = validate_change(cs, trialcs);
@@ -945,6 +947,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset
*trialcs, mutex_lock(&callback_mutex); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); mutex_unlock(&callback_mutex); /* use trialcs->cpus_allowed as a temp variable */ @@ -1717,7 +1720,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) switch (type) { case FILE_CPULIST: - s += cpulist_scnprintf(s, count, cs->cpus_allowed); + s += cpulist_scnprintf(s, count, cs->cpus_requested); break; case FILE_MEMLIST: s += nodelist_scnprintf(s, count, cs->mems_allowed); @@ -1917,11 +1920,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) goto free_cs; + if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_allowed; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_cpus; + goto free_requested; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_requested); nodes_clear(cs->mems_allowed); cpumask_clear(cs->effective_cpus); nodes_clear(cs->effective_mems); @@ -1930,7 +1936,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return &cs->css; -free_cpus: +free_requested: + free_cpumask_var(cs->cpus_requested); +free_allowed: free_cpumask_var(cs->cpus_allowed); free_cs: kfree(cs); @@ -1992,6 +2000,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) mutex_lock(&callback_mutex); cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->cpus_requested, parent->cpus_requested); mutex_unlock(&callback_mutex); out_unlock: mutex_unlock(&cpuset_mutex); @@ -2025,6 +2034,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->cpus_requested); kfree(cs); } @@ -2091,8 +2101,11 @@ int __init cpuset_init(void) BUG(); if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_requested); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); @@ -2226,7 +2239,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) goto retry; } - cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); From 5cc527677c81a09eb2418de6f691bf2b5e6f24a4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 4 Jan 2015 15:20:29 +0100 Subject: [PATCH 396/420] UPSTREAM: netfilter: nfnetlink: validate nfnetlink header from batch (cherry picked from commit 9ea2aa8b7dba9e99544c4187cc298face254569f) Make sure there is enough room for the nfnetlink header in the netlink messages that are part of the batch. There is a similar check in netlink_rcv_skb(). 
Signed-off-by: Pablo Neira Ayuso Change-Id: I3f0f74816a23eedb2ec4b803465ceab8d2542e55 --- net/netfilter/nfnetlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 13c2e17bbe279e..c6619d4bcc32c7 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -321,7 +321,8 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_hdr(skb); err = 0; - if (nlh->nlmsg_len < NLMSG_HDRLEN) { + if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) || + skb->len < nlh->nlmsg_len) { err = -EINVAL; goto ack; } From 0e03243dd86059fbceda68e3d62d4985906775a1 Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Tue, 2 Feb 2016 13:36:45 -0500 Subject: [PATCH 397/420] BACKPORT: netfilter: nfnetlink: correctly validate length of batch messages (cherry picked from commit c58d6c93680f28ac58984af61d0a7ebf4319c241) If nlh->nlmsg_len is zero then an infinite loop is triggered because 'skb_pull(skb, msglen);' pulls zero bytes. The calculation in nlmsg_len() underflows if 'nlh->nlmsg_len < NLMSG_HDRLEN' which bypasses the length validation and will later trigger an out-of-bound read. If the length validation does fail then the malformed batch message is copied back to userspace. However, we cannot do this because the nlh->nlmsg_len can be invalid. This leads to an out-of-bounds read in netlink_ack: [ 41.455421] ================================================================== [ 41.456431] BUG: KASAN: slab-out-of-bounds in memcpy+0x1d/0x40 at addr ffff880119e79340 [ 41.456431] Read of size 4294967280 by task a.out/987 [ 41.456431] ============================================================================= [ 41.456431] BUG kmalloc-512 (Not tainted): kasan: bad access detected [ 41.456431] ----------------------------------------------------------------------------- ... [ 41.456431] Bytes b4 ffff880119e79310: 00 00 00 00 d5 03 00 00 b0 fb fe ff 00 00 00 00 ................ [ 41.456431] Object ffff880119e79320: 20 00 00 00 10 00 05 00 00 00 00 00 00 00 00 00 ............... [ 41.456431] Object ffff880119e79330: 14 00 0a 00 01 03 fc 40 45 56 11 22 33 10 00 05 .......@EV."3... [ 41.456431] Object ffff880119e79340: f0 ff ff ff 88 99 aa bb 00 14 00 0a 00 06 fe fb ................ ^^ start of batch nlmsg with nlmsg_len=4294967280 ... [ 41.456431] Memory state around the buggy address: [ 41.456431] ffff880119e79400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] ffff880119e79480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] >ffff880119e79500: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ^ [ 41.456431] ffff880119e79580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ffff880119e79600: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb [ 41.456431] ================================================================== Fix this with better validation of nlh->nlmsg_len and by setting NFNL_BATCH_FAILURE if any batch message fails length validation. CAP_NET_ADMIN is required to trigger the bugs. 
Fixes: 9ea2aa8b7dba ("netfilter: nfnetlink: validate nfnetlink header from batch")
Signed-off-by: Phil Turnbull
Signed-off-by: Pablo Neira Ayuso
Change-Id: Id3e15c40cb464bf2791af907c235d8a316b2449c
Bug: 30947055
---
 net/netfilter/nfnetlink.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index c6619d4bcc32c7..447407d5c1bc85 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -321,10 +321,12 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
 		nlh = nlmsg_hdr(skb);
 		err = 0;
 
-		if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) ||
-		    skb->len < nlh->nlmsg_len) {
-			err = -EINVAL;
-			goto ack;
+		if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+		    skb->len < nlh->nlmsg_len ||
+		    nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
+			nfnl_err_reset(&err_list);
+			success = false;
+			goto done;
 		}
 
 		/* Only requests are handled by the kernel */

From 18a9d04bd1df767b2651036fad9fc3c0e1b7f902 Mon Sep 17 00:00:00 2001
From: Peter Hurley
Date: Fri, 27 Nov 2015 14:30:21 -0500
Subject: [PATCH 398/420] UPSTREAM: tty: Prevent ldisc drivers from re-using stale tty fields

(cherry picked from commit dd42bf1197144ede075a9d4793123f7689e164bc)

Line discipline drivers may mistakenly misuse ldisc-related fields when
initializing. For example, a failure to initialize tty->receive_room in
the N_GIGASET_M101 line discipline was recently found and fixed [1].
Now, the N_X25 line discipline has been discovered accessing the previous
line discipline's already-freed private data [2].

Harden the ldisc interface against misuse by initializing relevant tty
fields before instancing the new line discipline.

[1] commit fd98e9419d8d622a4de91f76b306af6aa627aa9c
    Author: Tilman Schmidt
    Date:   Tue Jul 14 00:37:13 2015 +0200

    isdn/gigaset: reset tty->receive_room when attaching ser_gigaset

[2] Report from Sasha Levin
[  634.336761] ==================================================================
[  634.338226] BUG: KASAN: use-after-free in x25_asy_open_tty+0x13d/0x490 at addr ffff8800a743efd0
[  634.339558] Read of size 4 by task syzkaller_execu/8981
[  634.340359] =============================================================================
[  634.341598] BUG kmalloc-512 (Not tainted): kasan: bad access detected
...
[ 634.405018] Call Trace: [ 634.405277] dump_stack (lib/dump_stack.c:52) [ 634.405775] print_trailer (mm/slub.c:655) [ 634.406361] object_err (mm/slub.c:662) [ 634.406824] kasan_report_error (mm/kasan/report.c:138 mm/kasan/report.c:236) [ 634.409581] __asan_report_load4_noabort (mm/kasan/report.c:279) [ 634.411355] x25_asy_open_tty (drivers/net/wan/x25_asy.c:559 (discriminator 1)) [ 634.413997] tty_ldisc_open.isra.2 (drivers/tty/tty_ldisc.c:447) [ 634.414549] tty_set_ldisc (drivers/tty/tty_ldisc.c:567) [ 634.415057] tty_ioctl (drivers/tty/tty_io.c:2646 drivers/tty/tty_io.c:2879) [ 634.423524] do_vfs_ioctl (fs/ioctl.c:43 fs/ioctl.c:607) [ 634.427491] SyS_ioctl (fs/ioctl.c:622 fs/ioctl.c:613) [ 634.427945] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:188) Cc: Tilman Schmidt Cc: Sasha Levin Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman Change-Id: Ibed6feadfb9706d478f93feec3b240aecfc64af3 Bug: 30951112 --- drivers/tty/tty_ldisc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 2d822aa259b2f7..2bf08366cd5b3d 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -414,6 +414,10 @@ EXPORT_SYMBOL_GPL(tty_ldisc_flush); * they are not on hot paths so a little discipline won't do * any harm. * + * The line discipline-related tty_struct fields are reset to + * prevent the ldisc driver from re-using stale information for + * the new ldisc instance. + * * Locking: takes termios_rwsem */ @@ -422,6 +426,9 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) down_write(&tty->termios_rwsem); tty->termios.c_line = num; up_write(&tty->termios_rwsem); + + tty->disc_data = NULL; + tty->receive_room = 0; } /** From 013635af55397e467c3950f1ed5ad48881861c53 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Fri, 30 Oct 2015 16:57:00 -0700 Subject: [PATCH 399/420] UPSTREAM: sg: Fix double-free when drives detach during SG_IO (cherry picked from commit f3951a3709ff50990bf3e188c27d346792103432) In sg_common_write(), we free the block request and return -ENODEV if the device is detached in the middle of the SG_IO ioctl(). Unfortunately, sg_finish_rem_req() also tries to free srp->rq, so we end up freeing rq->cmd in the already free rq object, and then free the object itself out from under the current user. This ends up corrupting random memory via the list_head on the rq object. The most common crash trace I saw is this: ------------[ cut here ]------------ kernel BUG at block/blk-core.c:1420! Call Trace: [] blk_put_request+0x5b/0x80 [] sg_finish_rem_req+0x6b/0x120 [sg] [] sg_common_write.isra.14+0x459/0x5a0 [sg] [] ? selinux_file_alloc_security+0x48/0x70 [] sg_new_write.isra.17+0x195/0x2d0 [sg] [] sg_ioctl+0x644/0xdb0 [sg] [] do_vfs_ioctl+0x90/0x520 [] ? file_has_perm+0x97/0xb0 [] SyS_ioctl+0x91/0xb0 [] tracesys+0xdd/0xe2 RIP [] __blk_put_request+0x154/0x1a0 The solution is straightforward: just set srp->rq to NULL in the failure branch so that sg_finish_rem_req() doesn't attempt to re-free it. Additionally, since sg_rq_end_io() will never be called on the object when this happens, we need to free memory backing ->cmd if it isn't embedded in the object itself. KASAN was extremely helpful in finding the root cause of this bug. Signed-off-by: Calvin Owens Acked-by: Douglas Gilbert Signed-off-by: Martin K. 
Petersen Change-Id: I905fb1e66eff9a919e5059934d5165acb6c39980 Bug: 30951599 --- drivers/scsi/sg.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 60354449d9ed1c..947a59273574cb 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -787,8 +787,14 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, return k; /* probably out of space --> ENOMEM */ } if (atomic_read(&sdp->detaching)) { - if (srp->bio) + if (srp->bio) { + if (srp->rq->cmd != srp->rq->__cmd) + kfree(srp->rq->cmd); + blk_end_request_all(srp->rq, -EIO); + srp->rq = NULL; + } + sg_finish_rem_req(srp); return -ENODEV; } From 8d6531f81176125eee3f228b980d04eeddc1f030 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 19 Jan 2016 12:34:58 +0100 Subject: [PATCH 400/420] UPSTREAM: HID: core: prevent out-of-bound readings (cherry picked from commit 50220dead1650609206efe91f0cc116132d59b3f) Plugging a Logitech DJ receiver with KASAN activated raises a bunch of out-of-bound readings. The fields are allocated up to MAX_USAGE, meaning that potentially, we do not have enough fields to fit the incoming values. Add checks and silence KASAN. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Change-Id: Iaf25e882a6696884439d7091b5fbb0b350d893d3 Bug: 30951261 --- drivers/hid/hid-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 3402033fa52a72..78894befe2b0e8 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1213,6 +1213,7 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, /* Ignore report if ErrorRollOver */ if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && value[n] >= min && value[n] <= max && + value[n] - min < field->maxusage && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) goto exit; } @@ -1225,11 +1226,13 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, } if (field->value[n] >= min && field->value[n] <= max + && field->value[n] - min < field->maxusage && field->usage[field->value[n] - min].hid && search(value, field->value[n], count)) hid_process_event(hid, field, &field->usage[field->value[n] - min], 0, interrupt); if (value[n] >= min && value[n] <= max + && value[n] - min < field->maxusage && field->usage[value[n] - min].hid && search(field->value, value[n], count)) hid_process_event(hid, field, &field->usage[value[n] - min], 1, interrupt); From 49b4271a7a0159f3fc4a508a136b4c60abd9932f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 1 Jul 2016 00:39:35 -0700 Subject: [PATCH 401/420] UPSTREAM: block: fix use-after-free in sys_ioprio_get() (cherry picked from commit 8ba8682107ee2ca3347354e018865d8e1967c5f4) get_task_ioprio() accesses the task->io_context without holding the task lock and thus can race with exit_io_context(), leading to a use-after-free. 
The reproducer below hits this within a few seconds on my 4-core QEMU VM: #define _GNU_SOURCE #include #include #include #include int main(int argc, char **argv) { pid_t pid, child; long nproc, i; /* ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); */ syscall(SYS_ioprio_set, 1, 0, 0x6000); nproc = sysconf(_SC_NPROCESSORS_ONLN); for (i = 0; i < nproc; i++) { pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { pid = fork(); assert(pid != -1); if (pid == 0) { _exit(0); } else { child = wait(NULL); assert(child == pid); } } } pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } } } for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } return 0; } This gets us KASAN dumps like this: [ 35.526914] ================================================================== [ 35.530009] BUG: KASAN: out-of-bounds in get_task_ioprio+0x7b/0x90 at addr ffff880066f34e6c [ 35.530009] Read of size 2 by task ioprio-gpf/363 [ 35.530009] ============================================================================= [ 35.530009] BUG blkdev_ioc (Not tainted): kasan: bad access detected [ 35.530009] ----------------------------------------------------------------------------- [ 35.530009] Disabling lock debugging due to kernel taint [ 35.530009] INFO: Allocated in create_task_io_context+0x2b/0x370 age=0 cpu=0 pid=360 [ 35.530009] ___slab_alloc+0x55d/0x5a0 [ 35.530009] __slab_alloc.isra.20+0x2b/0x40 [ 35.530009] kmem_cache_alloc_node+0x84/0x200 [ 35.530009] create_task_io_context+0x2b/0x370 [ 35.530009] get_task_io_context+0x92/0xb0 [ 35.530009] copy_process.part.8+0x5029/0x5660 [ 35.530009] _do_fork+0x155/0x7e0 [ 35.530009] SyS_clone+0x19/0x20 [ 35.530009] do_syscall_64+0x195/0x3a0 [ 35.530009] return_from_SYSCALL_64+0x0/0x6a [ 35.530009] INFO: Freed in put_io_context+0xe7/0x120 age=0 cpu=0 pid=1060 [ 35.530009] __slab_free+0x27b/0x3d0 [ 35.530009] kmem_cache_free+0x1fb/0x220 [ 35.530009] put_io_context+0xe7/0x120 [ 35.530009] put_io_context_active+0x238/0x380 [ 35.530009] exit_io_context+0x66/0x80 [ 35.530009] do_exit+0x158e/0x2b90 [ 35.530009] do_group_exit+0xe5/0x2b0 [ 35.530009] SyS_exit_group+0x1d/0x20 [ 35.530009] entry_SYSCALL_64_fastpath+0x1a/0xa4 [ 35.530009] INFO: Slab 0xffffea00019bcd00 objects=20 used=4 fp=0xffff880066f34ff0 flags=0x1fffe0000004080 [ 35.530009] INFO: Object 0xffff880066f34e58 @offset=3672 fp=0x0000000000000001 [ 35.530009] ================================================================== Fix it by grabbing the task lock while we poke at the io_context. Cc: stable@vger.kernel.org Reported-by: Dmitry Vyukov Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Change-Id: I3f5858cc9a1b9d4124ae7a6578660dec219d2c57 Bug: 30946378 --- block/ioprio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/ioprio.c b/block/ioprio.c index 31666c92b46af2..563435684c3c15 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -149,8 +149,10 @@ static int get_task_ioprio(struct task_struct *p) if (ret) goto out; ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + task_unlock(p); out: return ret; } From 2d7c630bb31aa96da30ccaa2c4be6c4c3bef59e0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 Sep 2016 11:56:01 -0700 Subject: [PATCH 402/420] UPSTREAM: x86/uaccess: force copy_*_user() to be inlined As already done with __copy_*_user(), mark copy_*_user() as __always_inline. 
Without this, the checks for things like __builtin_const_p() won't work consistently in either hardened usercopy nor the recent adjustments for detecting usercopy overflows at compile time. The change in kernel text size is detectable, but very small: text data bss dec hex filename 12118735 5768608 14229504 32116847 1ea106f vmlinux.before 12120207 5768608 14229504 32118319 1ea162f vmlinux.after Signed-off-by: Kees Cook Change-Id: I284c85c2a782145f46655a91d4f83874c90eba61 (cherry picked from commit e6971009a95a74f28c58bbae415c40effad1226c) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index fe1f8bfc27870b..8e8224eed86422 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -701,7 +701,7 @@ __copy_from_user_overflow(int size, unsigned long count) #endif -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { int sz = __compiletime_object_size(to); @@ -737,7 +737,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); From 7b9b4f3db2ae4521a048a6d0235a25fc383e038f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 31 Aug 2016 16:04:21 -0700 Subject: [PATCH 403/420] BACKPORT: usercopy: fold builtin_const check into inline function Instead of having each caller of check_object_size() need to remember to check for a const size parameter, move the check into check_object_size() itself. This actually matches the original implementation in PaX, though this commit cleans up the now-redundant builtin_const() calls in the various architectures. Signed-off-by: Kees Cook Change-Id: I348809399c10ffa051251866063be674d064b9ff (cherry picked from 81409e9e28058811c9ea865345e1753f8f677e44) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0ae29ff9ccfde0..eded095fe81e5c 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -161,7 +161,8 @@ extern void __check_object_size(const void *ptr, unsigned long n, static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { - __check_object_size(ptr, n, to_user); + if (!__builtin_constant_p(n)) + __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, From ac7860df2dfde83f2cc237aa197ea543bc8739d9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:39:32 -0700 Subject: [PATCH 404/420] UPSTREAM: usercopy: force check_object_size() inline Just for good measure, make sure that check_object_size() is always inlined too, as already done for copy_*_user() and __copy_*_user(). 
Suggested-by: Linus Torvalds Signed-off-by: Kees Cook Change-Id: Ibfdf4790d03fe426e68d9a864c55a0d1bbfb7d61 (cherry picked from commit a85d6b8242dc78ef3f4542a0f979aebcbe77fc4e) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index eded095fe81e5c..4cf89517783ab8 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -158,8 +158,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); -static inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) +static __always_inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); From d6912fa2dd90963280c6ba02ba4c387b05941946 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:54:34 -0700 Subject: [PATCH 405/420] UPSTREAM: usercopy: remove page-spanning test for now A custom allocator without __GFP_COMP that copies to userspace has been found in vmw_execbuf_process[1], so this disables the page-span checker by placing it behind a CONFIG for future work where such things can be tracked down later. [1] https://bugzilla.redhat.com/show_bug.cgi?id=1373326 Reported-by: Vinson Lee Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Kees Cook Change-Id: I4177c0fb943f14a5faf5c70f5e54bf782c316f43 (cherry picked from commit 8e1f74ea02cf4562404c48c6882214821552c13f) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 61 +++++++++++++++++++++++++++--------------------- security/Kconfig | 11 +++++++++ 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index f78015e8b1e5cc..b34996a3860b0b 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -135,30 +135,15 @@ static inline const char *check_bogus_address(const void *ptr, unsigned long n) return NULL; } -static inline const char *check_heap_object(const void *ptr, unsigned long n, - bool to_user) +/* Checks for allocs that are marked in some way as spanning multiple pages. */ +static inline const char *check_page_span(const void *ptr, unsigned long n, + struct page *page, bool to_user) { - struct page *page, *endpage; +#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN const void *end = ptr + n - 1; + struct page *endpage; bool is_reserved, is_cma; - /* - * Some architectures (arm64) return true for virt_addr_valid() on - * vmalloced addresses. Work around this by checking for vmalloc - * first. - */ - if (is_vmalloc_addr(ptr)) - return NULL; - - if (!virt_addr_valid(ptr)) - return NULL; - - page = virt_to_head_page(ptr); - - /* Check slab allocator for flags and size. */ - if (PageSlab(page)) - return __check_heap_object(ptr, n, page); - /* * Sometimes the kernel data regions are not marked Reserved (see * check below). And sometimes [_sdata,_edata) does not cover @@ -187,7 +172,7 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, ((unsigned long)end & (unsigned long)PAGE_MASK))) return NULL; - /* Allow if start and end are inside the same compound page. */ + /* Allow if fully inside the same compound (__GFP_COMP) page. 
*/ endpage = virt_to_head_page(end); if (likely(endpage == page)) return NULL; @@ -200,20 +185,44 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, is_reserved = PageReserved(page); is_cma = is_migrate_cma_page(page); if (!is_reserved && !is_cma) - goto reject; + return ""; for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { page = virt_to_head_page(ptr); if (is_reserved && !PageReserved(page)) - goto reject; + return ""; if (is_cma && !is_migrate_cma_page(page)) - goto reject; + return ""; } +#endif return NULL; +} + +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); -reject: - return ""; + /* Verify object does not incorrectly span multiple pages. */ + return check_page_span(ptr, n, page, to_user); } /* diff --git a/security/Kconfig b/security/Kconfig index 9b3773aa2a2060..601882ce946a9b 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -155,6 +155,17 @@ config HARDENED_USERCOPY or are part of the kernel text. This kills entire classes of heap overflow exploits and similar kernel memory exposures. +config HARDENED_USERCOPY_PAGESPAN + bool "Refuse to copy allocations that span multiple pages" + depends on HARDENED_USERCOPY + depends on !COMPILE_TEST + help + When a multi-page allocation is done without __GFP_COMP, + hardened usercopy will reject attempts to copy it. There are, + however, several cases of this in the kernel that have not all + been removed. This config is intended to be used only while + trying to find such users. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From 3230c5d2d58f59d91e60e751a78fb6dbfcb8d1b2 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 31 Aug 2016 08:09:04 -0700 Subject: [PATCH 406/420] FROMLIST: pstore: drop pmsg bounce buffer (from https://lkml.org/lkml/2016/9/1/428) (cherry pick from android-3.10 commit b58133100b38f2bf83cad2d7097417a3a196ed0b) Removing a bounce buffer copy operation in the pmsg driver path is always better. We also gain in overall performance by not requesting a vmalloc on every write as this can cause precious RT tasks, such as user facing media operation, to stall while memory is being reclaimed. Added a write_buf_user to the pstore functions, a backup platform write_buf_user that uses the small buffer that is part of the instance, and implemented a ramoops write_buf_user that only supports PSTORE_TYPE_PMSG. 
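For context only, and not part of the patch itself: a minimal userspace sketch of the write path this change speeds up, assuming the standard /dev/pmsg0 node created by the pstore pmsg front end; the message text and error handling are illustrative.

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	static const char msg[] = "pmsg example record\n";
	int fd = open("/dev/pmsg0", O_WRONLY);

	if (fd < 0)
		return 1;
	/* Each write() enters write_pmsg(); with this patch the payload is
	 * copied from userspace directly into the persistent RAM zone
	 * instead of bouncing through a vmalloc'd kernel buffer. */
	if (write(fd, msg, sizeof(msg) - 1) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}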
Signed-off-by: Mark Salyzyn Bug: 31057326 Change-Id: I4cdee1cd31467aa3e6c605bce2fbd4de5b0f8caa --- fs/pstore/platform.c | 36 +++++++++++++++++++++++++++++ fs/pstore/pmsg.c | 35 +++++----------------------- fs/pstore/ram.c | 19 +++++++++++++++ fs/pstore/ram_core.c | 47 ++++++++++++++++++++++++++++++++++++-- include/linux/pstore.h | 11 ++++++--- include/linux/pstore_ram.h | 7 ++++-- 6 files changed, 119 insertions(+), 36 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 15ee78c5020b24..ad1fe993d2d072 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -404,6 +404,40 @@ static int pstore_write_compat(enum pstore_type_id type, size, psi); } +static int pstore_write_buf_user_compat(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + unsigned long flags = 0; + size_t i, bufsize = size; + long ret = 0; + + if (unlikely(!access_ok(VERIFY_READ, buf, size))) + return -EFAULT; + if (bufsize > psinfo->bufsize) + bufsize = psinfo->bufsize; + spin_lock_irqsave(&psinfo->buf_lock, flags); + for (i = 0; i < size; ) { + size_t c = min(size - i, bufsize); + + ret = __copy_from_user(psinfo->buf, buf + i, c); + if (unlikely(ret != 0)) { + ret = -EFAULT; + break; + } + ret = psi->write_buf(type, reason, id, part, psinfo->buf, + compressed, c, psi); + if (unlikely(ret < 0)) + break; + i += c; + } + spin_unlock_irqrestore(&psinfo->buf_lock, flags); + return unlikely(ret < 0) ? ret : size; +} + /* * platform specific persistent storage driver registers with * us here. If pstore is already mounted, call the platform @@ -428,6 +462,8 @@ int pstore_register(struct pstore_info *psi) if (!psi->write) psi->write = pstore_write_compat; + if (!psi->write_buf_user) + psi->write_buf_user = pstore_write_buf_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 5a2f05a16c1e3a..64b97a2966d959 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -19,48 +19,25 @@ #include "internal.h" static DEFINE_MUTEX(pmsg_lock); -#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - size_t i, buffer_size; - char *buffer; + u64 id; + int ret; if (!count) return 0; + /* check outside lock, page in any data. write_buf_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - buffer_size = count; - if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) - buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; - buffer = vmalloc(buffer_size); - if (!buffer) - return -ENOMEM; - mutex_lock(&pmsg_lock); - for (i = 0; i < count; ) { - size_t c = min(count - i, buffer_size); - u64 id; - long ret; - - ret = __copy_from_user(buffer, buf + i, c); - if (unlikely(ret != 0)) { - mutex_unlock(&pmsg_lock); - vfree(buffer); - return -EFAULT; - } - psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, - psinfo); - - i += c; - } - + ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, + psinfo); mutex_unlock(&pmsg_lock); - vfree(buffer); - return count; + return ret ? 
ret : count; } static const struct file_operations pmsg_fops = { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 72b4252562ce83..cfa0a127febe0a 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -299,6 +299,24 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return 0; } +static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + if (type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = psi->data; + + if (!cxt->mprz) + return -ENOMEM; + return persistent_ram_write_user(cxt->mprz, buf, size); + } + + return -EINVAL; +} + static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi) { @@ -337,6 +355,7 @@ static struct ramoops_context oops_cxt = { .open = ramoops_pstore_open, .read = ramoops_pstore_read, .write_buf = ramoops_pstore_write_buf, + .write_buf_user = ramoops_pstore_write_buf_user, .erase = ramoops_pstore_erase, }, }; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 9d7b9a83699e42..8b89f164c28c57 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -17,15 +17,16 @@ #include #include #include -#include #include #include +#include #include #include +#include #include #include +#include #include -#include #include struct persistent_ram_buffer { @@ -303,6 +304,16 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz, persistent_ram_update_ecc(prz, start, count); } +static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ? + -EFAULT : 0; + persistent_ram_update_ecc(prz, start, count); + return ret; +} + void persistent_ram_save_old(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; @@ -356,6 +367,38 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz, return count; } +int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count) +{ + int rem, ret = 0, c = count; + size_t start; + + if (unlikely(!access_ok(VERIFY_READ, s, count))) + return -EFAULT; + if (unlikely(c > prz->buffer_size)) { + s += c - prz->buffer_size; + c = prz->buffer_size; + } + + buffer_size_add(prz, c); + + start = buffer_start_add(prz, c); + + rem = prz->buffer_size - start; + if (unlikely(rem < c)) { + ret = persistent_ram_update_user(prz, s, start, rem); + s += rem; + c -= rem; + start = 0; + } + if (likely(!ret)) + ret = persistent_ram_update_user(prz, s, start, c); + + persistent_ram_update_header_ecc(prz); + + return unlikely(ret) ? 
ret : count; +} + size_t persistent_ram_old_size(struct persistent_ram_zone *prz) { return prz->old_log_size; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 8884f6e507f7c7..2eec52d597ce36 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -22,12 +22,13 @@ #ifndef _LINUX_PSTORE_H #define _LINUX_PSTORE_H -#include +#include +#include #include #include -#include #include -#include +#include +#include /* types */ enum pstore_type_id { @@ -66,6 +67,10 @@ struct pstore_info { enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char *buf, bool compressed, size_t size, struct pstore_info *psi); + int (*write_buf_user)(enum pstore_type_id type, + enum kmsg_dump_reason reason, u64 *id, + unsigned int part, const char __user *buf, + bool compressed, size_t size, struct pstore_info *psi); int (*erase)(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi); diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 2c8e5dd7a762b8..88790cf63ff382 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -17,11 +17,12 @@ #ifndef __LINUX_PSTORE_RAM_H__ #define __LINUX_PSTORE_RAM_H__ +#include #include +#include #include #include #include -#include struct persistent_ram_buffer; struct rs_control; @@ -58,7 +59,9 @@ void persistent_ram_free(struct persistent_ram_zone *prz); void persistent_ram_zap(struct persistent_ram_zone *prz); int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, - unsigned int count); + unsigned int count); +int persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count); void persistent_ram_save_old(struct persistent_ram_zone *prz); size_t persistent_ram_old_size(struct persistent_ram_zone *prz); From 7d2542f24942fbdb87064168d0106f54f1b8f676 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 15 Mar 2016 12:14:49 +0100 Subject: [PATCH 407/420] BACKPORT: ALSA: usb-audio: Minor code cleanup in create_fixed_stream_quirk() (cherry picked from commit 902eb7fd1e4af3ac69b9b30f8373f118c92b9729) Just a minor code cleanup: unify the error paths. Signed-off-by: Takashi Iwai Change-Id: I8253a86235df2ac1258153c9e128fa158527567f Bug: 30952477 --- sound/usb/quirks.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 60dfe0d28771bb..387ee38b7d4f7a 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -155,16 +155,12 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, stream = (fp->endpoint & USB_DIR_IN) ? 
SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; err = snd_usb_add_audio_stream(chip, stream, fp); - if (err < 0) { - kfree(fp); - kfree(rate_table); - return err; - } + if (err < 0) + goto error; if (fp->iface != get_iface_desc(&iface->altsetting[0])->bInterfaceNumber || fp->altset_idx >= iface->num_altsetting) { - kfree(fp); - kfree(rate_table); - return -EINVAL; + err = -EINVAL; + goto error; } alts = &iface->altsetting[fp->altset_idx]; altsd = get_iface_desc(alts); @@ -178,6 +174,11 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, snd_usb_init_pitch(chip, fp->iface, alts, fp); snd_usb_init_sample_rate(chip, fp->iface, alts, fp, fp->rate_max); return 0; + + error: + kfree(fp); + kfree(rate_table); + return err; } static int create_auto_pcm_quirk(struct snd_usb_audio *chip, From ed1dbfc307117a31302af0d067138dad3abbe133 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Thu, 31 Mar 2016 12:05:43 -0400 Subject: [PATCH 408/420] UPSTREAM: ALSA: usb-audio: Fix double-free in error paths after snd_usb_add_audio_stream() call (cherry picked from commit 836b34a935abc91e13e63053d0a83b24dfb5ea78) create_fixed_stream_quirk(), snd_usb_parse_audio_interface() and create_uaxx_quirk() functions allocate the audioformat object by themselves and free it upon error before returning. However, once the object is linked to a stream, it's freed again in snd_usb_audio_pcm_free(), thus it'll be double-freed, eventually resulting in a memory corruption. This patch fixes these failures in the error paths by unlinking the audioformat object before freeing it. Based on a patch by Takashi Iwai [Note for stable backports: this patch requires the commit 902eb7fd1e4a ('ALSA: usb-audio: Minor code cleanup in create_fixed_stream_quirk()')] Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1283358 Reported-by: Ralf Spenneberg Cc: # see the note above Signed-off-by: Vladis Dronov Signed-off-by: Takashi Iwai Change-Id: I7073a17d8c99886d2f6ed7981892712ba7dd5873 Bug: 30952477 --- sound/usb/quirks.c | 4 ++++ sound/usb/stream.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 387ee38b7d4f7a..725f44df7b9233 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -138,6 +138,7 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, usb_audio_err(chip, "cannot memdup\n"); return -ENOMEM; } + INIT_LIST_HEAD(&fp->list); if (fp->nr_rates > MAX_NR_RATES) { kfree(fp); return -EINVAL; @@ -176,6 +177,7 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, return 0; error: + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); kfree(rate_table); return err; @@ -451,6 +453,7 @@ static int create_uaxx_quirk(struct snd_usb_audio *chip, fp->ep_attr = get_endpoint(alts, 0)->bmAttributes; fp->datainterval = 0; fp->maxpacksize = le16_to_cpu(get_endpoint(alts, 0)->wMaxPacketSize); + INIT_LIST_HEAD(&fp->list); switch (fp->maxpacksize) { case 0x120: @@ -474,6 +477,7 @@ static int create_uaxx_quirk(struct snd_usb_audio *chip, ? 
SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; err = snd_usb_add_audio_stream(chip, stream, fp); if (err < 0) { + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); return err; } diff --git a/sound/usb/stream.c b/sound/usb/stream.c index 310a3822d2b72b..25e8075f9ea3f8 100644 --- a/sound/usb/stream.c +++ b/sound/usb/stream.c @@ -315,7 +315,9 @@ static struct snd_pcm_chmap_elem *convert_chmap(int channels, unsigned int bits, /* * add this endpoint to the chip instance. * if a stream with the same endpoint already exists, append to it. - * if not, create a new pcm stream. + * if not, create a new pcm stream. note, fp is added to the substream + * fmt_list and will be freed on the chip instance release. do not free + * fp or do remove it from the substream fmt_list to avoid double-free. */ int snd_usb_add_audio_stream(struct snd_usb_audio *chip, int stream, @@ -668,6 +670,7 @@ int snd_usb_parse_audio_interface(struct snd_usb_audio *chip, int iface_no) * (fp->maxpacksize & 0x7ff); fp->attributes = parse_uac_endpoint_attributes(chip, alts, protocol, iface_no); fp->clock = clock; + INIT_LIST_HEAD(&fp->list); /* some quirks for attributes here */ @@ -716,6 +719,7 @@ int snd_usb_parse_audio_interface(struct snd_usb_audio *chip, int iface_no) dev_dbg(&dev->dev, "%u:%d: add audio endpoint %#x\n", iface_no, altno, fp->endpoint); err = snd_usb_add_audio_stream(chip, stream, fp); if (err < 0) { + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp->rate_table); kfree(fp->chmap); kfree(fp); From cbad12fbe2c1b8efe6428960898491617ad255f9 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 23 Aug 2016 11:32:37 -0700 Subject: [PATCH 409/420] ANDROID: dm: android-verity: Allow android-verity to be compiled as an independent module Exports the device mapper callbacks of linear and dm-verity-target methods. (Cherry-picked from https://android-review.googlesource.com/#/c/261333/) Signed-off-by: Badhri Jagan Sridharan Change-Id: I0358be0615c431dce3cc78575aaac4ccfe3aacd7 --- drivers/md/Kconfig | 3 ++- drivers/md/Makefile | 5 +---- drivers/md/dm-linear.c | 6 ++++++ drivers/md/dm-verity-target.c | 7 +++++++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2dbd068474dd95..00c43e29168007 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -419,7 +419,7 @@ config DM_VERITY If unsure, say N. config DM_ANDROID_VERITY - bool "Android verity target support" + tristate "Android verity target support" depends on DM_VERITY depends on X509_CERTIFICATE_PARSER depends on SYSTEM_TRUSTED_KEYRING @@ -427,6 +427,7 @@ config DM_ANDROID_VERITY depends on KEYS depends on ASYMMETRIC_KEY_TYPE depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + depends on MD_LINEAR ---help--- This device-mapper target is virtually a VERITY target. 
This target is setup by reading the metadata contents piggybacked diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 07604e94bf5e61..efea0935a4d092 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o +obj-$(CONFIG_DM_ANDROID_VERITY) += dm-android-verity.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o @@ -64,7 +65,3 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif - -ifeq ($(CONFIG_DM_ANDROID_VERITY),y) -dm-verity-objs += dm-android-verity.o -endif diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index d42f1093ad60fc..a734af0db45ac5 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -63,6 +63,7 @@ int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) kfree(lc); return -EINVAL; } +EXPORT_SYMBOL_GPL(dm_linear_ctr); void dm_linear_dtr(struct dm_target *ti) { @@ -71,6 +72,7 @@ void dm_linear_dtr(struct dm_target *ti) dm_put_device(ti, lc->dev); kfree(lc); } +EXPORT_SYMBOL_GPL(dm_linear_dtr); static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { @@ -95,6 +97,7 @@ int dm_linear_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +EXPORT_SYMBOL_GPL(dm_linear_map); void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -112,6 +115,7 @@ void dm_linear_status(struct dm_target *ti, status_type_t type, break; } } +EXPORT_SYMBOL_GPL(dm_linear_status); int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) @@ -144,6 +148,7 @@ int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +EXPORT_SYMBOL_GPL(dm_linear_prepare_ioctl); int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -152,6 +157,7 @@ int dm_linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } +EXPORT_SYMBOL_GPL(dm_linear_iterate_devices); static struct target_type linear_target = { .name = "linear", diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 8b747528b02d40..98ee3786a1cabd 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -593,6 +593,7 @@ int verity_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +EXPORT_SYMBOL_GPL(verity_map); /* * Status: V (valid) or C (corruption found) @@ -656,6 +657,7 @@ void verity_status(struct dm_target *ti, status_type_t type, break; } } +EXPORT_SYMBOL_GPL(verity_status); int verity_ioctl(struct dm_target *ti, unsigned cmd, unsigned long arg) @@ -685,6 +687,7 @@ int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +EXPORT_SYMBOL_GPL(verity_prepare_ioctl); int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -693,6 +696,7 @@ int verity_iterate_devices(struct dm_target *ti, return fn(ti, v->data_dev, v->data_start, ti->len, data); } +EXPORT_SYMBOL_GPL(verity_iterate_devices); void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { @@ -706,6 +710,7 @@ void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, limits->logical_block_size); } +EXPORT_SYMBOL_GPL(verity_io_hints); void 
verity_dtr(struct dm_target *ti) { @@ -736,6 +741,7 @@ void verity_dtr(struct dm_target *ti) kfree(v); } +EXPORT_SYMBOL_GPL(verity_dtr); static int verity_alloc_zero_digest(struct dm_verity *v) { @@ -1072,6 +1078,7 @@ int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return r; } +EXPORT_SYMBOL_GPL(verity_ctr); static struct target_type verity_target = { .name = "verity", From 7877b96891c57188cd2e0452a16badcf381dfeb1 Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Sat, 29 Nov 2014 01:33:30 +0800 Subject: [PATCH 410/420] BACKPORT: ARM: 8235/1: Support for the PXN CPU feature on ARMv7 Modern ARMv7-A/R cores optionally implement below new hardware feature: - PXN: Privileged execute-never(PXN) is a security feature. PXN bit determines whether the processor can execute software from the region. This is effective solution against ret2usr attack. On an implementation that does not include the LPAE, PXN is optionally supported. This patch set PXN bit on user page table for preventing user code execution with privilege mode. Reviewed-by: Catalin Marinas Signed-off-by: Jungseung Lee Signed-off-by: Russell King Bug: 31161206 Change-Id: Id87d6e8b1874f7976e0e48e4b8bb0e65856b4382 --- arch/arm/include/asm/pgalloc.h | 10 +++++++- arch/arm/include/asm/pgtable-2level-hwdef.h | 2 ++ arch/arm/include/asm/pgtable-3level-hwdef.h | 1 + arch/arm/mm/mmu.c | 27 +++++++++++++++++++++ 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index 78a779361682ad..46e879defec5a3 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -157,7 +157,15 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) { - __pmd_populate(pmdp, page_to_phys(ptep), _PAGE_USER_TABLE); + extern pmdval_t user_pmd_table; + pmdval_t prot; + + if (__LINUX_ARM_ARCH__ >= 6 && !IS_ENABLED(CONFIG_ARM_LPAE)) + prot = user_pmd_table; + else + prot = _PAGE_USER_TABLE; + + __pmd_populate(pmdp, page_to_phys(ptep), prot); } #define pmd_pgtable(pmd) pmd_page(pmd) diff --git a/arch/arm/include/asm/pgtable-2level-hwdef.h b/arch/arm/include/asm/pgtable-2level-hwdef.h index 5cfba15cb401e5..6c0bb405f69fb3 100644 --- a/arch/arm/include/asm/pgtable-2level-hwdef.h +++ b/arch/arm/include/asm/pgtable-2level-hwdef.h @@ -20,12 +20,14 @@ #define PMD_TYPE_FAULT (_AT(pmdval_t, 0) << 0) #define PMD_TYPE_TABLE (_AT(pmdval_t, 1) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 2) << 0) +#define PMD_PXNTABLE (_AT(pmdval_t, 1) << 2) /* v7 */ #define PMD_BIT4 (_AT(pmdval_t, 1) << 4) #define PMD_DOMAIN(x) (_AT(pmdval_t, (x)) << 5) #define PMD_PROTECTION (_AT(pmdval_t, 1) << 9) /* v5 */ /* * - section */ +#define PMD_SECT_PXN (_AT(pmdval_t, 1) << 0) /* v7 */ #define PMD_SECT_BUFFERABLE (_AT(pmdval_t, 1) << 2) #define PMD_SECT_CACHEABLE (_AT(pmdval_t, 1) << 3) #define PMD_SECT_XN (_AT(pmdval_t, 1) << 4) /* v6 */ diff --git a/arch/arm/include/asm/pgtable-3level-hwdef.h b/arch/arm/include/asm/pgtable-3level-hwdef.h index 9fd61c72a33a14..db54640e591d4c 100644 --- a/arch/arm/include/asm/pgtable-3level-hwdef.h +++ b/arch/arm/include/asm/pgtable-3level-hwdef.h @@ -76,6 +76,7 @@ #define PTE_EXT_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */ #define PTE_EXT_AF (_AT(pteval_t, 1) << 10) /* Access Flag */ #define PTE_EXT_NG (_AT(pteval_t, 1) << 11) /* nG */ +#define PTE_EXT_PXN (_AT(pteval_t, 1) << 53) /* PXN */ #define PTE_EXT_XN (_AT(pteval_t, 1) << 54) /* XN */ /* 
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index a7b12cb21e816e..33da5c75b6cb61 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -53,6 +53,8 @@ EXPORT_SYMBOL(empty_zero_page); */ pmd_t *top_pmd; +pmdval_t user_pmd_table = _PAGE_USER_TABLE; + #define CPOLICY_UNCACHED 0 #define CPOLICY_BUFFERED 1 #define CPOLICY_WRITETHROUGH 2 @@ -466,6 +468,26 @@ static void __init build_mem_type_table(void) for (i = 0; i < ARRAY_SIZE(mem_types); i++) mem_types[i].prot_sect &= ~PMD_SECT_S; +#ifndef CONFIG_ARM_LPAE + /* + * We don't use domains on ARMv6 (since this causes problems with + * v6/v7 kernels), so we must use a separate memory type for user + * r/o, kernel r/w to map the vectors page. + */ + if (cpu_arch == CPU_ARCH_ARMv6) + vecs_pgprot |= L_PTE_MT_VECTORS; + + + /* + * Check is it with support for the PXN bit + * in the Short-descriptor translation table format descriptors. + */ + if (cpu_arch == CPU_ARCH_ARMv7 && + (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { + user_pmd_table |= PMD_PXNTABLE; + } +#endif + /* * ARMv5 and lower, bit 4 must be set for page tables (was: cache * "update-able on write" bit on ARM610). However, Xscale and @@ -629,6 +651,11 @@ static void __init build_mem_type_table(void) } kern_pgprot |= PTE_EXT_AF; vecs_pgprot |= PTE_EXT_AF; + + /* + * Set PXN for user mappings + */ + user_pgprot |= PTE_EXT_PXN; #endif for (i = 0; i < 16; i++) { From 191df7572463700d6c086b036707bf890eb93c8b Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Tue, 29 Dec 2015 04:47:00 +0800 Subject: [PATCH 411/420] UPSTREAM: ARM: 8494/1: mm: Enable PXN when running non-LPAE kernel on LPAE processor The VMSA field of MMFR0 (bottom 4 bits) is incremented for each added feature. PXN is supported if the value is >= 4 and LPAE is supported if it is >= 5. In case a kernel with CONFIG_ARM_LPAE disabled is used on a processor that supports LPAE, we can still use PXN in short descriptors. So check for >= 4 not == 4. Signed-off-by: Jungseung Lee Acked-by: Catalin Marinas Signed-off-by: Ben Hutchings Signed-off-by: Russell King Signed-off-by: citypw Bug: 31161206 Change-Id: Ie4bfc062b72519ec9c8d2a0baf0353526033716d --- arch/arm/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 33da5c75b6cb61..24cbdb92f0e17b 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -483,7 +483,7 @@ static void __init build_mem_type_table(void) * in the Short-descriptor translation table format descriptors. */ if (cpu_arch == CPU_ARCH_ARMv7 && - (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { + (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) { user_pmd_table |= PMD_PXNTABLE; } #endif From 5753ff28d17e3b947ae8a94e8aaa456f42425136 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 12 Sep 2016 11:54:22 -0700 Subject: [PATCH 412/420] ANDROID: dm: Fix symbol exports for dm target callbacks dm-linear and dm-verity-target had incorrect callbacks exposed through EXPORT_SYMBOL_GPL macros. This CL fixes them. Signed-off-by: Badhri Jagan Sridharan Change-Id: I6e4befab0b61f04829b27aaf31a457b63c1cf041 --- drivers/md/dm-linear.c | 3 ++- drivers/md/dm-verity-target.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index a734af0db45ac5..acffc181b56810 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -133,6 +133,7 @@ int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, return r ? 
: __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); } +EXPORT_SYMBOL_GPL(dm_linear_ioctl); int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, struct bio_vec *biovec, int max_size) @@ -148,7 +149,7 @@ int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } -EXPORT_SYMBOL_GPL(dm_linear_prepare_ioctl); +EXPORT_SYMBOL_GPL(dm_linear_merge); int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 98ee3786a1cabd..eb416ee1b9157e 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -672,6 +672,7 @@ int verity_ioctl(struct dm_target *ti, unsigned cmd, return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode, cmd, arg); } +EXPORT_SYMBOL_GPL(verity_ioctl); int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, struct bio_vec *biovec, int max_size) @@ -687,7 +688,7 @@ int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } -EXPORT_SYMBOL_GPL(verity_prepare_ioctl); +EXPORT_SYMBOL_GPL(verity_merge); int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) From 7447458426a45a8c5784942559612c37c9fa0693 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Dec 2015 13:49:05 +0100 Subject: [PATCH 413/420] UPSTREAM: perf: Fix race in swevent hash (cherry picked from commit 12ca6ad2e3a896256f086497a7c7406a547ee373) There's a race on CPU unplug where we free the swevent hash array while it can still have events on. This will result in a use-after-free which is BAD. Simply do not free the hash array on unplug. This leaves the thing around and no use-after-free takes place. When the last swevent dies, we do a for_each_possible_cpu() iteration anyway to clean these up, at which time we'll free it, so no leakage will occur. Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar Change-Id: I4972ce74211b6504ff61325c4a4f7b088306d1f9 Bug: 30952077 --- kernel/events/core.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index da4ed38336b39f..6ac16003c284de 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5659,9 +5659,6 @@ struct swevent_htable { /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; - - /* Keeps track of cpu being initialized/exited */ - bool online; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -5908,14 +5905,8 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); - if (!head) { - /* - * We can race with cpu hotplug code. Do not - * WARN if the cpu just got unplugged. 
- */ - WARN_ON_ONCE(swhash->online); + if (WARN_ON_ONCE(!head)) return -EINVAL; - } hlist_add_head_rcu(&event->hlist_entry, head); @@ -5982,7 +5973,6 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { struct swevent_hlist *hlist; @@ -8104,7 +8094,6 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - swhash->online = true; if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; @@ -8157,14 +8146,7 @@ static void perf_event_exit_cpu_context(int cpu) static void perf_event_exit_cpu(int cpu) { - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - perf_event_exit_cpu_context(cpu); - - mutex_lock(&swhash->hlist_mutex); - swhash->online = false; - swevent_hlist_release(swhash); - mutex_unlock(&swhash->hlist_mutex); } #else static inline void perf_event_exit_cpu(int cpu) { } From cd6ae1f3fa72b4d91d5c7346410d0fef28ddb6c0 Mon Sep 17 00:00:00 2001 From: Jeffrey Vander Stoep Date: Tue, 13 Sep 2016 14:42:58 +0000 Subject: [PATCH 414/420] Revert "UPSTREAM: ARM: 8494/1: mm: Enable PXN when running non-LPAE kernel on LPAE processor The VMSA field of MMFR0 (bottom 4 bits) is incremented for each added feature. PXN is supported if the value is >= 4 and LPAE is supported if it is >= 5." This reverts commit 191df7572463700d6c086b036707bf890eb93c8b. Change-Id: I68482c7a33a4e8efd52eb9e25c48adfbea399d15 --- arch/arm/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 24cbdb92f0e17b..33da5c75b6cb61 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -483,7 +483,7 @@ static void __init build_mem_type_table(void) * in the Short-descriptor translation table format descriptors. */ if (cpu_arch == CPU_ARCH_ARMv7 && - (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) { + (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { user_pmd_table |= PMD_PXNTABLE; } #endif From 7a87b713c4548d9f5a981b417e5cec8a722db759 Mon Sep 17 00:00:00 2001 From: Jeffrey Vander Stoep Date: Tue, 13 Sep 2016 14:43:03 +0000 Subject: [PATCH 415/420] Revert "BACKPORT: ARM: 8235/1: Support for the PXN CPU feature on ARMv7 Modern ARMv7-A/R cores optionally implement below new hardware feature:" This reverts commit 7877b96891c57188cd2e0452a16badcf381dfeb1. 
Change-Id: Ida1269d70fcea3e90cac9c0fa12dbff2e95795d5 --- arch/arm/include/asm/pgalloc.h | 10 +------- arch/arm/include/asm/pgtable-2level-hwdef.h | 2 -- arch/arm/include/asm/pgtable-3level-hwdef.h | 1 - arch/arm/mm/mmu.c | 27 --------------------- 4 files changed, 1 insertion(+), 39 deletions(-) diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index 46e879defec5a3..78a779361682ad 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -157,15 +157,7 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) { - extern pmdval_t user_pmd_table; - pmdval_t prot; - - if (__LINUX_ARM_ARCH__ >= 6 && !IS_ENABLED(CONFIG_ARM_LPAE)) - prot = user_pmd_table; - else - prot = _PAGE_USER_TABLE; - - __pmd_populate(pmdp, page_to_phys(ptep), prot); + __pmd_populate(pmdp, page_to_phys(ptep), _PAGE_USER_TABLE); } #define pmd_pgtable(pmd) pmd_page(pmd) diff --git a/arch/arm/include/asm/pgtable-2level-hwdef.h b/arch/arm/include/asm/pgtable-2level-hwdef.h index 6c0bb405f69fb3..5cfba15cb401e5 100644 --- a/arch/arm/include/asm/pgtable-2level-hwdef.h +++ b/arch/arm/include/asm/pgtable-2level-hwdef.h @@ -20,14 +20,12 @@ #define PMD_TYPE_FAULT (_AT(pmdval_t, 0) << 0) #define PMD_TYPE_TABLE (_AT(pmdval_t, 1) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 2) << 0) -#define PMD_PXNTABLE (_AT(pmdval_t, 1) << 2) /* v7 */ #define PMD_BIT4 (_AT(pmdval_t, 1) << 4) #define PMD_DOMAIN(x) (_AT(pmdval_t, (x)) << 5) #define PMD_PROTECTION (_AT(pmdval_t, 1) << 9) /* v5 */ /* * - section */ -#define PMD_SECT_PXN (_AT(pmdval_t, 1) << 0) /* v7 */ #define PMD_SECT_BUFFERABLE (_AT(pmdval_t, 1) << 2) #define PMD_SECT_CACHEABLE (_AT(pmdval_t, 1) << 3) #define PMD_SECT_XN (_AT(pmdval_t, 1) << 4) /* v6 */ diff --git a/arch/arm/include/asm/pgtable-3level-hwdef.h b/arch/arm/include/asm/pgtable-3level-hwdef.h index db54640e591d4c..9fd61c72a33a14 100644 --- a/arch/arm/include/asm/pgtable-3level-hwdef.h +++ b/arch/arm/include/asm/pgtable-3level-hwdef.h @@ -76,7 +76,6 @@ #define PTE_EXT_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */ #define PTE_EXT_AF (_AT(pteval_t, 1) << 10) /* Access Flag */ #define PTE_EXT_NG (_AT(pteval_t, 1) << 11) /* nG */ -#define PTE_EXT_PXN (_AT(pteval_t, 1) << 53) /* PXN */ #define PTE_EXT_XN (_AT(pteval_t, 1) << 54) /* XN */ /* diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 33da5c75b6cb61..a7b12cb21e816e 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -53,8 +53,6 @@ EXPORT_SYMBOL(empty_zero_page); */ pmd_t *top_pmd; -pmdval_t user_pmd_table = _PAGE_USER_TABLE; - #define CPOLICY_UNCACHED 0 #define CPOLICY_BUFFERED 1 #define CPOLICY_WRITETHROUGH 2 @@ -468,26 +466,6 @@ static void __init build_mem_type_table(void) for (i = 0; i < ARRAY_SIZE(mem_types); i++) mem_types[i].prot_sect &= ~PMD_SECT_S; -#ifndef CONFIG_ARM_LPAE - /* - * We don't use domains on ARMv6 (since this causes problems with - * v6/v7 kernels), so we must use a separate memory type for user - * r/o, kernel r/w to map the vectors page. - */ - if (cpu_arch == CPU_ARCH_ARMv6) - vecs_pgprot |= L_PTE_MT_VECTORS; - - - /* - * Check is it with support for the PXN bit - * in the Short-descriptor translation table format descriptors. - */ - if (cpu_arch == CPU_ARCH_ARMv7 && - (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { - user_pmd_table |= PMD_PXNTABLE; - } -#endif - /* * ARMv5 and lower, bit 4 must be set for page tables (was: cache * "update-able on write" bit on ARM610). 
However, Xscale and @@ -651,11 +629,6 @@ static void __init build_mem_type_table(void) } kern_pgprot |= PTE_EXT_AF; vecs_pgprot |= PTE_EXT_AF; - - /* - * Set PXN for user mappings - */ - user_pgprot |= PTE_EXT_PXN; #endif for (i = 0; i < 16; i++) { From 525107d2391a7b13f39fbf9a25e4037c49cb2717 Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Sat, 29 Nov 2014 02:33:30 +0100 Subject: [PATCH 416/420] BACKPORT: ARM: 8235/1: Support for the PXN CPU feature on ARMv7 Modern ARMv7-A/R cores optionally implement below new hardware feature: - PXN: Privileged execute-never(PXN) is a security feature. PXN bit determines whether the processor can execute software from the region. This is effective solution against ret2usr attack. On an implementation that does not include the LPAE, PXN is optionally supported. This patch set PXN bit on user page table for preventing user code execution with privilege mode. Reviewed-by: Catalin Marinas Signed-off-by: Jungseung Lee Signed-off-by: Russell King Bug: 31161206 Change-Id: I9170f2208356edd8345058397bf1aa8e91414cf0 --- arch/arm/include/asm/pgalloc.h | 10 +++++++- arch/arm/include/asm/pgtable-2level-hwdef.h | 2 ++ arch/arm/include/asm/pgtable-3level-hwdef.h | 1 + arch/arm/mm/mmu.c | 26 +++++++++++++++++++++ 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index 78a779361682ad..19cfab526d13a3 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -157,7 +157,15 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) { - __pmd_populate(pmdp, page_to_phys(ptep), _PAGE_USER_TABLE); + extern pmdval_t user_pmd_table; + pmdval_t prot; + + if (__LINUX_ARM_ARCH__ >= 6 && !IS_ENABLED(CONFIG_ARM_LPAE)) + prot = user_pmd_table; + else + prot = _PAGE_USER_TABLE; + + __pmd_populate(pmdp, page_to_phys(ptep), prot); } #define pmd_pgtable(pmd) pmd_page(pmd) diff --git a/arch/arm/include/asm/pgtable-2level-hwdef.h b/arch/arm/include/asm/pgtable-2level-hwdef.h index 5cfba15cb401e5..5e68278e953e25 100644 --- a/arch/arm/include/asm/pgtable-2level-hwdef.h +++ b/arch/arm/include/asm/pgtable-2level-hwdef.h @@ -20,12 +20,14 @@ #define PMD_TYPE_FAULT (_AT(pmdval_t, 0) << 0) #define PMD_TYPE_TABLE (_AT(pmdval_t, 1) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 2) << 0) +#define PMD_PXNTABLE (_AT(pmdval_t, 1) << 2) /* v7 */ #define PMD_BIT4 (_AT(pmdval_t, 1) << 4) #define PMD_DOMAIN(x) (_AT(pmdval_t, (x)) << 5) #define PMD_PROTECTION (_AT(pmdval_t, 1) << 9) /* v5 */ /* * - section */ +#define PMD_SECT_PXN (_AT(pmdval_t, 1) << 0) /* v7 */ #define PMD_SECT_BUFFERABLE (_AT(pmdval_t, 1) << 2) #define PMD_SECT_CACHEABLE (_AT(pmdval_t, 1) << 3) #define PMD_SECT_XN (_AT(pmdval_t, 1) << 4) /* v6 */ diff --git a/arch/arm/include/asm/pgtable-3level-hwdef.h b/arch/arm/include/asm/pgtable-3level-hwdef.h index 9fd61c72a33a14..f8f1cff62065b7 100644 --- a/arch/arm/include/asm/pgtable-3level-hwdef.h +++ b/arch/arm/include/asm/pgtable-3level-hwdef.h @@ -76,6 +76,7 @@ #define PTE_EXT_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */ #define PTE_EXT_AF (_AT(pteval_t, 1) << 10) /* Access Flag */ #define PTE_EXT_NG (_AT(pteval_t, 1) << 11) /* nG */ +#define PTE_EXT_PXN (_AT(pteval_t, 1) << 53) /* PXN */ #define PTE_EXT_XN (_AT(pteval_t, 1) << 54) /* XN */ /* diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index a7b12cb21e816e..945aba5afb827e 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c 
@@ -53,6 +53,8 @@ EXPORT_SYMBOL(empty_zero_page); */ pmd_t *top_pmd; +pmdval_t user_pmd_table = _PAGE_USER_TABLE; + #define CPOLICY_UNCACHED 0 #define CPOLICY_BUFFERED 1 #define CPOLICY_WRITETHROUGH 2 @@ -562,6 +564,25 @@ static void __init build_mem_type_table(void) vecs_pgprot |= L_PTE_MT_VECTORS; #endif +#ifndef CONFIG_ARM_LPAE + /* + * We don't use domains on ARMv6 (since this causes problems with + * v6/v7 kernels), so we must use a separate memory type for user + * r/o, kernel r/w to map the vectors page. + */ + if (cpu_arch == CPU_ARCH_ARMv6) + vecs_pgprot |= L_PTE_MT_VECTORS; + + /* + * Check is it with support for the PXN bit + * in the Short-descriptor translation table format descriptors. + */ + if (cpu_arch == CPU_ARCH_ARMv7 && + (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { + user_pmd_table |= PMD_PXNTABLE; + } +#endif + /* * ARMv6 and above have extended page tables. */ @@ -629,6 +650,11 @@ static void __init build_mem_type_table(void) } kern_pgprot |= PTE_EXT_AF; vecs_pgprot |= PTE_EXT_AF; + + /* + * Set PXN for user mappings + */ + user_pgprot |= PTE_EXT_PXN; #endif for (i = 0; i < 16; i++) { From cf12c801a45e50c394d21634eb9e9958d74fc5f0 Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Tue, 29 Dec 2015 05:47:00 +0100 Subject: [PATCH 417/420] UPSTREAM: ARM: 8494/1: mm: Enable PXN when running non-LPAE kernel on LPAE processor The VMSA field of MMFR0 (bottom 4 bits) is incremented for each added feature. PXN is supported if the value is >= 4 and LPAE is supported if it is >= 5. In case a kernel with CONFIG_ARM_LPAE disabled is used on a processor that supports LPAE, we can still use PXN in short descriptors. So check for >= 4 not == 4. Signed-off-by: Jungseung Lee Acked-by: Catalin Marinas Signed-off-by: Ben Hutchings Signed-off-by: Russell King Bug: 31161206 Change-Id: If96a96e6fd0330dba6ace127ed46cbf2c38a1ea5 --- arch/arm/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 945aba5afb827e..d687cc227b2bc6 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -578,7 +578,7 @@ static void __init build_mem_type_table(void) * in the Short-descriptor translation table format descriptors. */ if (cpu_arch == CPU_ARCH_ARMv7 && - (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { + (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) { user_pmd_table |= PMD_PXNTABLE; } #endif From e56614167bccc2bae542a8379603e893a2e2991c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 2 Jun 2015 17:08:29 +0200 Subject: [PATCH 418/420] UPSTREAM: audit: Fix check of return value of strnlen_user() (cherry picked from commit 0b08c5e59441d08ab4b5e72afefd5cd98a4d83df) strnlen_user() returns 0 when it hits fault, not -1. Fix the test in audit_log_single_execve_arg(). Luckily this shouldn't ever happen unless there's a kernel bug so it's mostly a cosmetic fix. CC: Paul Moore Signed-off-by: Jan Kara Signed-off-by: Paul Moore Signed-off-by: Sasha Levin Change-Id: I0eed84bbadd8fb49f81f22922468dface90acaba --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e420a0c41b5f60..edd807bf08c386 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1044,7 +1044,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. 
*/ - if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { + if (unlikely((len == 0) || len > MAX_ARG_STRLEN - 1)) { WARN_ON(1); send_sig(SIGKILL, current, 0); return -1; From d6bd90dbed71762f7abce56a9054a1f4473cda44 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jul 2015 09:33:38 -0700 Subject: [PATCH 419/420] UPSTREAM: Fix broken audit tests for exec arg len (cherry picked from commit 45820c294fe1b1a9df495d57f40585ef2d069a39) The "fix" in commit 0b08c5e5944 ("audit: Fix check of return value of strnlen_user()") didn't fix anything, it broke things. As reported by Steven Rostedt: "Yes, strnlen_user() returns 0 on fault, but if you look at what len is set to, than you would notice that on fault len would be -1" because we just subtracted one from the return value. So testing against 0 doesn't test for a fault condition, it tests against a perfectly valid empty string. Also fix up the usual braindamage wrt using WARN_ON() inside a conditional - make it part of the conditional and remove the explicit unlikely() (which is already part of the WARN_ON*() logic, exactly so that you don't have to write unreadable code. Reported-and-tested-by: Steven Rostedt Cc: Jan Kara Cc: Paul Moore Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Change-Id: I692542689180f825f1b35e7536a03d1a3278ce53 --- kernel/auditsc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index edd807bf08c386..c4a2c129644358 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1044,8 +1044,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. */ - if (unlikely((len == 0) || len > MAX_ARG_STRLEN - 1)) { - WARN_ON(1); + if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { send_sig(SIGKILL, current, 0); return -1; } From 5ebd28a7b5a7bdf527dcb7e1443edd3149cc3696 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 19 Jul 2016 17:42:57 -0400 Subject: [PATCH 420/420] UPSTREAM: audit: fix a double fetch in audit_log_single_execve_arg() (cherry picked from commit 43761473c254b45883a64441dd0bc85a42f3645c) There is a double fetch problem in audit_log_single_execve_arg() where we first check the execve(2) argumnets for any "bad" characters which would require hex encoding and then re-fetch the arguments for logging in the audit record[1]. Of course this leaves a window of opportunity for an unsavory application to munge with the data. This patch reworks things by only fetching the argument data once[2] into a buffer where it is scanned and logged into the audit records(s). In addition to fixing the double fetch, this patch improves on the original code in a few other ways: better handling of large arguments which require encoding, stricter record length checking, and some performance improvements (completely unverified, but we got rid of some strlen() calls, that's got to be a good thing). As part of the development of this patch, I've also created a basic regression test for the audit-testsuite, the test can be tracked on GitHub at the following link: * https://github.com/linux-audit/audit-testsuite/issues/25 [1] If you pay careful attention, there is actually a triple fetch problem due to a strnlen_user() call at the top of the function. [2] This is a tiny white lie, we do make a call to strnlen_user() prior to fetching the argument data. 
I don't like it, but due to the way the audit record is structured we really have no choice unless we copy the entire argument at once (which would require a rather wasteful allocation). The good news is that with this patch the kernel no longer relies on this strnlen_user() value for anything beyond recording it in the log, we also update it with a trustworthy value whenever possible. Reported-by: Pengfei Wang Cc: Signed-off-by: Paul Moore Signed-off-by: Sasha Levin Change-Id: I10e979e94605e3cf8d461e3e521f8f9837228aa5 Bug: 30956807 --- kernel/auditsc.c | 332 +++++++++++++++++++++++------------------------ 1 file changed, 164 insertions(+), 168 deletions(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c4a2c129644358..dbde3aafb47e1b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -72,6 +72,7 @@ #include #include #include +#include #include "audit.h" @@ -80,7 +81,8 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* no execve audit message should be longer than this (userspace limits) */ +/* no execve audit message should be longer than this (userspace limits), + * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ @@ -1008,184 +1010,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf? an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) +static void audit_log_execve_info(struct audit_context *context, + struct audit_buffer **ab) { - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { - send_sig(SIGKILL, current, 0); - return -1; + long len_max; + long len_rem; + long len_full; + long len_buf; + long len_abuf; + long len_tmp; + bool require_data; + bool encode; + unsigned int iter; + unsigned int arg; + char *buf_head; + char *buf; + const char __user *p = (const char __user *)current->mm->arg_start; + + /* NOTE: this buffer needs to be large enough to hold all the non-arg + * data we put in the audit record for this argument (see the + * code below) ... 
at this point in time 96 is plenty */ + char abuf[96]; + + /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the + * current value of 7500 is not as important as the fact that it + * is less than 8k, a setting of 7500 gives us plenty of wiggle + * room if we go over a little bit in the logging below */ + WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); + len_max = MAX_EXECVE_AUDIT_LEN; + + /* scratch buffer to hold the userspace args */ + buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf_head) { + audit_panic("out of memory for argv string"); + return; } + buf = buf_head; - /* walk the whole argument looking for non-ascii chars */ + audit_log_format(*ab, "argc=%d", context->execve.argc); + + len_rem = len_max; + len_buf = 0; + len_full = 0; + require_data = true; + encode = false; + iter = 0; + arg = 0; do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. - */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } + /* NOTE: we don't ever want to trust this value for anything + * serious, but the audit record format insists we + * provide an argument length for really long arguments, + * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but + * to use strncpy_from_user() to obtain this value for + * recording in the log, although we don't use it + * anywhere here to avoid a double-fetch problem */ + if (len_full == 0) + len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* read more data from userspace */ + if (require_data) { + /* can we make more room in the buffer? 
*/ + if (buf != buf_head) { + memmove(buf_head, buf, len_buf); + buf = buf_head; + } + + /* fetch as much as we can of the argument */ + len_tmp = strncpy_from_user(&buf_head[len_buf], p, + len_max - len_buf); + if (len_tmp == -EFAULT) { + /* unable to copy from userspace */ + send_sig(SIGKILL, current, 0); + goto out; + } else if (len_tmp == (len_max - len_buf)) { + /* buffer is not large enough */ + require_data = true; + /* NOTE: if we are going to span multiple + * buffers force the encoding so we stand + * a chance at a sane len_full value and + * consistent record encoding */ + encode = true; + len_full = len_full * 2; + p += len_tmp; + } else { + require_data = false; + if (!encode) + encode = audit_string_contains_control( + buf, len_tmp); + /* try to use a trusted value for len_full */ + if (len_full < len_max) + len_full = (encode ? + len_tmp * 2 : len_tmp); + p += len_tmp + 1; + } + len_buf += len_tmp; + buf_head[len_buf] = '\0'; - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; + /* length of the buffer in the audit record? */ + len_abuf = (encode ? len_buf * 2 : len_buf + 2); } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} -static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab) -{ - int i, len; - size_t len_sent = 0; - const char __user *p; - char *buf; + /* write as much as we can to the audit log */ + if (len_buf > 0) { + /* NOTE: some magic numbers here - basically if we + * can't fit a reasonable amount of data into the + * existing audit buffer, flush it and start with + * a new buffer */ + if ((sizeof(abuf) + 8) > len_rem) { + len_rem = len_max; + audit_log_end(*ab); + *ab = audit_log_start(context, + GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + goto out; + } - p = (const char __user *)current->mm->arg_start; + /* create the non-arg portion of the arg record */ + len_tmp = 0; + if (require_data || (iter > 0) || + ((len_abuf + sizeof(abuf)) > len_rem)) { + if (iter == 0) { + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d_len=%lu", + arg, len_full); + } + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d[%d]=", arg, iter++); + } else + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d=", arg); + WARN_ON(len_tmp >= sizeof(abuf)); + abuf[sizeof(abuf) - 1] = '\0'; + + /* log the arg in the audit record */ + audit_log_format(*ab, "%s", abuf); + len_rem -= len_tmp; + len_tmp = len_buf; + if (encode) { + if (len_abuf > len_rem) + len_tmp = len_rem / 2; /* encoding */ + audit_log_n_hex(*ab, buf, len_tmp); + len_rem -= len_tmp * 2; + len_abuf -= len_tmp * 2; + 
} else { + if (len_abuf > len_rem) + len_tmp = len_rem - 2; /* quotes */ + audit_log_n_string(*ab, buf, len_tmp); + len_rem -= len_tmp + 2; + /* don't subtract the "2" because we still need + * to add quotes to the remaining string */ + len_abuf -= len_tmp; + } + len_buf -= len_tmp; + buf += len_tmp; + } - audit_log_format(*ab, "argc=%d", context->execve.argc); + /* ready to move to the next argument? */ + if ((len_buf == 0) && !require_data) { + arg++; + iter = 0; + len_full = 0; + require_data = true; + encode = false; + } + } while (arg < context->execve.argc); - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. - */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { - audit_panic("out of memory for argv string"); - return; - } + /* NOTE: the caller handles the final audit_log_end() call */ - for (i = 0; i < context->execve.argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); +out: + kfree(buf_head); } static void show_special(struct audit_context *context, int *call_panic)
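The rework in this last patch replaces the old scan-then-recopy flow with a single fetch per chunk: the argument bytes are copied from userspace once into a private buffer, and both the control-character scan and the audit logging operate on that same copy. As a rough illustration of why that matters, here is a minimal user-space sketch in plain C (not kernel code; contains_control() and log_arg() are hypothetical stand-ins for audit_string_contains_control() and the per-argument logging loop) of the check-and-use-from-one-copy pattern, assuming a 256-byte scratch buffer:

#include <stdio.h>
#include <string.h>
#include <ctype.h>

/* hypothetical stand-in for audit_string_contains_control() */
static int contains_control(const char *s, size_t len)
{
	for (size_t i = 0; i < len; i++)
		if (iscntrl((unsigned char)s[i]))
			return 1;
	return 0;
}

static void log_arg(const char *untrusted_src)
{
	char buf[256];

	/* single fetch: one copy into a buffer the source cannot modify */
	strncpy(buf, untrusted_src, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	/* scan and emit the same bytes; no second read of untrusted_src,
	 * so there is no window for the data to change between check and use */
	if (contains_control(buf, strlen(buf))) {
		printf("a0=");
		for (const char *p = buf; *p; p++)
			printf("%02X", (unsigned char)*p);
		printf("\n");
	} else {
		printf("a0=\"%s\"\n", buf);
	}
}

int main(void)
{
	log_arg("ls -l");       /* plain argument, logged quoted        */
	log_arg("bad\aarg");    /* control character, logged hex-encoded */
	return 0;
}

The old audit_log_single_execve_arg() effectively did the scan on one copy and then re-fetched the argument from userspace for logging; in the sketch that would correspond to calling strncpy() a second time before printing, which is exactly the race the upstream fix removes.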